aboutsummaryrefslogtreecommitdiffstats
path: root/lib/nix/lexer.mll
diff options
context:
space:
mode:
authorRutger Broekhoff2025-07-07 21:52:08 +0200
committerRutger Broekhoff2025-07-07 21:52:08 +0200
commitba61dfd69504ec6263a9dee9931d93adeb6f3142 (patch)
treed6c9b78e50eeab24e0c1c09ab45909a6ae3fd5db /lib/nix/lexer.mll
downloadverified-dyn-lang-interp-ba61dfd69504ec6263a9dee9931d93adeb6f3142.tar.gz
verified-dyn-lang-interp-ba61dfd69504ec6263a9dee9931d93adeb6f3142.zip
Initialize repository
Diffstat (limited to 'lib/nix/lexer.mll')
-rw-r--r--lib/nix/lexer.mll315
1 files changed, 315 insertions, 0 deletions
diff --git a/lib/nix/lexer.mll b/lib/nix/lexer.mll
new file mode 100644
index 0000000..023d888
--- /dev/null
+++ b/lib/nix/lexer.mll
@@ -0,0 +1,315 @@
1{
2open Core
3open Tokens
4
5exception Error of string
6
7(* Types of curly braces.
8 AQUOTE corresponds to the braces for antiquotation, i.e. '${...}'
9 and SET to an attribute set '{...}'.
10 *)
11type braces =
12 | AQUOTE
13 | SET
14
15let print_stack s =
16 let b = Buffer.create 100 in
17 Buffer.add_string b "[ ";
18 List.iter s ~f:(function
19 | AQUOTE -> Buffer.add_string b "AQUOTE; "
20 | SET -> Buffer.add_string b "SET; "
21 );
22 Buffer.add_string b "]";
23 Buffer.contents b
24
25let token_of_str state buf =
26 match state with
27 | `Start -> STR_START (Buffer.contents buf)
28 | `Mid -> STR_MID (Buffer.contents buf)
29
30let token_of_istr state buf =
31 match state with
32 | `Start -> ISTR_START (Buffer.contents buf)
33 | `Mid -> ISTR_MID (Buffer.contents buf)
34
35(* lookup table for one-character tokens *)
36let char_table = Array.create ~len:94 EOF
37let _ =
38 List.iter ~f:(fun (k, v) -> Array.set char_table ((int_of_char k) - 1) v)
39 [
40 '.', SELECT;
41 '?', QMARK;
42 '!', NOT;
43 '=', ASSIGN;
44 '<', LT;
45 '>', GT;
46 '[', LBRACK;
47 ']', RBRACK;
48 '+', PLUS;
49 '-', MINUS;
50 '*', TIMES;
51 '/', SLASH;
52 '(', LPAREN;
53 ')', RPAREN;
54 ':', COLON;
55 ';', SEMICOLON;
56 ',', COMMA;
57 '@', AS
58 ]
59
60(* lookup table for two- and three-character tokens *)
61let str_table = Hashtbl.create (module String) ~size:10
62let _ =
63 List.iter ~f:(fun (kwd, tok) -> Hashtbl.set str_table ~key:kwd ~data:tok)
64 [
65 "//", MERGE;
66 "++", CONCAT;
67 "<=", LTE;
68 ">=", GTE;
69 "==", EQ;
70 "!=", NEQ;
71 "&&", AND;
72 "||", OR;
73 "->", IMPL;
74 "...", ELLIPSIS
75 ]
76
77(* lookup table for keywords *)
78let keyword_table = Hashtbl.create (module String) ~size:10
79let _ =
80 List.iter ~f:(fun (kwd, tok) -> Hashtbl.set keyword_table ~key:kwd ~data:tok)
81 [ "with", WITH;
82 "rec", REC;
83 "let", LET;
84 "in", IN;
85 "inherit", INHERIT;
86 "if" , IF;
87 "then", THEN;
88 "else", ELSE;
89 "assert", ASSERT;
90 "or", ORDEF ]
91
92(* replace an escape sequence by the corresponding character(s) *)
93let unescape = function
94 | "\\n" -> "\n"
95 | "\\r" -> "\r"
96 | "\\t" -> "\t"
97 | "\\\\" -> "\\"
98 | "\\${" -> "${"
99 | "''$" -> "$"
100 | "$$" -> "$"
101 | "'''" -> "''"
102 | "''\\t" -> "\t"
103 | "''\\r" -> "\r"
104 | "''\\n" -> "\n"
105 | x ->
106 failwith (Printf.sprintf "unescape unexpected arg %s" x)
107
108let collect_tokens lexer q lexbuf =
109 let stack = ref [] in
110 let queue = Stdlib.Queue.create () in
111 let rec go () =
112 match (try Some (Stdlib.Queue.take queue) with Stdlib.Queue.Empty -> None) with
113 | Some token ->
114 (
115 match token, !stack with
116 | AQUOTE_CLOSE, [] ->
117 Stdlib.Queue.add AQUOTE_CLOSE q
118 | EOF, _ ->
119 Stdlib.Queue.add EOF q;
120 | _, _ ->
121 Stdlib.Queue.add token q;
122 go ()
123 )
124 | None ->
125 lexer queue stack lexbuf;
126 go ()
127 in
128 Stdlib.Queue.add AQUOTE_OPEN q;
129 stack := [AQUOTE];
130 lexer queue stack lexbuf;
131 go ()
132
133(* utility functions *)
134let print_position lexbuf =
135 let pos = Lexing.lexeme_start_p lexbuf in
136 Printf.sprintf "%s:%d:%d" pos.pos_fname
137 pos.pos_lnum (pos.pos_cnum - pos.pos_bol + 1)
138
139
140let set_filename fname (lexbuf: Lexing.lexbuf) =
141 let pos = lexbuf.lex_curr_p in
142 lexbuf.lex_curr_p <- { pos with pos_fname = fname }; lexbuf
143
144}
145
146let nzdigit = ['1'-'9']
147let digit = nzdigit | '0'
148let float = (nzdigit digit* '.' digit* | '0'? '.' digit+) (['E' 'e'] ['+' '-']? digit+)?
149let alpha = ['a'-'z' 'A'-'Z']
150let alpha_digit = alpha | digit
151let path_chr = alpha_digit | ['.' '_' '-' '+']
152let path = path_chr* ('/' path_chr+)+
153let spath = alpha_digit path_chr* ('/' path_chr+)*
154let uri_chr = ['%' '/' '?' ':' '@' '&' '=' '+' '$' ',' '-' '_' '.' '!' '~' '*' '\'']
155let scheme = alpha (alpha | ['+' '-' '.'])*
156let uri = scheme ':' (alpha_digit | uri_chr)+
157let char_tokens = ['.' '?' '!' '=' '<' '>' '[' ']' '+' '-' '*' '/' '^' '(' ')' ':' ';' ',' '@']
158
159rule get_tokens q s = parse
160(* skip whitespeces *)
161| [' ' '\t' '\r']
162 { get_tokens q s lexbuf }
163(* increase line count for new lines *)
164| '\n'
165 { Lexing.new_line lexbuf; get_tokens q s lexbuf }
166| char_tokens as c
167 { Stdlib.Queue.add (Array.get char_table ((int_of_char c) - 1)) q }
168| ("//" | "++" | "<=" | ">=" | "==" | "!=" | "&&" | "||" | "->" | "...") as s
169 { Stdlib.Queue.add (Hashtbl.find_exn str_table s) q}
170| digit+ as i
171 { Stdlib.Queue.add (INT i) q }
172| float
173 { Stdlib.Queue.add (FLOAT (Lexing.lexeme lexbuf)) q }
174| path
175 { Stdlib.Queue.add (PATH (Lexing.lexeme lexbuf)) q }
176| '<' (spath as p) '>'
177 { Stdlib.Queue.add (SPATH p) q }
178| '~' path as p
179 { Stdlib.Queue.add (HPATH p) q }
180| uri
181 { Stdlib.Queue.add(URI (Lexing.lexeme lexbuf)) q }
182(* keywords or identifiers *)
183| ((alpha | '_')+ (alpha_digit | ['_' '\'' '-'])*) as id
184 { Stdlib.Queue.add (Hashtbl.find keyword_table id |> Option.value ~default:(ID id)) q}
185(* comments *)
186| '#' ([^ '\n']* as c)
187 { ignore c; get_tokens q s lexbuf}
188| "/*"
189 { comment (Buffer.create 64) lexbuf;
190 get_tokens q s lexbuf
191 }
192(* the following three tokens change the braces stack *)
193| "${"
194 { Stdlib.Queue.add AQUOTE_OPEN q; s := AQUOTE :: !s }
195| '{'
196 { Stdlib.Queue.add LBRACE q; s := SET :: !s }
197| '}'
198 {
199 match !s with
200 | AQUOTE :: rest ->
201 Stdlib.Queue.add AQUOTE_CLOSE q; s := rest
202 | SET :: rest ->
203 Stdlib.Queue.add RBRACE q; s := rest
204 | _ ->
205 let pos = print_position lexbuf in
206 let err = Printf.sprintf "Unbalanced '}' at %s\n" pos in
207 raise (Error err)
208 }
209(* a double-quoted string *)
210| '"'
211 { string `Start (Buffer.create 64) q lexbuf }
212(* an indented string *)
213| "''" (' '+ as ws)
214 { istring `Start (Some (String.length ws)) (Buffer.create 64) q lexbuf }
215| "''"
216 { istring `Start None (Buffer.create 64) q lexbuf }
217(* End of input *)
218| eof
219 { Stdlib.Queue.add EOF q }
220(* any other character raises an exception *)
221| _
222 {
223 let pos = print_position lexbuf in
224 let tok = Lexing.lexeme lexbuf in
225 let err = Printf.sprintf "Unexpected character '%s' at %s\n" tok pos in
226 raise (Error err)
227 }
228
229(* Nix does not allow nested comments, but it is still handy to lex it
230 separately because we can properly increase line count. *)
231and comment buf = parse
232 | '\n'
233 {Lexing.new_line lexbuf; Buffer.add_char buf '\n'; comment buf lexbuf}
234 | "*/"
235 { () }
236 | _ as c
237 { Buffer.add_char buf c; comment buf lexbuf }
238
239and string state buf q = parse
240 | '"' (* terminate when we hit '"' *)
241 { Stdlib.Queue.add (token_of_str state buf) q; Stdlib.Queue.add STR_END q }
242 | '\n'
243 { Lexing.new_line lexbuf; Buffer.add_char buf '\n'; string state buf q lexbuf }
244 | ("\\n" | "\\r" | "\\t" | "\\\\" | "\\${") as s
245 { Buffer.add_string buf (unescape s); string state buf q lexbuf }
246 | "\\" (_ as c) (* add the character verbatim *)
247 { Buffer.add_char buf c; string state buf q lexbuf }
248 | "${" (* collect all the tokens till we hit the matching '}' *)
249 {
250 Stdlib.Queue.add (token_of_str state buf) q;
251 collect_tokens get_tokens q lexbuf;
252 string `Mid (Buffer.create 64) q lexbuf
253 }
254 | _ as c (* otherwise just add the character to the buffer *)
255 { Buffer.add_char buf c; string state buf q lexbuf }
256
257and istring state imin buf q = parse
258 | ('\n' ' '* "''")
259 {
260 Lexing.new_line lexbuf;
261 Buffer.add_string buf "\n";
262 let indent = match imin with | None -> 0 | Some i -> i in
263 Stdlib.Queue.add (token_of_istr state buf) q;
264 Stdlib.Queue.add (ISTR_END indent) q
265 }
266 | "''"
267 {
268 let indent = match imin with | None -> 0 | Some i -> i in
269 Stdlib.Queue.add (token_of_istr state buf) q;
270 Stdlib.Queue.add (ISTR_END indent) q
271 }
272 | ('\n' ' '* '\n') as s
273 {
274 Lexing.new_line lexbuf;
275 Lexing.new_line lexbuf;
276 Buffer.add_string buf s;
277 istring state imin buf q lexbuf
278 }
279 | ('\n' (' '* as ws)) as s
280 {
281 Lexing.new_line lexbuf;
282 Buffer.add_string buf s;
283 let ws_count = String.length ws in
284 match imin with
285 | None ->
286 istring state (Some ws_count) buf q lexbuf
287 | Some i ->
288 istring state (Some (min i ws_count)) buf q lexbuf
289 }
290 | ("''$" | "'''" | "''\\t" | "''\\r" | "''\\n") as s
291 { Buffer.add_string buf (unescape s); istring state imin buf q lexbuf }
292 | "''\\" (_ as c)
293 { Buffer.add_char buf c; istring state imin buf q lexbuf }
294 | "${"
295 {
296 Stdlib.Queue.add (token_of_istr state buf) q;
297 collect_tokens get_tokens q lexbuf;
298 istring `Mid imin (Buffer.create 64) q lexbuf
299 }
300 | _ as c
301 { Buffer.add_char buf c; istring state imin buf q lexbuf }
302{
303
304let rec next_token
305 (q: token Stdlib.Queue.t)
306 (s: braces list ref)
307 (lexbuf: Lexing.lexbuf)
308 : token =
309 match (try Some (Stdlib.Queue.take q) with | Stdlib.Queue.Empty -> None) with
310 | Some token ->
311 token
312 | None ->
313 get_tokens q s lexbuf;
314 next_token q s lexbuf
315}