Diffstat (limited to 'lib/nix/lexer.mll')
-rw-r--r--  lib/nix/lexer.mll  315
1 files changed, 315 insertions, 0 deletions
diff --git a/lib/nix/lexer.mll b/lib/nix/lexer.mll
new file mode 100644
index 0000000..023d888
--- /dev/null
+++ b/lib/nix/lexer.mll
@@ -0,0 +1,315 @@
{
open Core
open Tokens

exception Error of string

(* Types of curly braces.
   AQUOTE corresponds to the braces for antiquotation, i.e. '${...}',
   and SET to an attribute set '{...}'.
*)
type braces =
  | AQUOTE
  | SET

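(* Example: in the Nix expression '{ x = "${y}"; }' the outer braces delimit a
   SET and the inner ones an AQUOTE; the lexer keeps a stack of these values so
   each '}' can be matched with the kind of brace that opened it. *)
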
let print_stack s =
  let b = Buffer.create 100 in
  Buffer.add_string b "[ ";
  List.iter s ~f:(function
    | AQUOTE -> Buffer.add_string b "AQUOTE; "
    | SET -> Buffer.add_string b "SET; "
  );
  Buffer.add_string b "]";
  Buffer.contents b

let token_of_str state buf =
  match state with
  | `Start -> STR_START (Buffer.contents buf)
  | `Mid -> STR_MID (Buffer.contents buf)

let token_of_istr state buf =
  match state with
  | `Start -> ISTR_START (Buffer.contents buf)
  | `Mid -> ISTR_MID (Buffer.contents buf)

(* lookup table for one-character tokens *)
let char_table = Array.create ~len:94 EOF
let _ =
  List.iter ~f:(fun (k, v) -> Array.set char_table ((int_of_char k) - 1) v)
    [
      '.', SELECT;
      '?', QMARK;
      '!', NOT;
      '=', ASSIGN;
      '<', LT;
      '>', GT;
      '[', LBRACK;
      ']', RBRACK;
      '+', PLUS;
      '-', MINUS;
      '*', TIMES;
      '/', SLASH;
      '(', LPAREN;
      ')', RPAREN;
      ':', COLON;
      ';', SEMICOLON;
      ',', COMMA;
      '@', AS
    ]

(* lookup table for two- and three-character tokens *)
let str_table = Hashtbl.create (module String) ~size:10
let _ =
  List.iter ~f:(fun (kwd, tok) -> Hashtbl.set str_table ~key:kwd ~data:tok)
    [
      "//", MERGE;
      "++", CONCAT;
      "<=", LTE;
      ">=", GTE;
      "==", EQ;
      "!=", NEQ;
      "&&", AND;
      "||", OR;
      "->", IMPL;
      "...", ELLIPSIS
    ]

(* lookup table for keywords *)
let keyword_table = Hashtbl.create (module String) ~size:10
let _ =
  List.iter ~f:(fun (kwd, tok) -> Hashtbl.set keyword_table ~key:kwd ~data:tok)
    [ "with", WITH;
      "rec", REC;
      "let", LET;
      "in", IN;
      "inherit", INHERIT;
      "if", IF;
      "then", THEN;
      "else", ELSE;
      "assert", ASSERT;
      "or", ORDEF ]

(* replace an escape sequence by the corresponding character(s) *)
let unescape = function
  | "\\n" -> "\n"
  | "\\r" -> "\r"
  | "\\t" -> "\t"
  | "\\\\" -> "\\"
  | "\\${" -> "${"
  | "''$" -> "$"
  | "$$" -> "$"
  | "'''" -> "''"
  | "''\\t" -> "\t"
  | "''\\r" -> "\r"
  | "''\\n" -> "\n"
  | x ->
    failwith (Printf.sprintf "unescape unexpected arg %s" x)

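(* [collect_tokens lexer q lexbuf] is invoked after a "${" has been consumed
   inside a double-quoted or indented string: it emits AQUOTE_OPEN into [q],
   runs [lexer] with a fresh brace stack seeded with AQUOTE, and forwards every
   token until the matching '}' yields AQUOTE_CLOSE (or EOF is hit).
   For example, lexing the Nix string "a${b}c" produces
     STR_START "a"; AQUOTE_OPEN; ID "b"; AQUOTE_CLOSE; STR_MID "c"; STR_END. *)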
let collect_tokens lexer q lexbuf =
  let stack = ref [] in
  let queue = Stdlib.Queue.create () in
  let rec go () =
    match (try Some (Stdlib.Queue.take queue) with Stdlib.Queue.Empty -> None) with
    | Some token ->
      (
        match token, !stack with
        | AQUOTE_CLOSE, [] ->
          Stdlib.Queue.add AQUOTE_CLOSE q
        | EOF, _ ->
          Stdlib.Queue.add EOF q
        | _, _ ->
          Stdlib.Queue.add token q;
          go ()
      )
    | None ->
      lexer queue stack lexbuf;
      go ()
  in
  Stdlib.Queue.add AQUOTE_OPEN q;
  stack := [AQUOTE];
  lexer queue stack lexbuf;
  go ()

(* utility functions *)
let print_position lexbuf =
  let pos = Lexing.lexeme_start_p lexbuf in
  Printf.sprintf "%s:%d:%d" pos.pos_fname
    pos.pos_lnum (pos.pos_cnum - pos.pos_bol + 1)

let set_filename fname (lexbuf: Lexing.lexbuf) =
  let pos = lexbuf.lex_curr_p in
  lexbuf.lex_curr_p <- { pos with pos_fname = fname }; lexbuf

}

let nzdigit = ['1'-'9']
let digit = nzdigit | '0'
let float = (nzdigit digit* '.' digit* | '0'? '.' digit+) (['E' 'e'] ['+' '-']? digit+)?
let alpha = ['a'-'z' 'A'-'Z']
let alpha_digit = alpha | digit
let path_chr = alpha_digit | ['.' '_' '-' '+']
let path = path_chr* ('/' path_chr+)+
let spath = alpha_digit path_chr* ('/' path_chr+)*
let uri_chr = ['%' '/' '?' ':' '@' '&' '=' '+' '$' ',' '-' '_' '.' '!' '~' '*' '\'']
let scheme = alpha (alpha | ['+' '-' '.'])*
let uri = scheme ':' (alpha_digit | uri_chr)+
let char_tokens = ['.' '?' '!' '=' '<' '>' '[' ']' '+' '-' '*' '/' '^' '(' ')' ':' ';' ',' '@']

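(* Each call to [get_tokens] consumes one lexeme and appends the corresponding
   token(s) to the queue [q]; [s] is the stack of currently open braces, used
   to decide whether a '}' closes an antiquotation (AQUOTE_CLOSE) or an
   attribute set (RBRACE). *)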
rule get_tokens q s = parse
  (* skip whitespace *)
  | [' ' '\t' '\r']
    { get_tokens q s lexbuf }
  (* increase the line count on newlines *)
  | '\n'
    { Lexing.new_line lexbuf; get_tokens q s lexbuf }
  | char_tokens as c
    { Stdlib.Queue.add (Array.get char_table ((int_of_char c) - 1)) q }
  | ("//" | "++" | "<=" | ">=" | "==" | "!=" | "&&" | "||" | "->" | "...") as s
    { Stdlib.Queue.add (Hashtbl.find_exn str_table s) q }
  | digit+ as i
    { Stdlib.Queue.add (INT i) q }
  | float
    { Stdlib.Queue.add (FLOAT (Lexing.lexeme lexbuf)) q }
  | path
    { Stdlib.Queue.add (PATH (Lexing.lexeme lexbuf)) q }
  | '<' (spath as p) '>'
    { Stdlib.Queue.add (SPATH p) q }
  | '~' path as p
    { Stdlib.Queue.add (HPATH p) q }
  | uri
    { Stdlib.Queue.add (URI (Lexing.lexeme lexbuf)) q }
  (* keywords or identifiers *)
  | ((alpha | '_')+ (alpha_digit | ['_' '\'' '-'])*) as id
    { Stdlib.Queue.add (Hashtbl.find keyword_table id |> Option.value ~default:(ID id)) q }
  (* comments *)
  | '#' ([^ '\n']* as c)
    { ignore c; get_tokens q s lexbuf }
  | "/*"
    { comment (Buffer.create 64) lexbuf;
      get_tokens q s lexbuf
    }
  (* the following three tokens change the braces stack *)
  | "${"
    { Stdlib.Queue.add AQUOTE_OPEN q; s := AQUOTE :: !s }
  | '{'
    { Stdlib.Queue.add LBRACE q; s := SET :: !s }
  | '}'
    {
      match !s with
      | AQUOTE :: rest ->
        Stdlib.Queue.add AQUOTE_CLOSE q; s := rest
      | SET :: rest ->
        Stdlib.Queue.add RBRACE q; s := rest
      | _ ->
        let pos = print_position lexbuf in
        let err = Printf.sprintf "Unbalanced '}' at %s\n" pos in
        raise (Error err)
    }
  (* a double-quoted string *)
  | '"'
    { string `Start (Buffer.create 64) q lexbuf }
  (* an indented string *)
  | "''" (' '+ as ws)
    { istring `Start (Some (String.length ws)) (Buffer.create 64) q lexbuf }
  | "''"
    { istring `Start None (Buffer.create 64) q lexbuf }
  (* end of input *)
  | eof
    { Stdlib.Queue.add EOF q }
  (* any other character raises an exception *)
  | _
    {
      let pos = print_position lexbuf in
      let tok = Lexing.lexeme lexbuf in
      let err = Printf.sprintf "Unexpected character '%s' at %s\n" tok pos in
      raise (Error err)
    }

(* Nix does not allow nested comments, but it is still handy to lex them with a
   separate rule so that the line count can be increased properly. *)
and comment buf = parse
  | '\n'
    { Lexing.new_line lexbuf; Buffer.add_char buf '\n'; comment buf lexbuf }
  | "*/"
    { () }
  | _ as c
    { Buffer.add_char buf c; comment buf lexbuf }

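(* Lexes the body of a double-quoted string.  [state] is `Start until the first
   antiquotation and `Mid afterwards, so the pieces of the string come out as
   STR_START / STR_MID tokens, with STR_END emitted at the closing quote; a
   "${" hands control to [collect_tokens]. *)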
and string state buf q = parse
  | '"' (* terminate when we hit '"' *)
    { Stdlib.Queue.add (token_of_str state buf) q; Stdlib.Queue.add STR_END q }
  | '\n'
    { Lexing.new_line lexbuf; Buffer.add_char buf '\n'; string state buf q lexbuf }
  | ("\\n" | "\\r" | "\\t" | "\\\\" | "\\${") as s
    { Buffer.add_string buf (unescape s); string state buf q lexbuf }
  | "\\" (_ as c) (* add the character verbatim *)
    { Buffer.add_char buf c; string state buf q lexbuf }
  | "${" (* collect all the tokens till we hit the matching '}' *)
    {
      Stdlib.Queue.add (token_of_str state buf) q;
      collect_tokens get_tokens q lexbuf;
      string `Mid (Buffer.create 64) q lexbuf
    }
  | _ as c (* otherwise just add the character to the buffer *)
    { Buffer.add_char buf c; string state buf q lexbuf }

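(* Lexes the body of an indented string delimited by pairs of single quotes.
   [imin] tracks the minimal indentation seen after a newline so far (None if
   not yet known); the final ISTR_END token carries it so the parser can strip
   the common leading whitespace, as Nix does for indented strings. *)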
and istring state imin buf q = parse
  | ('\n' ' '* "''")
    {
      Lexing.new_line lexbuf;
      Buffer.add_string buf "\n";
      let indent = match imin with | None -> 0 | Some i -> i in
      Stdlib.Queue.add (token_of_istr state buf) q;
      Stdlib.Queue.add (ISTR_END indent) q
    }
  | "''"
    {
      let indent = match imin with | None -> 0 | Some i -> i in
      Stdlib.Queue.add (token_of_istr state buf) q;
      Stdlib.Queue.add (ISTR_END indent) q
    }
  | ('\n' ' '* '\n') as s
    {
      Lexing.new_line lexbuf;
      Lexing.new_line lexbuf;
      Buffer.add_string buf s;
      istring state imin buf q lexbuf
    }
  | ('\n' (' '* as ws)) as s
    {
      Lexing.new_line lexbuf;
      Buffer.add_string buf s;
      let ws_count = String.length ws in
      match imin with
      | None ->
        istring state (Some ws_count) buf q lexbuf
      | Some i ->
        istring state (Some (min i ws_count)) buf q lexbuf
    }
  | ("''$" | "'''" | "''\\t" | "''\\r" | "''\\n") as s
    { Buffer.add_string buf (unescape s); istring state imin buf q lexbuf }
  | "''\\" (_ as c)
    { Buffer.add_char buf c; istring state imin buf q lexbuf }
  | "${"
    {
      Stdlib.Queue.add (token_of_istr state buf) q;
      collect_tokens get_tokens q lexbuf;
      istring `Mid imin (Buffer.create 64) q lexbuf
    }
  | _ as c
    { Buffer.add_char buf c; istring state imin buf q lexbuf }

{

let rec next_token
    (q: token Stdlib.Queue.t)
    (s: braces list ref)
    (lexbuf: Lexing.lexbuf)
  : token =
  match (try Some (Stdlib.Queue.take q) with | Stdlib.Queue.Empty -> None) with
  | Some token ->
    token
  | None ->
    get_tokens q s lexbuf;
    next_token q s lexbuf
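
(* Hypothetical usage sketch (not part of this change): create the token queue
   and brace stack once per input and partially apply [next_token] to obtain a
   lexing function, e.g.

     let lex_file fname chan =
       let lexbuf = set_filename fname (Lexing.from_channel chan) in
       let q = Stdlib.Queue.create () in
       let s = ref [] in
       fun () -> next_token q s lexbuf
*)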
}