lib/nix/lexer.mll


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315

{
open Core
open Tokens

exception Error of string

(* Types of curly braces.
   AQUOTE corresponds to the braces for antiquotation, i.e. '${...}'
   and SET to an attribute set '{...}'.
 *)
type braces =
  | AQUOTE
  | SET

let print_stack s =
  let b = Buffer.create 100 in
  Buffer.add_string b "[ ";
  List.iter s ~f:(function
      | AQUOTE -> Buffer.add_string b "AQUOTE; "
      | SET -> Buffer.add_string b "SET; "
    );
  Buffer.add_string b "]";
  Buffer.contents b

let token_of_str state buf =
  match state with
        | `Start -> STR_START (Buffer.contents buf)
        | `Mid -> STR_MID (Buffer.contents buf)

let token_of_istr state buf =
  match state with
        | `Start -> ISTR_START (Buffer.contents buf)
        | `Mid -> ISTR_MID (Buffer.contents buf)

(* lookup table for one-character tokens *)
let char_table = Array.create ~len:94 EOF
let _ =
  List.iter ~f:(fun (k, v) -> Array.set char_table ((int_of_char k) - 1) v)
    [
      '.', SELECT;
      '?', QMARK;
      '!', NOT;
      '=', ASSIGN;
      '<', LT;
      '>', GT;
      '[', LBRACK;
      ']', RBRACK;
      '+', PLUS;
      '-', MINUS;
      '*', TIMES;
      '/', SLASH;
      '(', LPAREN;
      ')', RPAREN;
      ':', COLON;
      ';', SEMICOLON;
      ',', COMMA;
      '@', AS
    ]

(* lookup table for two- and three-character tokens *)
let str_table = Hashtbl.create (module String) ~size:10
let _ =
  List.iter ~f:(fun (kwd, tok) -> Hashtbl.set str_table ~key:kwd ~data:tok)
    [
      "//", MERGE;
      "++", CONCAT;
      "<=", LTE;
      ">=", GTE;
      "==", EQ;
      "!=", NEQ;
      "&&", AND;
      "||", OR;
      "->", IMPL;
      "...", ELLIPSIS
    ]

(* lookup table for keywords *)
let keyword_table = Hashtbl.create (module String) ~size:10
let _ =
  List.iter ~f:(fun (kwd, tok) -> Hashtbl.set keyword_table ~key:kwd ~data:tok)
    [ "with", WITH;
      "rec", REC;
      "let", LET;
      "in", IN;
      "inherit", INHERIT;
      "if" , IF;
      "then", THEN;
      "else", ELSE;
      "assert", ASSERT;
      "or", ORDEF ]

(* replace an escape sequence by the corresponding character(s) *)
let unescape = function
  | "\\n" -> "\n"
  | "\\r" -> "\r"
  | "\\t" -> "\t"
  | "\\\\" -> "\\"
  | "\\${" -> "${"
  | "''$" -> "$"
  | "$$" -> "$"
  | "'''" -> "''"
  | "''\\t" -> "\t"
  | "''\\r" -> "\r"
  | "''\\n" -> "\n"
  | x ->
    failwith (Printf.sprintf "unescape unexpected arg %s" x)

let collect_tokens lexer q lexbuf =
  let stack = ref [] in
  let queue = Stdlib.Queue.create () in
  let rec go () =
    match (try Some (Stdlib.Queue.take queue) with Stdlib.Queue.Empty -> None) with
    | Some token ->
      (
        match token, !stack with
        | AQUOTE_CLOSE, [] ->
          Stdlib.Queue.add AQUOTE_CLOSE q
        | EOF, _ ->
          Stdlib.Queue.add EOF q;
        | _, _ ->
          Stdlib.Queue.add token q;
          go ()
      )
    | None ->
      lexer queue stack lexbuf;
      go ()
  in
  Stdlib.Queue.add AQUOTE_OPEN q;
  stack := [AQUOTE];
  lexer queue stack lexbuf;
  go ()

(* utility functions *)
let print_position lexbuf =
  let pos = Lexing.lexeme_start_p lexbuf in
  Printf.sprintf "%s:%d:%d" pos.pos_fname
    pos.pos_lnum (pos.pos_cnum - pos.pos_bol + 1)


let set_filename fname (lexbuf: Lexing.lexbuf)  =
  let pos = lexbuf.lex_curr_p in
  lexbuf.lex_curr_p <- { pos with pos_fname = fname }; lexbuf

}

let nzdigit = ['1'-'9']
let digit = nzdigit | '0'
let float = (nzdigit digit* '.' digit* | '0'? '.' digit+) (['E' 'e'] ['+' '-']? digit+)?
let alpha = ['a'-'z' 'A'-'Z']
let alpha_digit = alpha | digit
let path_chr = alpha_digit | ['.' '_' '-' '+']
let path = path_chr* ('/' path_chr+)+
let spath = alpha_digit path_chr* ('/' path_chr+)*
let uri_chr = ['%' '/' '?' ':' '@' '&' '=' '+' '$' ',' '-' '_' '.' '!' '~' '*' '\'']
let scheme = alpha (alpha | ['+' '-' '.'])*
let uri = scheme ':' (alpha_digit | uri_chr)+
let char_tokens = ['.' '?' '!' '=' '<' '>' '[' ']' '+' '-' '*' '/' '^' '(' ')' ':' ';' ',' '@']

rule get_tokens q s = parse
(* skip whitespeces *)
| [' ' '\t' '\r']
    { get_tokens q s lexbuf }
(* increase line count for new lines *)
| '\n'
    { Lexing.new_line lexbuf; get_tokens q s lexbuf }
| char_tokens as c
    { Stdlib.Queue.add (Array.get char_table ((int_of_char c) - 1)) q }
| ("//" | "++" | "<=" | ">=" | "==" | "!=" | "&&" | "||" | "->" | "...") as s
    { Stdlib.Queue.add (Hashtbl.find_exn str_table s) q}
| digit+ as i
    { Stdlib.Queue.add (INT i) q }
| float
    { Stdlib.Queue.add (FLOAT (Lexing.lexeme lexbuf)) q }
| path
    { Stdlib.Queue.add (PATH (Lexing.lexeme lexbuf)) q }
| '<' (spath as p) '>'
    { Stdlib.Queue.add (SPATH  p) q }
| '~' path as p
    { Stdlib.Queue.add (HPATH  p) q }
| uri
    { Stdlib.Queue.add(URI (Lexing.lexeme lexbuf)) q }
(* keywords or identifiers *)
| ((alpha | '_')+ (alpha_digit | ['_' '\'' '-'])*) as id
    { Stdlib.Queue.add (Hashtbl.find keyword_table id |> Option.value ~default:(ID id)) q}
(* comments *)
| '#' ([^ '\n']* as c)
    { ignore c; get_tokens q s lexbuf}
| "/*"
    { comment (Buffer.create 64) lexbuf;
      get_tokens q s lexbuf
    }
(* the following three tokens change the braces stack *)
| "${"
    { Stdlib.Queue.add AQUOTE_OPEN q; s := AQUOTE :: !s }
| '{'
    { Stdlib.Queue.add LBRACE q; s := SET :: !s }
| '}'
    {
      match !s with
      | AQUOTE :: rest ->
        Stdlib.Queue.add AQUOTE_CLOSE q; s := rest
      | SET :: rest ->
        Stdlib.Queue.add RBRACE q; s := rest
      | _ ->
        let pos = print_position lexbuf in
        let err = Printf.sprintf "Unbalanced '}' at %s\n" pos in
        raise (Error err)
    }
(* a double-quoted string *)
| '"'
    { string `Start (Buffer.create 64) q lexbuf }
(* an indented string *)
| "''" (' '+ as ws)
    { istring `Start (Some (String.length ws)) (Buffer.create 64) q lexbuf }
| "''"
    { istring `Start None (Buffer.create 64) q lexbuf }
(* End of input *)
| eof
    { Stdlib.Queue.add EOF q }
(* any other character raises an exception *)
| _
    {
      let pos = print_position lexbuf in
      let tok = Lexing.lexeme lexbuf in
      let err = Printf.sprintf "Unexpected character '%s' at %s\n" tok pos in
      raise (Error err)
    }

(* Nix does not allow nested comments, but it is still handy to lex it
   separately because we can properly increase line count. *)
and comment buf = parse
  | '\n'
    {Lexing.new_line lexbuf; Buffer.add_char buf '\n'; comment buf lexbuf}
  | "*/"
    { () }
  | _ as c
    { Buffer.add_char buf c; comment buf lexbuf }

and string state buf q = parse
  | '"'                         (* terminate when we hit '"' *)
    { Stdlib.Queue.add (token_of_str state buf) q; Stdlib.Queue.add STR_END q }
  | '\n'
    { Lexing.new_line lexbuf; Buffer.add_char buf '\n'; string state buf q lexbuf }
  | ("\\n" | "\\r" | "\\t" | "\\\\" | "\\${") as s
      { Buffer.add_string buf (unescape s); string state buf q lexbuf }
  | "\\" (_ as c)               (* add the character verbatim *)
      { Buffer.add_char buf c; string state buf q lexbuf }
  | "${"               (* collect all the tokens till we hit the matching '}' *)
    {
      Stdlib.Queue.add (token_of_str state buf) q;
      collect_tokens get_tokens q lexbuf;
      string `Mid (Buffer.create 64) q lexbuf
    }
  | _ as c                  (* otherwise just add the character to the buffer *)
    { Buffer.add_char buf c; string state buf q lexbuf }

and istring state imin buf q = parse
  | ('\n' ' '* "''")
      {
        Lexing.new_line lexbuf;
        Buffer.add_string buf "\n";
        let indent = match imin with | None -> 0 | Some i -> i in
        Stdlib.Queue.add (token_of_istr state buf) q;
        Stdlib.Queue.add (ISTR_END indent) q
      }
  | "''"
      {
        let indent = match imin with | None -> 0 | Some i -> i in
        Stdlib.Queue.add (token_of_istr state buf) q;
        Stdlib.Queue.add (ISTR_END indent) q
      }
  | ('\n' ' '* '\n') as s
    {
      Lexing.new_line lexbuf;
      Lexing.new_line lexbuf;
      Buffer.add_string buf s;
      istring state imin buf q lexbuf
    }
  | ('\n' (' '* as ws)) as s
    {
      Lexing.new_line lexbuf;
      Buffer.add_string buf s;
      let ws_count = String.length ws in
      match imin with
      | None ->
        istring state (Some ws_count) buf q lexbuf
      | Some i ->
        istring state (Some (min i ws_count)) buf q lexbuf
    }
  | ("''$" | "'''" | "''\\t" | "''\\r" | "''\\n") as s
      { Buffer.add_string buf (unescape s); istring state imin buf q lexbuf }
  | "''\\" (_ as c)
      { Buffer.add_char buf c; istring state imin buf q lexbuf }
  | "${"
    {
      Stdlib.Queue.add (token_of_istr state buf) q;
      collect_tokens get_tokens q lexbuf;
      istring `Mid imin (Buffer.create 64) q lexbuf
    }
  | _ as c
    { Buffer.add_char buf c; istring state imin buf q lexbuf }
{

let rec next_token
    (q: token Stdlib.Queue.t)
    (s: braces list ref)
    (lexbuf: Lexing.lexbuf)
  : token =
  match (try Some (Stdlib.Queue.take q) with | Stdlib.Queue.Empty -> None) with
  | Some token ->
    token
  | None ->
    get_tokens q s lexbuf;
    next_token q s lexbuf
}