From ba61dfd69504ec6263a9dee9931d93adeb6f3142 Mon Sep 17 00:00:00 2001 From: Rutger Broekhoff Date: Mon, 7 Jul 2025 21:52:08 +0200 Subject: Initialize repository --- lib/nix/lexer.mll | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 lib/nix/lexer.mll (limited to 'lib/nix/lexer.mll') diff --git a/lib/nix/lexer.mll b/lib/nix/lexer.mll new file mode 100644 index 0000000..023d888 --- /dev/null +++ b/lib/nix/lexer.mll @@ -0,0 +1,315 @@ +{ +open Core +open Tokens + +exception Error of string + +(* Types of curly braces. + AQUOTE corresponds to the braces for antiquotation, i.e. '${...}' + and SET to an attribute set '{...}'. + *) +type braces = + | AQUOTE + | SET + +let print_stack s = + let b = Buffer.create 100 in + Buffer.add_string b "[ "; + List.iter s ~f:(function + | AQUOTE -> Buffer.add_string b "AQUOTE; " + | SET -> Buffer.add_string b "SET; " + ); + Buffer.add_string b "]"; + Buffer.contents b + +let token_of_str state buf = + match state with + | `Start -> STR_START (Buffer.contents buf) + | `Mid -> STR_MID (Buffer.contents buf) + +let token_of_istr state buf = + match state with + | `Start -> ISTR_START (Buffer.contents buf) + | `Mid -> ISTR_MID (Buffer.contents buf) + +(* lookup table for one-character tokens *) +let char_table = Array.create ~len:94 EOF +let _ = + List.iter ~f:(fun (k, v) -> Array.set char_table ((int_of_char k) - 1) v) + [ + '.', SELECT; + '?', QMARK; + '!', NOT; + '=', ASSIGN; + '<', LT; + '>', GT; + '[', LBRACK; + ']', RBRACK; + '+', PLUS; + '-', MINUS; + '*', TIMES; + '/', SLASH; + '(', LPAREN; + ')', RPAREN; + ':', COLON; + ';', SEMICOLON; + ',', COMMA; + '@', AS + ] + +(* lookup table for two- and three-character tokens *) +let str_table = Hashtbl.create (module String) ~size:10 +let _ = + List.iter ~f:(fun (kwd, tok) -> Hashtbl.set str_table ~key:kwd ~data:tok) + [ + "//", MERGE; + "++", CONCAT; + "<=", LTE; + ">=", GTE; + "==", EQ; + "!=", NEQ; + "&&", AND; + "||", OR; + "->", IMPL; + "...", ELLIPSIS + ] + +(* lookup table for keywords *) +let keyword_table = Hashtbl.create (module String) ~size:10 +let _ = + List.iter ~f:(fun (kwd, tok) -> Hashtbl.set keyword_table ~key:kwd ~data:tok) + [ "with", WITH; + "rec", REC; + "let", LET; + "in", IN; + "inherit", INHERIT; + "if" , IF; + "then", THEN; + "else", ELSE; + "assert", ASSERT; + "or", ORDEF ] + +(* replace an escape sequence by the corresponding character(s) *) +let unescape = function + | "\\n" -> "\n" + | "\\r" -> "\r" + | "\\t" -> "\t" + | "\\\\" -> "\\" + | "\\${" -> "${" + | "''$" -> "$" + | "$$" -> "$" + | "'''" -> "''" + | "''\\t" -> "\t" + | "''\\r" -> "\r" + | "''\\n" -> "\n" + | x -> + failwith (Printf.sprintf "unescape unexpected arg %s" x) + +let collect_tokens lexer q lexbuf = + let stack = ref [] in + let queue = Stdlib.Queue.create () in + let rec go () = + match (try Some (Stdlib.Queue.take queue) with Stdlib.Queue.Empty -> None) with + | Some token -> + ( + match token, !stack with + | AQUOTE_CLOSE, [] -> + Stdlib.Queue.add AQUOTE_CLOSE q + | EOF, _ -> + Stdlib.Queue.add EOF q; + | _, _ -> + Stdlib.Queue.add token q; + go () + ) + | None -> + lexer queue stack lexbuf; + go () + in + Stdlib.Queue.add AQUOTE_OPEN q; + stack := [AQUOTE]; + lexer queue stack lexbuf; + go () + +(* utility functions *) +let print_position lexbuf = + let pos = Lexing.lexeme_start_p lexbuf in + Printf.sprintf "%s:%d:%d" pos.pos_fname + pos.pos_lnum (pos.pos_cnum - pos.pos_bol + 1) + + +let set_filename fname (lexbuf: Lexing.lexbuf) = + let pos = lexbuf.lex_curr_p in + lexbuf.lex_curr_p <- { pos with pos_fname = fname }; lexbuf + +} + +let nzdigit = ['1'-'9'] +let digit = nzdigit | '0' +let float = (nzdigit digit* '.' digit* | '0'? '.' digit+) (['E' 'e'] ['+' '-']? digit+)? +let alpha = ['a'-'z' 'A'-'Z'] +let alpha_digit = alpha | digit +let path_chr = alpha_digit | ['.' '_' '-' '+'] +let path = path_chr* ('/' path_chr+)+ +let spath = alpha_digit path_chr* ('/' path_chr+)* +let uri_chr = ['%' '/' '?' ':' '@' '&' '=' '+' '$' ',' '-' '_' '.' '!' '~' '*' '\''] +let scheme = alpha (alpha | ['+' '-' '.'])* +let uri = scheme ':' (alpha_digit | uri_chr)+ +let char_tokens = ['.' '?' '!' '=' '<' '>' '[' ']' '+' '-' '*' '/' '^' '(' ')' ':' ';' ',' '@'] + +rule get_tokens q s = parse +(* skip whitespeces *) +| [' ' '\t' '\r'] + { get_tokens q s lexbuf } +(* increase line count for new lines *) +| '\n' + { Lexing.new_line lexbuf; get_tokens q s lexbuf } +| char_tokens as c + { Stdlib.Queue.add (Array.get char_table ((int_of_char c) - 1)) q } +| ("//" | "++" | "<=" | ">=" | "==" | "!=" | "&&" | "||" | "->" | "...") as s + { Stdlib.Queue.add (Hashtbl.find_exn str_table s) q} +| digit+ as i + { Stdlib.Queue.add (INT i) q } +| float + { Stdlib.Queue.add (FLOAT (Lexing.lexeme lexbuf)) q } +| path + { Stdlib.Queue.add (PATH (Lexing.lexeme lexbuf)) q } +| '<' (spath as p) '>' + { Stdlib.Queue.add (SPATH p) q } +| '~' path as p + { Stdlib.Queue.add (HPATH p) q } +| uri + { Stdlib.Queue.add(URI (Lexing.lexeme lexbuf)) q } +(* keywords or identifiers *) +| ((alpha | '_')+ (alpha_digit | ['_' '\'' '-'])*) as id + { Stdlib.Queue.add (Hashtbl.find keyword_table id |> Option.value ~default:(ID id)) q} +(* comments *) +| '#' ([^ '\n']* as c) + { ignore c; get_tokens q s lexbuf} +| "/*" + { comment (Buffer.create 64) lexbuf; + get_tokens q s lexbuf + } +(* the following three tokens change the braces stack *) +| "${" + { Stdlib.Queue.add AQUOTE_OPEN q; s := AQUOTE :: !s } +| '{' + { Stdlib.Queue.add LBRACE q; s := SET :: !s } +| '}' + { + match !s with + | AQUOTE :: rest -> + Stdlib.Queue.add AQUOTE_CLOSE q; s := rest + | SET :: rest -> + Stdlib.Queue.add RBRACE q; s := rest + | _ -> + let pos = print_position lexbuf in + let err = Printf.sprintf "Unbalanced '}' at %s\n" pos in + raise (Error err) + } +(* a double-quoted string *) +| '"' + { string `Start (Buffer.create 64) q lexbuf } +(* an indented string *) +| "''" (' '+ as ws) + { istring `Start (Some (String.length ws)) (Buffer.create 64) q lexbuf } +| "''" + { istring `Start None (Buffer.create 64) q lexbuf } +(* End of input *) +| eof + { Stdlib.Queue.add EOF q } +(* any other character raises an exception *) +| _ + { + let pos = print_position lexbuf in + let tok = Lexing.lexeme lexbuf in + let err = Printf.sprintf "Unexpected character '%s' at %s\n" tok pos in + raise (Error err) + } + +(* Nix does not allow nested comments, but it is still handy to lex it + separately because we can properly increase line count. *) +and comment buf = parse + | '\n' + {Lexing.new_line lexbuf; Buffer.add_char buf '\n'; comment buf lexbuf} + | "*/" + { () } + | _ as c + { Buffer.add_char buf c; comment buf lexbuf } + +and string state buf q = parse + | '"' (* terminate when we hit '"' *) + { Stdlib.Queue.add (token_of_str state buf) q; Stdlib.Queue.add STR_END q } + | '\n' + { Lexing.new_line lexbuf; Buffer.add_char buf '\n'; string state buf q lexbuf } + | ("\\n" | "\\r" | "\\t" | "\\\\" | "\\${") as s + { Buffer.add_string buf (unescape s); string state buf q lexbuf } + | "\\" (_ as c) (* add the character verbatim *) + { Buffer.add_char buf c; string state buf q lexbuf } + | "${" (* collect all the tokens till we hit the matching '}' *) + { + Stdlib.Queue.add (token_of_str state buf) q; + collect_tokens get_tokens q lexbuf; + string `Mid (Buffer.create 64) q lexbuf + } + | _ as c (* otherwise just add the character to the buffer *) + { Buffer.add_char buf c; string state buf q lexbuf } + +and istring state imin buf q = parse + | ('\n' ' '* "''") + { + Lexing.new_line lexbuf; + Buffer.add_string buf "\n"; + let indent = match imin with | None -> 0 | Some i -> i in + Stdlib.Queue.add (token_of_istr state buf) q; + Stdlib.Queue.add (ISTR_END indent) q + } + | "''" + { + let indent = match imin with | None -> 0 | Some i -> i in + Stdlib.Queue.add (token_of_istr state buf) q; + Stdlib.Queue.add (ISTR_END indent) q + } + | ('\n' ' '* '\n') as s + { + Lexing.new_line lexbuf; + Lexing.new_line lexbuf; + Buffer.add_string buf s; + istring state imin buf q lexbuf + } + | ('\n' (' '* as ws)) as s + { + Lexing.new_line lexbuf; + Buffer.add_string buf s; + let ws_count = String.length ws in + match imin with + | None -> + istring state (Some ws_count) buf q lexbuf + | Some i -> + istring state (Some (min i ws_count)) buf q lexbuf + } + | ("''$" | "'''" | "''\\t" | "''\\r" | "''\\n") as s + { Buffer.add_string buf (unescape s); istring state imin buf q lexbuf } + | "''\\" (_ as c) + { Buffer.add_char buf c; istring state imin buf q lexbuf } + | "${" + { + Stdlib.Queue.add (token_of_istr state buf) q; + collect_tokens get_tokens q lexbuf; + istring `Mid imin (Buffer.create 64) q lexbuf + } + | _ as c + { Buffer.add_char buf c; istring state imin buf q lexbuf } +{ + +let rec next_token + (q: token Stdlib.Queue.t) + (s: braces list ref) + (lexbuf: Lexing.lexbuf) + : token = + match (try Some (Stdlib.Queue.take q) with | Stdlib.Queue.Empty -> None) with + | Some token -> + token + | None -> + get_tokens q s lexbuf; + next_token q s lexbuf +} -- cgit v1.2.3