{ open Core open Tokens exception Error of string (* Types of curly braces. AQUOTE corresponds to the braces for antiquotation, i.e. '${...}' and SET to an attribute set '{...}'. *) type braces = | AQUOTE | SET let print_stack s = let b = Buffer.create 100 in Buffer.add_string b "[ "; List.iter s ~f:(function | AQUOTE -> Buffer.add_string b "AQUOTE; " | SET -> Buffer.add_string b "SET; " ); Buffer.add_string b "]"; Buffer.contents b let token_of_str state buf = match state with | `Start -> STR_START (Buffer.contents buf) | `Mid -> STR_MID (Buffer.contents buf) let token_of_istr state buf = match state with | `Start -> ISTR_START (Buffer.contents buf) | `Mid -> ISTR_MID (Buffer.contents buf) (* lookup table for one-character tokens *) let char_table = Array.create ~len:94 EOF let _ = List.iter ~f:(fun (k, v) -> Array.set char_table ((int_of_char k) - 1) v) [ '.', SELECT; '?', QMARK; '!', NOT; '=', ASSIGN; '<', LT; '>', GT; '[', LBRACK; ']', RBRACK; '+', PLUS; '-', MINUS; '*', TIMES; '/', SLASH; '(', LPAREN; ')', RPAREN; ':', COLON; ';', SEMICOLON; ',', COMMA; '@', AS ] (* lookup table for two- and three-character tokens *) let str_table = Hashtbl.create (module String) ~size:10 let _ = List.iter ~f:(fun (kwd, tok) -> Hashtbl.set str_table ~key:kwd ~data:tok) [ "//", MERGE; "++", CONCAT; "<=", LTE; ">=", GTE; "==", EQ; "!=", NEQ; "&&", AND; "||", OR; "->", IMPL; "...", ELLIPSIS ] (* lookup table for keywords *) let keyword_table = Hashtbl.create (module String) ~size:10 let _ = List.iter ~f:(fun (kwd, tok) -> Hashtbl.set keyword_table ~key:kwd ~data:tok) [ "with", WITH; "rec", REC; "let", LET; "in", IN; "inherit", INHERIT; "if" , IF; "then", THEN; "else", ELSE; "assert", ASSERT; "or", ORDEF ] (* replace an escape sequence by the corresponding character(s) *) let unescape = function | "\\n" -> "\n" | "\\r" -> "\r" | "\\t" -> "\t" | "\\\\" -> "\\" | "\\${" -> "${" | "''$" -> "$" | "$$" -> "$" | "'''" -> "''" | "''\\t" -> "\t" | "''\\r" -> "\r" | "''\\n" -> "\n" | x -> failwith (Printf.sprintf "unescape unexpected arg %s" x) let collect_tokens lexer q lexbuf = let stack = ref [] in let queue = Stdlib.Queue.create () in let rec go () = match (try Some (Stdlib.Queue.take queue) with Stdlib.Queue.Empty -> None) with | Some token -> ( match token, !stack with | AQUOTE_CLOSE, [] -> Stdlib.Queue.add AQUOTE_CLOSE q | EOF, _ -> Stdlib.Queue.add EOF q; | _, _ -> Stdlib.Queue.add token q; go () ) | None -> lexer queue stack lexbuf; go () in Stdlib.Queue.add AQUOTE_OPEN q; stack := [AQUOTE]; lexer queue stack lexbuf; go () (* utility functions *) let print_position lexbuf = let pos = Lexing.lexeme_start_p lexbuf in Printf.sprintf "%s:%d:%d" pos.pos_fname pos.pos_lnum (pos.pos_cnum - pos.pos_bol + 1) let set_filename fname (lexbuf: Lexing.lexbuf) = let pos = lexbuf.lex_curr_p in lexbuf.lex_curr_p <- { pos with pos_fname = fname }; lexbuf } let nzdigit = ['1'-'9'] let digit = nzdigit | '0' let float = (nzdigit digit* '.' digit* | '0'? '.' digit+) (['E' 'e'] ['+' '-']? digit+)? let alpha = ['a'-'z' 'A'-'Z'] let alpha_digit = alpha | digit let path_chr = alpha_digit | ['.' '_' '-' '+'] let path = path_chr* ('/' path_chr+)+ let spath = alpha_digit path_chr* ('/' path_chr+)* let uri_chr = ['%' '/' '?' ':' '@' '&' '=' '+' '$' ',' '-' '_' '.' '!' '~' '*' '\''] let scheme = alpha (alpha | ['+' '-' '.'])* let uri = scheme ':' (alpha_digit | uri_chr)+ let char_tokens = ['.' '?' '!' '=' '<' '>' '[' ']' '+' '-' '*' '/' '^' '(' ')' ':' ';' ',' '@'] rule get_tokens q s = parse (* skip whitespeces *) | [' ' '\t' '\r'] { get_tokens q s lexbuf } (* increase line count for new lines *) | '\n' { Lexing.new_line lexbuf; get_tokens q s lexbuf } | char_tokens as c { Stdlib.Queue.add (Array.get char_table ((int_of_char c) - 1)) q } | ("//" | "++" | "<=" | ">=" | "==" | "!=" | "&&" | "||" | "->" | "...") as s { Stdlib.Queue.add (Hashtbl.find_exn str_table s) q} | digit+ as i { Stdlib.Queue.add (INT i) q } | float { Stdlib.Queue.add (FLOAT (Lexing.lexeme lexbuf)) q } | path { Stdlib.Queue.add (PATH (Lexing.lexeme lexbuf)) q } | '<' (spath as p) '>' { Stdlib.Queue.add (SPATH p) q } | '~' path as p { Stdlib.Queue.add (HPATH p) q } | uri { Stdlib.Queue.add(URI (Lexing.lexeme lexbuf)) q } (* keywords or identifiers *) | ((alpha | '_')+ (alpha_digit | ['_' '\'' '-'])*) as id { Stdlib.Queue.add (Hashtbl.find keyword_table id |> Option.value ~default:(ID id)) q} (* comments *) | '#' ([^ '\n']* as c) { ignore c; get_tokens q s lexbuf} | "/*" { comment (Buffer.create 64) lexbuf; get_tokens q s lexbuf } (* the following three tokens change the braces stack *) | "${" { Stdlib.Queue.add AQUOTE_OPEN q; s := AQUOTE :: !s } | '{' { Stdlib.Queue.add LBRACE q; s := SET :: !s } | '}' { match !s with | AQUOTE :: rest -> Stdlib.Queue.add AQUOTE_CLOSE q; s := rest | SET :: rest -> Stdlib.Queue.add RBRACE q; s := rest | _ -> let pos = print_position lexbuf in let err = Printf.sprintf "Unbalanced '}' at %s\n" pos in raise (Error err) } (* a double-quoted string *) | '"' { string `Start (Buffer.create 64) q lexbuf } (* an indented string *) | "''" (' '+ as ws) { istring `Start (Some (String.length ws)) (Buffer.create 64) q lexbuf } | "''" { istring `Start None (Buffer.create 64) q lexbuf } (* End of input *) | eof { Stdlib.Queue.add EOF q } (* any other character raises an exception *) | _ { let pos = print_position lexbuf in let tok = Lexing.lexeme lexbuf in let err = Printf.sprintf "Unexpected character '%s' at %s\n" tok pos in raise (Error err) } (* Nix does not allow nested comments, but it is still handy to lex it separately because we can properly increase line count. *) and comment buf = parse | '\n' {Lexing.new_line lexbuf; Buffer.add_char buf '\n'; comment buf lexbuf} | "*/" { () } | _ as c { Buffer.add_char buf c; comment buf lexbuf } and string state buf q = parse | '"' (* terminate when we hit '"' *) { Stdlib.Queue.add (token_of_str state buf) q; Stdlib.Queue.add STR_END q } | '\n' { Lexing.new_line lexbuf; Buffer.add_char buf '\n'; string state buf q lexbuf } | ("\\n" | "\\r" | "\\t" | "\\\\" | "\\${") as s { Buffer.add_string buf (unescape s); string state buf q lexbuf } | "\\" (_ as c) (* add the character verbatim *) { Buffer.add_char buf c; string state buf q lexbuf } | "${" (* collect all the tokens till we hit the matching '}' *) { Stdlib.Queue.add (token_of_str state buf) q; collect_tokens get_tokens q lexbuf; string `Mid (Buffer.create 64) q lexbuf } | _ as c (* otherwise just add the character to the buffer *) { Buffer.add_char buf c; string state buf q lexbuf } and istring state imin buf q = parse | ('\n' ' '* "''") { Lexing.new_line lexbuf; Buffer.add_string buf "\n"; let indent = match imin with | None -> 0 | Some i -> i in Stdlib.Queue.add (token_of_istr state buf) q; Stdlib.Queue.add (ISTR_END indent) q } | "''" { let indent = match imin with | None -> 0 | Some i -> i in Stdlib.Queue.add (token_of_istr state buf) q; Stdlib.Queue.add (ISTR_END indent) q } | ('\n' ' '* '\n') as s { Lexing.new_line lexbuf; Lexing.new_line lexbuf; Buffer.add_string buf s; istring state imin buf q lexbuf } | ('\n' (' '* as ws)) as s { Lexing.new_line lexbuf; Buffer.add_string buf s; let ws_count = String.length ws in match imin with | None -> istring state (Some ws_count) buf q lexbuf | Some i -> istring state (Some (min i ws_count)) buf q lexbuf } | ("''$" | "'''" | "''\\t" | "''\\r" | "''\\n") as s { Buffer.add_string buf (unescape s); istring state imin buf q lexbuf } | "''\\" (_ as c) { Buffer.add_char buf c; istring state imin buf q lexbuf } | "${" { Stdlib.Queue.add (token_of_istr state buf) q; collect_tokens get_tokens q lexbuf; istring `Mid imin (Buffer.create 64) q lexbuf } | _ as c { Buffer.add_char buf c; istring state imin buf q lexbuf } { let rec next_token (q: token Stdlib.Queue.t) (s: braces list ref) (lexbuf: Lexing.lexbuf) : token = match (try Some (Stdlib.Queue.take q) with | Stdlib.Queue.Empty -> None) with | Some token -> token | None -> get_tokens q s lexbuf; next_token q s lexbuf }