verified-dyn-lang-interp - Artifact for https://doi.org/10.1145/3747537

{
open Core
open Tokens

exception Error of string

(* Types of curly braces.
   AQUOTE corresponds to the braces for antiquotation, i.e. '${...}'
   and SET to an attribute set '{...}'.
 *)
type braces =
  | AQUOTE
  | SET

let print_stack s =
  let b = Buffer.create 100 in
  Buffer.add_string b "[ ";
  List.iter s ~f:(function
      | AQUOTE -> Buffer.add_string b "AQUOTE; "
      | SET -> Buffer.add_string b "SET; "
    );
  Buffer.add_string b "]";
  Buffer.contents b

let token_of_str state buf =
  match state with
        | `Start -> STR_START (Buffer.contents buf)
        | `Mid -> STR_MID (Buffer.contents buf)

let token_of_istr state buf =
  match state with
        | `Start -> ISTR_START (Buffer.contents buf)
        | `Mid -> ISTR_MID (Buffer.contents buf)

(* lookup table for one-character tokens *)
let char_table = Array.create ~len:94 EOF
let _ =
  List.iter ~f:(fun (k, v) -> Array.set char_table ((int_of_char k) - 1) v)
    [
      '.', SELECT;
      '?', QMARK;
      '!', NOT;
      '=', ASSIGN;
      '<', LT;
      '>', GT;
      '[', LBRACK;
      ']', RBRACK;
      '+', PLUS;
      '-', MINUS;
      '*', TIMES;
      '/', SLASH;
      '(', LPAREN;
      ')', RPAREN;
      ':', COLON;
      ';', SEMICOLON;
      ',', COMMA;
      '@', AS
    ]

(* lookup table for two- and three-character tokens *)
let str_table = Hashtbl.create (module String) ~size:10
let _ =
  List.iter ~f:(fun (kwd, tok) -> Hashtbl.set str_table ~key:kwd ~data:tok)
    [
      "//", MERGE;
      "++", CONCAT;
      "<=", LTE;
      ">=", GTE;
      "==", EQ;
      "!=", NEQ;
      "&&", AND;
      "||", OR;
      "->", IMPL;
      "...", ELLIPSIS
    ]

(* lookup table for keywords *)
let keyword_table = Hashtbl.create (module String) ~size:10
let _ =
  List.iter ~f:(fun (kwd, tok) -> Hashtbl.set keyword_table ~key:kwd ~data:tok)
    [ "with", WITH;
      "rec", REC;
      "let", LET;
      "in", IN;
      "inherit", INHERIT;
      "if" , IF;
      "then", THEN;
      "else", ELSE;
      "assert", ASSERT;
      "or", ORDEF ]

(* replace an escape sequence by the corresponding character(s) *)
let unescape = function
  | "\\n" -> "\n"
  | "\\r" -> "\r"
  | "\\t" -> "\t"
  | "\\\\" -> "\\"
  | "\\${" -> "${"
  | "''$" -> "$"
  | "$$" -> "$"
  | "'''" -> "''"
  | "''\\t" -> "\t"
  | "''\\r" -> "\r"
  | "''\\n" -> "\n"
  | x ->
    failwith (Printf.sprintf "unescape unexpected arg %s" x)

let collect_tokens lexer q lexbuf =
  let stack = ref [] in
  let queue = Stdlib.Queue.create () in
  let rec go () =
    match (try Some (Stdlib.Queue.take queue) with Stdlib.Queue.Empty -> None) with
    | Some token ->
      (
        match token, !stack with
        | AQUOTE_CLOSE, [] ->
          Stdlib.Queue.add AQUOTE_CLOSE q
        | EOF, _ ->
          Stdlib.Queue.add EOF q;
        | _, _ ->
          Stdlib.Queue.add token q;
          go ()
      )
    | None ->
      lexer queue stack lexbuf;
      go ()
  in
  Stdlib.Queue.add AQUOTE_OPEN q;
  stack := [AQUOTE];
  lexer queue stack lexbuf;
  go ()

(* utility functions *)
let print_position lexbuf =
  let pos = Lexing.lexeme_start_p lexbuf in
  Printf.sprintf "%s:%d:%d" pos.pos_fname
    pos.pos_lnum (pos.pos_cnum - pos.pos_bol + 1)


let set_filename fname (lexbuf: Lexing.lexbuf)  =
  let pos = lexbuf.lex_curr_p in
  lexbuf.lex_curr_p <- { pos with pos_fname = fname }; lexbuf

}

let nzdigit = ['1'-'9']
let digit = nzdigit | '0'
let float = (nzdigit digit* '.' digit* | '0'? '.' digit+) (['E' 'e'] ['+' '-']? digit+)?
let alpha = ['a'-'z' 'A'-'Z']
let alpha_digit = alpha | digit
let path_chr = alpha_digit | ['.' '_' '-' '+']
let path = path_chr* ('/' path_chr+)+
let spath = alpha_digit path_chr* ('/' path_chr+)*
let uri_chr = ['%' '/' '?' ':' '@' '&' '=' '+' '$' ',' '-' '_' '.' '!' '~' '*' '\'']
let scheme = alpha (alpha | ['+' '-' '.'])*
let uri = scheme ':' (alpha_digit | uri_chr)+
let char_tokens = ['.' '?' '!' '=' '<' '>' '[' ']' '+' '-' '*' '/' '^' '(' ')' ':' ';' ',' '@']

rule get_tokens q s = parse
(* skip whitespeces *)
| [' ' '\t' '\r']
    { get_tokens q s lexbuf }
(* increase line count for new lines *)
| '\n'
    { Lexing.new_line lexbuf; get_tokens q s lexbuf }
| char_tokens as c
    { Stdlib.Queue.add (Array.get char_table ((int_of_char c) - 1)) q }
| ("//" | "++" | "<=" | ">=" | "==" | "!=" | "&&" | "||" | "->" | "...") as s
    { Stdlib.Queue.add (Hashtbl.find_exn str_table s) q}
| digit+ as i
    { Stdlib.Queue.add (INT i) q }
| float
    { Stdlib.Queue.add (FLOAT (Lexing.lexeme lexbuf)) q }
| path
    { Stdlib.Queue.add (PATH (Lexing.lexeme lexbuf)) q }
| '<' (spath as p) '>'
    { Stdlib.Queue.add (SPATH  p) q }
| '~' path as p
    { Stdlib.Queue.add (HPATH  p) q }
| uri
    { Stdlib.Queue.add(URI (Lexing.lexeme lexbuf)) q }
(* keywords or identifiers *)
| ((alpha | '_')+ (alpha_digit | ['_' '\'' '-'])*) as id
    { Stdlib.Queue.add (Hashtbl.find keyword_table id |> Option.value ~default:(ID id)) q}
(* comments *)
| '#' ([^ '\n']* as c)
    { ignore c; get_tokens q s lexbuf}
| "/*"
    { comment (Buffer.create 64) lexbuf;
      get_tokens q s lexbuf
    }
(* the following three tokens change the braces stack *)
| "${"
    { Stdlib.Queue.add AQUOTE_OPEN q; s := AQUOTE :: !s }
| '{'
    { Stdlib.Queue.add LBRACE q; s := SET :: !s }
| '}'
    {
      match !s with
      | AQUOTE :: rest ->
        Stdlib.Queue.add AQUOTE_CLOSE q; s := rest
      | SET :: rest ->
        Stdlib.Queue.add RBRACE q; s := rest
      | _ ->
        let pos = print_position lexbuf in
        let err = Printf.sprintf "Unbalanced '}' at %s\n" pos in
        raise (Error err)
    }
(* a double-quoted string *)
| '"'
    { string `Start (Buffer.create 64) q lexbuf }
(* an indented string *)
| "''" (' '+ as ws)
    { istring `Start (Some (String.length ws)) (Buffer.create 64) q lexbuf }
| "''"
    { istring `Start None (Buffer.create 64) q lexbuf }
(* End of input *)
| eof
    { Stdlib.Queue.add EOF q }
(* any other character raises an exception *)
| _
    {
      let pos = print_position lexbuf in
      let tok = Lexing.lexeme lexbuf in
      let err = Printf.sprintf "Unexpected character '%s' at %s\n" tok pos in
      raise (Error err)
    }

(* Nix does not allow nested comments, but it is still handy to lex it
   separately because we can properly increase line count. *)
and comment buf = parse
  | '\n'
    {Lexing.new_line lexbuf; Buffer.add_char buf '\n'; comment buf lexbuf}
  | "*/"
    { () }
  | _ as c
    { Buffer.add_char buf c; comment buf lexbuf }

and string state buf q = parse
  | '"'                         (* terminate when we hit '"' *)
    { Stdlib.Queue.add (token_of_str state buf) q; Stdlib.Queue.add STR_END q }
  | '\n'
    { Lexing.new_line lexbuf; Buffer.add_char buf '\n'; string state buf q lexbuf }
  | ("\\n" | "\\r" | "\\t" | "\\\\" | "\\${") as s
      { Buffer.add_string buf (unescape s); string state buf q lexbuf }
  | "\\" (_ as c)               (* add the character verbatim *)
      { Buffer.add_char buf c; string state buf q lexbuf }
  | "${"               (* collect all the tokens till we hit the matching '}' *)
    {
      Stdlib.Queue.add (token_of_str state buf) q;
      collect_tokens get_tokens q lexbuf;
      string `Mid (Buffer.create 64) q lexbuf
    }
  | _ as c                  (* otherwise just add the character to the buffer *)
    { Buffer.add_char buf c; string state buf q lexbuf }

and istring state imin buf q = parse
  | ('\n' ' '* "''")
      {
        Lexing.new_line lexbuf;
        Buffer.add_string buf "\n";
        let indent = match imin with | None -> 0 | Some i -> i in
        Stdlib.Queue.add (token_of_istr state buf) q;
        Stdlib.Queue.add (ISTR_END indent) q
      }
  | "''"
      {
        let indent = match imin with | None -> 0 | Some i -> i in
        Stdlib.Queue.add (token_of_istr state buf) q;
        Stdlib.Queue.add (ISTR_END indent) q
      }
  | ('\n' ' '* '\n') as s
    {
      Lexing.new_line lexbuf;
      Lexing.new_line lexbuf;
      Buffer.add_string buf s;
      istring state imin buf q lexbuf
    }
  | ('\n' (' '* as ws)) as s
    {
      Lexing.new_line lexbuf;
      Buffer.add_string buf s;
      let ws_count = String.length ws in
      match imin with
      | None ->
        istring state (Some ws_count) buf q lexbuf
      | Some i ->
        istring state (Some (min i ws_count)) buf q lexbuf
    }
  | ("''$" | "'''" | "''\\t" | "''\\r" | "''\\n") as s
      { Buffer.add_string buf (unescape s); istring state imin buf q lexbuf }
  | "''\\" (_ as c)
      { Buffer.add_char buf c; istring state imin buf q lexbuf }
  | "${"
    {
      Stdlib.Queue.add (token_of_istr state buf) q;
      collect_tokens get_tokens q lexbuf;
      istring `Mid imin (Buffer.create 64) q lexbuf
    }
  | _ as c
    { Buffer.add_char buf c; istring state imin buf q lexbuf }
{

let rec next_token
    (q: token Stdlib.Queue.t)
    (s: braces list ref)
    (lexbuf: Lexing.lexbuf)
  : token =
  match (try Some (Stdlib.Queue.take q) with | Stdlib.Queue.Empty -> None) with
  | Some token ->
    token
  | None ->
    get_tokens q s lexbuf;
    next_token q s lexbuf
}