From 17a3ea880402338420699e03bcb24181e4ff3924 Mon Sep 17 00:00:00 2001 From: Rutger Broekhoff Date: Thu, 2 May 2024 20:27:40 +0200 Subject: Initial commit Based on dc4ba6a --- src/querykv1/grammar.ebnf | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 src/querykv1/grammar.ebnf (limited to 'src/querykv1/grammar.ebnf') diff --git a/src/querykv1/grammar.ebnf b/src/querykv1/grammar.ebnf new file mode 100644 index 0000000..94f8cde --- /dev/null +++ b/src/querykv1/grammar.ebnf @@ -0,0 +1,47 @@ +/* This grammar does allow fields to contain stray LFs, i.e. LFs that do not + * directly follow a CR. I took the liberty of taking some inspiration from the + * somewhat similar IETF RFC 4180. + */ +document ::= (header NEWLINE)? (comment | record | empty-line) (NEWLINE (comment | record | empty-line))* NEWLINE? | header + +header ::= OPENBRACK NOTCR* +comment ::= SEMICOLON NOTCR* + +empty-line ::= WHITESPACE* + +record ::= field (PIPE field)* +field ::= WHITESPACE* field-data WHITESPACE* +field-data ::= DQUOTE escaped DQUOTE | unescaped + +/* Unescaped fields are also allowed to contain double quotes (though not as + * their first character); they are not interpreted in any special way. + */ +escaped ::= (TEXTDATA | WHITESPACE | NEWLINE | PIPE | DQUOTE DQUOTE)* +unescaped ::= (TEXTDATA (WHITESPACE* (TEXTDATA | DQUOTE))*)? + +HTAB ::= #x09 /* */ +LF ::= #x0A /* */ +VTAB ::= #x0B /* */ +FF ::= #x0C /*
*/ +CR ::= #x0D /* */ +SPACE ::= #x20 /* */ +DQUOTE ::= #x22 /* " */ +SEMICOLON ::= #x3B /* ; */ +OPENBRACK ::= #x5B /* [ */ +PIPE ::= #x7C /* | */ + +/* All codepoints, except CR, LF, SPACE, FF, HTAB, VTAB, PIPE, DQUOTE. NOTE(review): the range below also skips #x5B (OPENBRACK), which this list does not mention -- confirm whether that is intended. + * Semicolon is included, as comments are only defined as 'lines starting with + * a semicolon'. So it should be fine if a semicolon is part of a field, the + * rest of the line would not be interpreted as a comment in that case. + */ +TEXTDATA ::= [#x00-#x08#x0E-#x1F#x21#x23-#x5A#x5C-#x7B#x7D-#x10FFFF] + +/* Including LF here as TMI8/KV1 does not consider it a newline, + * as newlines are defined as 'CR optionally followed by LF' + */ +WHITESPACE ::= SPACE | LF | FF | HTAB | VTAB + +/* All codepoints excluding CR (LF, #x0A, is still part of this range) */ +NOTCR ::= [#x00-#x0C#x0E-#x10FFFF] +NEWLINE ::= CR LF? -- cgit v1.2.3