src/querykv1/grammar.ebnf


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

/* Fields may contain stray LFs, i.e. LFs that do not directly follow a CR;
 * lines are separated by NEWLINE (CR, optionally followed by LF) only.
 * Parts of this grammar are inspired by the somewhat similar IETF RFC 4180.
 *
 * A document is an optional header line followed by one or more lines
 * separated by NEWLINE and an optional trailing NEWLINE, or a lone header.
 */
document   ::= (header NEWLINE)? line (NEWLINE line)* NEWLINE?
             | header
line       ::= comment | record | empty-line

/* A header starts with '[' and runs up to (not including) the next CR. */
header     ::= OPENBRACK NOTCR*
/* A comment starts with ';' and runs up to (not including) the next CR. */
comment    ::= SEMICOLON NOTCR*

/* An empty line consists of whitespace only, possibly none at all. */
empty-line ::= WHITESPACE*

/* A record is one or more fields separated by '|'. */
record     ::= field (PIPE field)*
/* A field is its data, optionally surrounded by whitespace. */
field      ::= WHITESPACE* field-data WHITESPACE*
/* Field data is either enclosed in double quotes (escaped) or bare
 * (unescaped).
 */
field-data ::= DQUOTE escaped DQUOTE | unescaped

/* Content of a quoted field: text, whitespace, newlines and pipes may appear
 * as-is; a double quote may only appear doubled (DQUOTE DQUOTE), presumably
 * denoting one literal double quote as in RFC 4180.
 */
escaped    ::= (TEXTDATA | WHITESPACE | NEWLINE | PIPE | DQUOTE DQUOTE)*
/* Unescaped fields are also allowed to contain double quotes (anywhere but
 * at the very start); they are just not interpreted in any special way.
 * An unescaped field may be empty, and it neither starts nor ends with
 * whitespace.
 */
unescaped  ::= (TEXTDATA (WHITESPACE* (TEXTDATA | DQUOTE))*)?

/* Named single-codepoint terminals used by the rules in this grammar. */
HTAB       ::= #x09  /* <horizontal tab,  "\t"> */
LF         ::= #x0A  /* <line feed,       "\n"> */
VTAB       ::= #x0B  /* <vertical tab,    "\v"> */
FF         ::= #x0C  /* <form feed,       "\f"> */
CR         ::= #x0D  /* <carriage return, "\r"> */
SPACE      ::= #x20  /* <space,           " ">  */
DQUOTE     ::= #x22  /* " */
SEMICOLON  ::= #x3B  /* ; */
OPENBRACK  ::= #x5B  /* [ */
PIPE       ::= #x7C  /* | */

/* All codepoints, except CR, LF, SPACE, FF, HTAB, VTAB, PIPE, DQUOTE and
 * OPENBRACK (#x5B is skipped between #x5A and #x5C).
 * NOTE(review): quoted fields are built from TEXTDATA too, so '[' cannot
 * appear anywhere in a field, not even inside double quotes -- confirm that
 * this is intended.
 * Semicolon is included, as comments are only defined as 'lines starting with
 * a semicolon'. So it should be fine if a semicolon is part of a field, the
 * rest of the line would not be interpreted as a comment in that case.
 */
TEXTDATA   ::= [#x00-#x08#x0E-#x1F#x21#x23-#x5A#x5C-#x7B#x7D-#x10FFFF]

/* Whitespace, listed in codepoint order. LF counts as whitespace rather than
 * as a line break: TMI8/KV1 defines a newline as 'CR optionally followed by
 * LF', so a bare LF is not a newline.
 */
WHITESPACE ::= HTAB | LF | VTAB | FF | SPACE

/* All codepoints excluding CR. LF (#x0A) is included, as it falls inside
 * the #x00-#x0C range.
 */
NOTCR      ::= [#x00-#x0C#x0E-#x10FFFF]
/* A newline is a CR, optionally followed by an LF. */
NEWLINE    ::= CR LF?