diff options
author | Rutger Broekhoff | 2024-05-02 20:27:40 +0200 |
---|---|---|
committer | Rutger Broekhoff | 2024-05-02 20:27:40 +0200 |
commit | 17a3ea880402338420699e03bcb24181e4ff3924 (patch) | |
tree | da666ef91e0b60d20aa0b01529644c136fd1f4ab /lib/libtmi8/src/kv1_lexer.cpp | |
download | oeuf-17a3ea880402338420699e03bcb24181e4ff3924.tar.gz oeuf-17a3ea880402338420699e03bcb24181e4ff3924.zip |
Initial commit
Based on dc4ba6a
Diffstat (limited to 'lib/libtmi8/src/kv1_lexer.cpp')
-rw-r--r-- | lib/libtmi8/src/kv1_lexer.cpp | 152 |
1 files changed, 152 insertions, 0 deletions
diff --git a/lib/libtmi8/src/kv1_lexer.cpp b/lib/libtmi8/src/kv1_lexer.cpp new file mode 100644 index 0000000..028127b --- /dev/null +++ b/lib/libtmi8/src/kv1_lexer.cpp | |||
@@ -0,0 +1,152 @@ | |||
1 | // vim:set sw=2 ts=2 sts et: | ||
2 | |||
3 | #include <tmi8/kv1_lexer.hpp> | ||
4 | |||
5 | Kv1Lexer::Kv1Lexer(std::string_view input) | ||
6 | : input(input), slice(input) | ||
7 | {} | ||
8 | |||
9 | // Does not eat newline character. | ||
10 | void Kv1Lexer::eatRestOfLine() { | ||
11 | size_t end = slice.size(); | ||
12 | for (size_t i = 0; i < slice.size(); i++) { | ||
13 | if (slice[i] == '\r' || slice[i] == '\n') { | ||
14 | end = i; | ||
15 | break; | ||
16 | } | ||
17 | } | ||
18 | slice = slice.substr(end); | ||
19 | } | ||
20 | |||
21 | void Kv1Lexer::lexOptionalHeader() { | ||
22 | if (slice.starts_with('[')) eatRestOfLine(); | ||
23 | } | ||
24 | |||
25 | void Kv1Lexer::lexOptionalComment() { | ||
26 | if (slice.starts_with(';')) eatRestOfLine(); | ||
27 | } | ||
28 | |||
29 | inline bool Kv1Lexer::isWhitespace(int c) { | ||
30 | return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v'; | ||
31 | } | ||
32 | |||
33 | void Kv1Lexer::readQuotedColumn() { | ||
34 | Kv1Token token{ .type = KV1_TOKEN_CELL }; | ||
35 | |||
36 | if (slice.size() == 0 || slice[0] != '"') { | ||
37 | errors.push_back("(internal error) readQuotedColumn: slice[0] != '\"'"); | ||
38 | return; | ||
39 | } | ||
40 | slice = slice.substr(1); | ||
41 | while (true) { | ||
42 | size_t quote = slice.find('"'); | ||
43 | if (quote == std::string_view::npos) { | ||
44 | errors.push_back("readQuotedColumn: no matching closing quote found"); | ||
45 | return; | ||
46 | } | ||
47 | if (quote+1 == slice.size() || slice[quote + 1] != '"') { | ||
48 | token.data.append(slice.substr(0, quote)); | ||
49 | break; | ||
50 | } | ||
51 | token.data.append(slice.substr(0, quote + 1)); | ||
52 | slice = slice.substr(quote + 2); | ||
53 | } | ||
54 | |||
55 | size_t end = slice.size(); | ||
56 | for (size_t i = 0; i < slice.size(); i++) { | ||
57 | if (slice[i] == '|' || slice[i] == '\r' || slice[i] == '\n') { | ||
58 | end = i; | ||
59 | break; | ||
60 | } | ||
61 | if (!isWhitespace(slice[i])) { | ||
62 | errors.push_back("readQuotedColumn: encountered non-whitespace character after closing quote"); | ||
63 | return; | ||
64 | } | ||
65 | } | ||
66 | if (end != std::string_view::npos) slice = slice.substr(end); | ||
67 | else slice = slice.substr(slice.size()); | ||
68 | |||
69 | tokens.push_back(std::move(token)); | ||
70 | } | ||
71 | |||
72 | void Kv1Lexer::readUnquotedColumn() { | ||
73 | size_t end = slice.size(); | ||
74 | size_t content_end = 0; | ||
75 | for (size_t i = 0; i < slice.size(); i++) { | ||
76 | if (slice[i] == '|' || slice[i] == '\r' || slice[i] == '\n') { | ||
77 | end = i; | ||
78 | break; | ||
79 | } else if (!isWhitespace(slice[i])) { | ||
80 | content_end = i + 1; | ||
81 | } | ||
82 | } | ||
83 | tokens.emplace_back(KV1_TOKEN_CELL, std::string(slice.substr(0, content_end))); | ||
84 | if (end != std::string_view::npos) slice = slice.substr(end); | ||
85 | else slice = slice.substr(slice.size()); | ||
86 | } | ||
87 | |||
88 | void Kv1Lexer::lexRow() { | ||
89 | size_t cols = 0; | ||
90 | while (slice.size() > 0 && slice[0] != '\r' && slice[0] != '\n') { | ||
91 | if (slice[0] == '"') readQuotedColumn(); | ||
92 | else readUnquotedColumn(); | ||
93 | if (!errors.empty()) return; | ||
94 | cols++; | ||
95 | if (slice.size() != 0) { | ||
96 | if (slice[0] == '|') { | ||
97 | slice = slice.substr(1); | ||
98 | // A newline/eof right after pipe? That means an empty field at the end | ||
99 | // of the record, we also want to emit that as a token. | ||
100 | if (slice.size() == 0 || slice[0] == '\r' || slice[0] == '\n') { | ||
101 | tokens.push_back({ .type = KV1_TOKEN_CELL }); | ||
102 | } | ||
103 | } else if (slice[0] == '\r') { | ||
104 | if (slice.size() > 1 && slice[1] == '\n') slice = slice.substr(2); | ||
105 | else slice = slice.substr(1); | ||
106 | break; | ||
107 | } else if (slice[0] == '\n') { | ||
108 | slice = slice.substr(1); | ||
109 | break; | ||
110 | } else { | ||
111 | errors.push_back("lexRow: expected CR, LF or |"); | ||
112 | return; | ||
113 | } | ||
114 | } | ||
115 | } | ||
116 | tokens.push_back({ .type = KV1_TOKEN_ROW_END }); | ||
117 | } | ||
118 | |||
119 | // Returns true when a line ending was consumed. | ||
120 | bool Kv1Lexer::eatWhitespace() { | ||
121 | for (size_t i = 0; i < slice.size(); i++) { | ||
122 | if (slice[i] == '\r') { | ||
123 | slice = slice.substr(i + 1); | ||
124 | if (slice.size() > 1 && slice[i + 1] == '\n') | ||
125 | slice = slice.substr(i + 2); | ||
126 | return true; | ||
127 | } | ||
128 | if (slice[i] == '\n') { | ||
129 | slice = slice.substr(i + 1); | ||
130 | return true; | ||
131 | } | ||
132 | |||
133 | if (slice[i] != ' ' && slice[i] != '\f' && slice[i] != '\t' && slice[i] != '\v') { | ||
134 | slice = slice.substr(i); | ||
135 | return false; | ||
136 | } | ||
137 | } | ||
138 | return false; | ||
139 | } | ||
140 | |||
141 | void Kv1Lexer::lex() { | ||
142 | lexOptionalHeader(); | ||
143 | eatWhitespace(); | ||
144 | |||
145 | while (errors.empty() && !slice.empty()) { | ||
146 | lexOptionalComment(); | ||
147 | bool newline = eatWhitespace(); | ||
148 | if (newline) continue; | ||
149 | // We are now either (1) at the end of the file or (2) at the start of some column data | ||
150 | if (errors.empty()) lexRow(); | ||
151 | } | ||
152 | } | ||