diff options
| author | Rutger Broekhoff | 2024-05-02 20:27:40 +0200 | 
|---|---|---|
| committer | Rutger Broekhoff | 2024-05-02 20:27:40 +0200 | 
| commit | 17a3ea880402338420699e03bcb24181e4ff3924 (patch) | |
| tree | da666ef91e0b60d20aa0b01529644c136fd1f4ab /lib/libtmi8/src/kv1_lexer.cpp | |
| download | oeuf-17a3ea880402338420699e03bcb24181e4ff3924.tar.gz oeuf-17a3ea880402338420699e03bcb24181e4ff3924.zip  | |
Initial commit
Based on dc4ba6a
Diffstat (limited to 'lib/libtmi8/src/kv1_lexer.cpp')
| -rw-r--r-- | lib/libtmi8/src/kv1_lexer.cpp | 152 | 
1 files changed, 152 insertions, 0 deletions
diff --git a/lib/libtmi8/src/kv1_lexer.cpp b/lib/libtmi8/src/kv1_lexer.cpp new file mode 100644 index 0000000..028127b --- /dev/null +++ b/lib/libtmi8/src/kv1_lexer.cpp  | |||
| @@ -0,0 +1,152 @@ | |||
| 1 | // vim:set sw=2 ts=2 sts et: | ||
| 2 | |||
| 3 | #include <tmi8/kv1_lexer.hpp> | ||
| 4 | |||
| 5 | Kv1Lexer::Kv1Lexer(std::string_view input) | ||
| 6 | : input(input), slice(input) | ||
| 7 | {} | ||
| 8 | |||
| 9 | // Does not eat newline character. | ||
| 10 | void Kv1Lexer::eatRestOfLine() { | ||
| 11 | size_t end = slice.size(); | ||
| 12 | for (size_t i = 0; i < slice.size(); i++) { | ||
| 13 | if (slice[i] == '\r' || slice[i] == '\n') { | ||
| 14 | end = i; | ||
| 15 | break; | ||
| 16 | } | ||
| 17 | } | ||
| 18 | slice = slice.substr(end); | ||
| 19 | } | ||
| 20 | |||
| 21 | void Kv1Lexer::lexOptionalHeader() { | ||
| 22 | if (slice.starts_with('[')) eatRestOfLine(); | ||
| 23 | } | ||
| 24 | |||
| 25 | void Kv1Lexer::lexOptionalComment() { | ||
| 26 | if (slice.starts_with(';')) eatRestOfLine(); | ||
| 27 | } | ||
| 28 | |||
| 29 | inline bool Kv1Lexer::isWhitespace(int c) { | ||
| 30 | return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v'; | ||
| 31 | } | ||
| 32 | |||
| 33 | void Kv1Lexer::readQuotedColumn() { | ||
| 34 | Kv1Token token{ .type = KV1_TOKEN_CELL }; | ||
| 35 | |||
| 36 | if (slice.size() == 0 || slice[0] != '"') { | ||
| 37 | errors.push_back("(internal error) readQuotedColumn: slice[0] != '\"'"); | ||
| 38 | return; | ||
| 39 | } | ||
| 40 | slice = slice.substr(1); | ||
| 41 | while (true) { | ||
| 42 | size_t quote = slice.find('"'); | ||
| 43 | if (quote == std::string_view::npos) { | ||
| 44 | errors.push_back("readQuotedColumn: no matching closing quote found"); | ||
| 45 | return; | ||
| 46 | } | ||
| 47 | if (quote+1 == slice.size() || slice[quote + 1] != '"') { | ||
| 48 | token.data.append(slice.substr(0, quote)); | ||
| 49 | break; | ||
| 50 | } | ||
| 51 | token.data.append(slice.substr(0, quote + 1)); | ||
| 52 | slice = slice.substr(quote + 2); | ||
| 53 | } | ||
| 54 | |||
| 55 | size_t end = slice.size(); | ||
| 56 | for (size_t i = 0; i < slice.size(); i++) { | ||
| 57 | if (slice[i] == '|' || slice[i] == '\r' || slice[i] == '\n') { | ||
| 58 | end = i; | ||
| 59 | break; | ||
| 60 | } | ||
| 61 | if (!isWhitespace(slice[i])) { | ||
| 62 | errors.push_back("readQuotedColumn: encountered non-whitespace character after closing quote"); | ||
| 63 | return; | ||
| 64 | } | ||
| 65 | } | ||
| 66 | if (end != std::string_view::npos) slice = slice.substr(end); | ||
| 67 | else slice = slice.substr(slice.size()); | ||
| 68 | |||
| 69 | tokens.push_back(std::move(token)); | ||
| 70 | } | ||
| 71 | |||
| 72 | void Kv1Lexer::readUnquotedColumn() { | ||
| 73 | size_t end = slice.size(); | ||
| 74 | size_t content_end = 0; | ||
| 75 | for (size_t i = 0; i < slice.size(); i++) { | ||
| 76 | if (slice[i] == '|' || slice[i] == '\r' || slice[i] == '\n') { | ||
| 77 | end = i; | ||
| 78 | break; | ||
| 79 | } else if (!isWhitespace(slice[i])) { | ||
| 80 | content_end = i + 1; | ||
| 81 | } | ||
| 82 | } | ||
| 83 | tokens.emplace_back(KV1_TOKEN_CELL, std::string(slice.substr(0, content_end))); | ||
| 84 | if (end != std::string_view::npos) slice = slice.substr(end); | ||
| 85 | else slice = slice.substr(slice.size()); | ||
| 86 | } | ||
| 87 | |||
| 88 | void Kv1Lexer::lexRow() { | ||
| 89 | size_t cols = 0; | ||
| 90 | while (slice.size() > 0 && slice[0] != '\r' && slice[0] != '\n') { | ||
| 91 | if (slice[0] == '"') readQuotedColumn(); | ||
| 92 | else readUnquotedColumn(); | ||
| 93 | if (!errors.empty()) return; | ||
| 94 | cols++; | ||
| 95 | if (slice.size() != 0) { | ||
| 96 | if (slice[0] == '|') { | ||
| 97 | slice = slice.substr(1); | ||
| 98 | // A newline/eof right after pipe? That means an empty field at the end | ||
| 99 | // of the record, we also want to emit that as a token. | ||
| 100 | if (slice.size() == 0 || slice[0] == '\r' || slice[0] == '\n') { | ||
| 101 | tokens.push_back({ .type = KV1_TOKEN_CELL }); | ||
| 102 | } | ||
| 103 | } else if (slice[0] == '\r') { | ||
| 104 | if (slice.size() > 1 && slice[1] == '\n') slice = slice.substr(2); | ||
| 105 | else slice = slice.substr(1); | ||
| 106 | break; | ||
| 107 | } else if (slice[0] == '\n') { | ||
| 108 | slice = slice.substr(1); | ||
| 109 | break; | ||
| 110 | } else { | ||
| 111 | errors.push_back("lexRow: expected CR, LF or |"); | ||
| 112 | return; | ||
| 113 | } | ||
| 114 | } | ||
| 115 | } | ||
| 116 | tokens.push_back({ .type = KV1_TOKEN_ROW_END }); | ||
| 117 | } | ||
| 118 | |||
| 119 | // Returns true when a line ending was consumed. | ||
| 120 | bool Kv1Lexer::eatWhitespace() { | ||
| 121 | for (size_t i = 0; i < slice.size(); i++) { | ||
| 122 | if (slice[i] == '\r') { | ||
| 123 | slice = slice.substr(i + 1); | ||
| 124 | if (slice.size() > 1 && slice[i + 1] == '\n') | ||
| 125 | slice = slice.substr(i + 2); | ||
| 126 | return true; | ||
| 127 | } | ||
| 128 | if (slice[i] == '\n') { | ||
| 129 | slice = slice.substr(i + 1); | ||
| 130 | return true; | ||
| 131 | } | ||
| 132 | |||
| 133 | if (slice[i] != ' ' && slice[i] != '\f' && slice[i] != '\t' && slice[i] != '\v') { | ||
| 134 | slice = slice.substr(i); | ||
| 135 | return false; | ||
| 136 | } | ||
| 137 | } | ||
| 138 | return false; | ||
| 139 | } | ||
| 140 | |||
| 141 | void Kv1Lexer::lex() { | ||
| 142 | lexOptionalHeader(); | ||
| 143 | eatWhitespace(); | ||
| 144 | |||
| 145 | while (errors.empty() && !slice.empty()) { | ||
| 146 | lexOptionalComment(); | ||
| 147 | bool newline = eatWhitespace(); | ||
| 148 | if (newline) continue; | ||
| 149 | // We are now either (1) at the end of the file or (2) at the start of some column data | ||
| 150 | if (errors.empty()) lexRow(); | ||
| 151 | } | ||
| 152 | } | ||