From 17a3ea880402338420699e03bcb24181e4ff3924 Mon Sep 17 00:00:00 2001 From: Rutger Broekhoff Date: Thu, 2 May 2024 20:27:40 +0200 Subject: Initial commit Based on dc4ba6a --- lib/libtmi8/src/kv1_lexer.cpp | 152 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 lib/libtmi8/src/kv1_lexer.cpp (limited to 'lib/libtmi8/src/kv1_lexer.cpp') diff --git a/lib/libtmi8/src/kv1_lexer.cpp b/lib/libtmi8/src/kv1_lexer.cpp new file mode 100644 index 0000000..028127b --- /dev/null +++ b/lib/libtmi8/src/kv1_lexer.cpp @@ -0,0 +1,152 @@ +// vim:set sw=2 ts=2 sts et: + +#include + +Kv1Lexer::Kv1Lexer(std::string_view input) + : input(input), slice(input) +{} + +// Does not eat newline character. +void Kv1Lexer::eatRestOfLine() { + size_t end = slice.size(); + for (size_t i = 0; i < slice.size(); i++) { + if (slice[i] == '\r' || slice[i] == '\n') { + end = i; + break; + } + } + slice = slice.substr(end); +} + +void Kv1Lexer::lexOptionalHeader() { + if (slice.starts_with('[')) eatRestOfLine(); +} + +void Kv1Lexer::lexOptionalComment() { + if (slice.starts_with(';')) eatRestOfLine(); +} + +inline bool Kv1Lexer::isWhitespace(int c) { + return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v'; +} + +void Kv1Lexer::readQuotedColumn() { + Kv1Token token{ .type = KV1_TOKEN_CELL }; + + if (slice.size() == 0 || slice[0] != '"') { + errors.push_back("(internal error) readQuotedColumn: slice[0] != '\"'"); + return; + } + slice = slice.substr(1); + while (true) { + size_t quote = slice.find('"'); + if (quote == std::string_view::npos) { + errors.push_back("readQuotedColumn: no matching closing quote found"); + return; + } + if (quote+1 == slice.size() || slice[quote + 1] != '"') { + token.data.append(slice.substr(0, quote)); + break; + } + token.data.append(slice.substr(0, quote + 1)); + slice = slice.substr(quote + 2); + } + + size_t end = slice.size(); + for (size_t i = 0; i < slice.size(); i++) { + if (slice[i] == '|' || slice[i] == '\r' || slice[i] == '\n') { + end = i; + break; + } + if (!isWhitespace(slice[i])) { + errors.push_back("readQuotedColumn: encountered non-whitespace character after closing quote"); + return; + } + } + if (end != std::string_view::npos) slice = slice.substr(end); + else slice = slice.substr(slice.size()); + + tokens.push_back(std::move(token)); +} + +void Kv1Lexer::readUnquotedColumn() { + size_t end = slice.size(); + size_t content_end = 0; + for (size_t i = 0; i < slice.size(); i++) { + if (slice[i] == '|' || slice[i] == '\r' || slice[i] == '\n') { + end = i; + break; + } else if (!isWhitespace(slice[i])) { + content_end = i + 1; + } + } + tokens.emplace_back(KV1_TOKEN_CELL, std::string(slice.substr(0, content_end))); + if (end != std::string_view::npos) slice = slice.substr(end); + else slice = slice.substr(slice.size()); +} + +void Kv1Lexer::lexRow() { + size_t cols = 0; + while (slice.size() > 0 && slice[0] != '\r' && slice[0] != '\n') { + if (slice[0] == '"') readQuotedColumn(); + else readUnquotedColumn(); + if (!errors.empty()) return; + cols++; + if (slice.size() != 0) { + if (slice[0] == '|') { + slice = slice.substr(1); + // A newline/eof right after pipe? That means an empty field at the end + // of the record, we also want to emit that as a token. + if (slice.size() == 0 || slice[0] == '\r' || slice[0] == '\n') { + tokens.push_back({ .type = KV1_TOKEN_CELL }); + } + } else if (slice[0] == '\r') { + if (slice.size() > 1 && slice[1] == '\n') slice = slice.substr(2); + else slice = slice.substr(1); + break; + } else if (slice[0] == '\n') { + slice = slice.substr(1); + break; + } else { + errors.push_back("lexRow: expected CR, LF or |"); + return; + } + } + } + tokens.push_back({ .type = KV1_TOKEN_ROW_END }); +} + +// Returns true when a line ending was consumed. +bool Kv1Lexer::eatWhitespace() { + for (size_t i = 0; i < slice.size(); i++) { + if (slice[i] == '\r') { + slice = slice.substr(i + 1); + if (slice.size() > 1 && slice[i + 1] == '\n') + slice = slice.substr(i + 2); + return true; + } + if (slice[i] == '\n') { + slice = slice.substr(i + 1); + return true; + } + + if (slice[i] != ' ' && slice[i] != '\f' && slice[i] != '\t' && slice[i] != '\v') { + slice = slice.substr(i); + return false; + } + } + return false; +} + +void Kv1Lexer::lex() { + lexOptionalHeader(); + eatWhitespace(); + + while (errors.empty() && !slice.empty()) { + lexOptionalComment(); + bool newline = eatWhitespace(); + if (newline) continue; + // We are now either (1) at the end of the file or (2) at the start of some column data + if (errors.empty()) lexRow(); + } +} -- cgit v1.2.3