Initial commit

Based on dc4ba6a
author: Rutger Broekhoff 2024-05-02 20:27:40 +0200
committer: Rutger Broekhoff 2024-05-02 20:27:40 +0200
commit: 17a3ea880402338420699e03bcb24181e4ff3924 (patch)
tree: da666ef91e0b60d20aa0b01529644c136fd1f4ab /lib/libtmi8/src/kv1_lexer.cpp
download: oeuf-17a3ea880402338420699e03bcb24181e4ff3924.tar.gz
oeuf-17a3ea880402338420699e03bcb24181e4ff3924.zip
1 files changed, 152 insertions, 0 deletions
diff --git a/lib/libtmi8/src/kv1_lexer.cpp b/lib/libtmi8/src/kv1_lexer.cpp
new file mode 100644
index 0000000..028127b
--- /dev/null
+++ b/lib/libtmi8/src/kv1_lexer.cpp
@@ -0,0 +1,152 @@
+// vim:set sw=2 ts=2 sts et:
+#include <tmi8/kv1_lexer.hpp>
+// Creates a lexer over `input`. Both the `input` and the `slice` members are
+// initialised from the argument; the lexing functions in this file then
+// consume `slice` from the front.
+// NOTE(review): if the members are std::string_view (declared in the header,
+// not visible here), the caller must keep the underlying buffer alive for as
+// long as the lexer is used -- confirm.
+Kv1Lexer::Kv1Lexer(std::string_view input) : input(input), slice(input) {}
+// Advances `slice` to the first CR or LF, or to the end of the input when no
+// line terminator remains. The terminator itself is not consumed; that is
+// left to the caller.
+void Kv1Lexer::eatRestOfLine() {
+  const size_t eol = slice.find_first_of("\r\n");
+  slice = slice.substr(eol == std::string_view::npos ? slice.size() : eol);
+}
+// Skips a header line: if the remaining input begins with '[', everything up
+// to (but not including) the next line terminator is discarded. Otherwise
+// nothing is consumed.
+void Kv1Lexer::lexOptionalHeader() {
+  const bool at_header = !slice.empty() && slice.front() == '[';
+  if (at_header) eatRestOfLine();
+}
+// Skips a comment line: if the remaining input begins with ';', everything up
+// to (but not including) the next line terminator is discarded. Otherwise
+// nothing is consumed.
+void Kv1Lexer::lexOptionalComment() {
+  const bool at_comment = !slice.empty() && slice.front() == ';';
+  if (at_comment) eatRestOfLine();
+}
+// Returns true for space, LF, CR, horizontal tab and vertical tab.
+// Form feed ('\f') is not classified as whitespace by this predicate.
+inline bool Kv1Lexer::isWhitespace(int c) {
+  switch (c) {
+    case ' ':
+    case '\n':
+    case '\r':
+    case '\t':
+    case '\v':
+      return true;
+    default:
+      return false;
+  }
+}
+// Reads one double-quoted field from the front of `slice` and appends it to
+// `tokens` as a KV1_TOKEN_CELL.
+//
+// Inside the quotes, a doubled quote ("") stands for a single literal quote
+// character. After the closing quote only whitespace may follow before the
+// next '|', CR, LF or the end of the input; the separator/terminator itself is
+// left in `slice` for lexRow to handle.
+//
+// On failure a message is appended to `errors` and no token is emitted.
+void Kv1Lexer::readQuotedColumn() {
+  Kv1Token token{ .type = KV1_TOKEN_CELL };
+  if (slice.empty() || slice[0] != '"') {
+    errors.push_back("(internal error) readQuotedColumn: slice[0] != '\"'");
+    return;
+  }
+  // Drop the opening quote.
+  slice = slice.substr(1);
+  while (true) {
+    const size_t quote = slice.find('"');
+    if (quote == std::string_view::npos) {
+      errors.push_back("readQuotedColumn: no matching closing quote found");
+      return;
+    }
+    const bool escaped = quote + 1 < slice.size() && slice[quote + 1] == '"';
+    if (!escaped) {
+      // Closing quote: take the remaining content and step past the quote.
+      // Without this advance the trailing scan below would start at the field
+      // content (or at the quote itself) instead of after the closing quote.
+      token.data.append(slice.substr(0, quote));
+      slice = slice.substr(quote + 1);
+      break;
+    }
+    // Escaped quote: keep the content plus one quote, skip both quote chars.
+    token.data.append(slice.substr(0, quote + 1));
+    slice = slice.substr(quote + 2);
+  }
+  // Only whitespace is allowed between the closing quote and the end of the
+  // field.
+  size_t end = slice.size();
+  for (size_t i = 0; i < slice.size(); i++) {
+    if (slice[i] == '|' || slice[i] == '\r' || slice[i] == '\n') {
+      end = i;
+      break;
+    }
+    if (!isWhitespace(slice[i])) {
+      errors.push_back("readQuotedColumn: encountered non-whitespace character after closing quote");
+      return;
+    }
+  }
+  slice = slice.substr(end);
+  tokens.push_back(std::move(token));
+}
+// Reads one unquoted field from the front of `slice` and appends it to
+// `tokens` as a KV1_TOKEN_CELL. The field runs up to the next '|', CR, LF or
+// the end of the input; that separator/terminator is left in `slice`.
+// Trailing whitespace is trimmed from the cell, leading whitespace is kept.
+void Kv1Lexer::readUnquotedColumn() {
+  size_t field_end = slice.find_first_of("|\r\n");
+  if (field_end == std::string_view::npos) field_end = slice.size();
+
+  // Walk back over trailing whitespace to find where the content stops.
+  size_t trimmed_len = field_end;
+  while (trimmed_len > 0 && isWhitespace(slice[trimmed_len - 1])) trimmed_len--;
+
+  tokens.emplace_back(KV1_TOKEN_CELL, std::string(slice.substr(0, trimmed_len)));
+  slice = slice.substr(field_end);
+}
+// Lexes one record: a sequence of '|'-separated fields, each emitted as a
+// KV1_TOKEN_CELL, followed by a single KV1_TOKEN_ROW_END.
+//
+// The row ends at CR, CRLF, LF or the end of the input. A line terminator
+// that directly follows a field is consumed here; one that follows a trailing
+// '|' is left in `slice`. If a field reader reports an error, or an
+// unexpected character follows a field, the function returns without emitting
+// KV1_TOKEN_ROW_END.
+void Kv1Lexer::lexRow() {
+  while (!slice.empty() && slice[0] != '\r' && slice[0] != '\n') {
+    if (slice[0] == '"') readQuotedColumn();
+    else readUnquotedColumn();
+    if (!errors.empty()) return;
+
+    // End of input right after a field: the row is complete.
+    if (slice.empty()) break;
+
+    if (slice[0] == '|') {
+      slice = slice.substr(1);
+      // A newline/eof right after pipe? That means an empty field at the end
+      // of the record, we also want to emit that as a token.
+      if (slice.empty() || slice[0] == '\r' || slice[0] == '\n') {
+        tokens.push_back({ .type = KV1_TOKEN_CELL });
+      }
+      continue;
+    }
+    if (slice[0] == '\r') {
+      // Treat CRLF as one line ending.
+      if (slice.size() > 1 && slice[1] == '\n') slice = slice.substr(2);
+      else slice = slice.substr(1);
+      break;
+    }
+    if (slice[0] == '\n') {
+      slice = slice.substr(1);
+      break;
+    }
+    errors.push_back("lexRow: expected CR, LF or |");
+    return;
+  }
+  tokens.push_back({ .type = KV1_TOKEN_ROW_END });
+}
+// Skips blanks (space, form feed, tab, vertical tab) at the front of `slice`.
+//
+// Returns true when a line ending was consumed: the blanks plus exactly one
+// CR, CRLF or LF are removed. Returns false when a non-blank character is
+// reached first (`slice` then starts at that character) or when the input
+// ends without a line ending; in the latter case `slice` is left unchanged.
+bool Kv1Lexer::eatWhitespace() {
+  for (size_t i = 0; i < slice.size(); i++) {
+    if (slice[i] == '\r') {
+      slice = slice.substr(i + 1);
+      // `slice` now starts right after the CR, so a following LF is at index
+      // 0 (not at the pre-advance index); swallow it so CRLF counts as one
+      // line ending.
+      if (!slice.empty() && slice[0] == '\n')
+        slice = slice.substr(1);
+      return true;
+    }
+    if (slice[i] == '\n') {
+      slice = slice.substr(i + 1);
+      return true;
+    }
+
+    if (slice[i] != ' ' && slice[i] != '\f' && slice[i] != '\t' && slice[i] != '\v') {
+      slice = slice.substr(i);
+      return false;
+    }
+  }
+  return false;
+}
+// Lexes the whole input into `tokens`.
+//
+// An optional '[' header line is skipped first. Then, until the input is
+// exhausted or an error has been recorded in `errors`, each iteration skips
+// an optional ';' comment, skips blanks, and either moves on to the next line
+// (when a line ending was consumed) or lexes one row.
+void Kv1Lexer::lex() {
+  lexOptionalHeader();
+  eatWhitespace();
+  while (errors.empty() && !slice.empty()) {
+    lexOptionalComment();
+    if (eatWhitespace()) continue;
+    // A comment that ends the input without a trailing newline leaves nothing
+    // to lex; calling lexRow here would emit a row end with no cells.
+    if (slice.empty()) break;
+    // We are now at the start of some column data.
+    lexRow();
+  }
+}
author	Rutger Broekhoff	2024-05-02 20:27:40 +0200
committer	Rutger Broekhoff	2024-05-02 20:27:40 +0200
commit	17a3ea880402338420699e03bcb24181e4ff3924 (patch)
tree	da666ef91e0b60d20aa0b01529644c136fd1f4ab /lib/libtmi8/src/kv1_lexer.cpp
download	oeuf-17a3ea880402338420699e03bcb24181e4ff3924.tar.gz oeuf-17a3ea880402338420699e03bcb24181e4ff3924.zip

diff --git a/lib/libtmi8/src/kv1_lexer.cpp b/lib/libtmi8/src/kv1_lexer.cpp new file mode 100644 index 0000000..028127b --- /dev/null +++ b/lib/libtmi8/src/kv1_lexer.cpp
@@ -0,0 +1,152 @@
	1	// vim:set sw=2 ts=2 sts et:
	2
	3	#include <tmi8/kv1_lexer.hpp>
	4
	5	Kv1Lexer::Kv1Lexer(std::string_view input)
	6	: input(input), slice(input)
	7	{}
	8
	9	// Does not eat newline character.
	10	void Kv1Lexer::eatRestOfLine() {
	11	size_t end = slice.size();
	12	for (size_t i = 0; i < slice.size(); i++) {
	13	if (slice[i] == '\r' \|\| slice[i] == '\n') {
	14	end = i;
	15	break;
	16	}
	17	}
	18	slice = slice.substr(end);
	19	}
	20
	21	void Kv1Lexer::lexOptionalHeader() {
	22	if (slice.starts_with('[')) eatRestOfLine();
	23	}
	24
	25	void Kv1Lexer::lexOptionalComment() {
	26	if (slice.starts_with(';')) eatRestOfLine();
	27	}
	28
	29	inline bool Kv1Lexer::isWhitespace(int c) {
	30	return c == ' ' \|\| c == '\n' \|\| c == '\r' \|\| c == '\t' \|\| c == '\v';
	31	}
	32
	33	void Kv1Lexer::readQuotedColumn() {
	34	Kv1Token token{ .type = KV1_TOKEN_CELL };
	35
	36	if (slice.size() == 0 \|\| slice[0] != '"') {
	37	errors.push_back("(internal error) readQuotedColumn: slice[0] != '\"'");
	38	return;
	39	}
	40	slice = slice.substr(1);
	41	while (true) {
	42	size_t quote = slice.find('"');
	43	if (quote == std::string_view::npos) {
	44	errors.push_back("readQuotedColumn: no matching closing quote found");
	45	return;
	46	}
	47	if (quote+1 == slice.size() \|\| slice[quote + 1] != '"') {
	48	token.data.append(slice.substr(0, quote));
	49	break;
	50	}
	51	token.data.append(slice.substr(0, quote + 1));
	52	slice = slice.substr(quote + 2);
	53	}
	54
	55	size_t end = slice.size();
	56	for (size_t i = 0; i < slice.size(); i++) {
	57	if (slice[i] == '\|' \|\| slice[i] == '\r' \|\| slice[i] == '\n') {
	58	end = i;
	59	break;
	60	}
	61	if (!isWhitespace(slice[i])) {
	62	errors.push_back("readQuotedColumn: encountered non-whitespace character after closing quote");
	63	return;
	64	}
	65	}
	66	if (end != std::string_view::npos) slice = slice.substr(end);
	67	else slice = slice.substr(slice.size());
	68
	69	tokens.push_back(std::move(token));
	70	}
	71
	72	void Kv1Lexer::readUnquotedColumn() {
	73	size_t end = slice.size();
	74	size_t content_end = 0;
	75	for (size_t i = 0; i < slice.size(); i++) {
	76	if (slice[i] == '\|' \|\| slice[i] == '\r' \|\| slice[i] == '\n') {
	77	end = i;
	78	break;
	79	} else if (!isWhitespace(slice[i])) {
	80	content_end = i + 1;
	81	}
	82	}
	83	tokens.emplace_back(KV1_TOKEN_CELL, std::string(slice.substr(0, content_end)));
	84	if (end != std::string_view::npos) slice = slice.substr(end);
	85	else slice = slice.substr(slice.size());
	86	}
	87
	88	void Kv1Lexer::lexRow() {
	89	size_t cols = 0;
	90	while (slice.size() > 0 && slice[0] != '\r' && slice[0] != '\n') {
	91	if (slice[0] == '"') readQuotedColumn();
	92	else readUnquotedColumn();
	93	if (!errors.empty()) return;
	94	cols++;
	95	if (slice.size() != 0) {
	96	if (slice[0] == '\|') {
	97	slice = slice.substr(1);
	98	// A newline/eof right after pipe? That means an empty field at the end
	99	// of the record, we also want to emit that as a token.
	100	if (slice.size() == 0 \|\| slice[0] == '\r' \|\| slice[0] == '\n') {
	101	tokens.push_back({ .type = KV1_TOKEN_CELL });
	102	}
	103	} else if (slice[0] == '\r') {
	104	if (slice.size() > 1 && slice[1] == '\n') slice = slice.substr(2);
	105	else slice = slice.substr(1);
	106	break;
	107	} else if (slice[0] == '\n') {
	108	slice = slice.substr(1);
	109	break;
	110	} else {
	111	errors.push_back("lexRow: expected CR, LF or \|");
	112	return;
	113	}
	114	}
	115	}
	116	tokens.push_back({ .type = KV1_TOKEN_ROW_END });
	117	}
	118
	119	// Returns true when a line ending was consumed.
	120	bool Kv1Lexer::eatWhitespace() {
	121	for (size_t i = 0; i < slice.size(); i++) {
	122	if (slice[i] == '\r') {
	123	slice = slice.substr(i + 1);
	124	if (slice.size() > 1 && slice[i + 1] == '\n')
	125	slice = slice.substr(i + 2);
	126	return true;
	127	}
	128	if (slice[i] == '\n') {
	129	slice = slice.substr(i + 1);
	130	return true;
	131	}
	132
	133	if (slice[i] != ' ' && slice[i] != '\f' && slice[i] != '\t' && slice[i] != '\v') {
	134	slice = slice.substr(i);
	135	return false;
	136	}
	137	}
	138	return false;
	139	}
	140
	141	void Kv1Lexer::lex() {
	142	lexOptionalHeader();
	143	eatWhitespace();
	144
	145	while (errors.empty() && !slice.empty()) {
	146	lexOptionalComment();
	147	bool newline = eatWhitespace();
	148	if (newline) continue;
	149	// We are now either (1) at the end of the file or (2) at the start of some column data
	150	if (errors.empty()) lexRow();
	151	}
	152	}