aboutsummaryrefslogtreecommitdiffstats
path: root/lib/libtmi8/src/kv1_lexer.cpp
blob: 9b0e3f8da98568a0d64bc3562c7e129e9c82b756 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// vim:set sw=2 ts=2 sts et:
//
// Copyright 2024 Rutger Broekhoff. Licensed under the EUPL.

#include <tmi8/kv1_lexer.hpp>

// Creates a lexer over `input`. Both the `input` member and the working
// `slice` member start out covering the entire text.
// NOTE(review): the member types are declared in the header, not here; if
// they are std::string_view, the caller's buffer must outlive the lexer --
// confirm.
Kv1Lexer::Kv1Lexer(std::string_view input)
  : input(input), slice(input)
{}

// Advances `slice` up to the first CR or LF, or to the end of the input when
// there is none. The line terminator itself is left in place.
void Kv1Lexer::eatRestOfLine() {
  const size_t eol = slice.find_first_of("\r\n");
  const bool found = eol != std::string_view::npos;
  slice = slice.substr(found ? eol : slice.size());
}

// Skips a header line, i.e. a line whose first character is '['. Nothing
// happens when the input does not start with '['. The line terminator is not
// consumed.
void Kv1Lexer::lexOptionalHeader() {
  const bool at_header = !slice.empty() && slice.front() == '[';
  if (at_header) {
    eatRestOfLine();
  }
}

// Skips a comment line, i.e. a line whose first character is ';'. Nothing
// happens when the input does not start with ';'. The line terminator is not
// consumed.
void Kv1Lexer::lexOptionalComment() {
  const bool at_comment = !slice.empty() && slice.front() == ';';
  if (at_comment) {
    eatRestOfLine();
  }
}

// Reports whether `c` is a space, tab, vertical tab, CR or LF. Form feed is
// not treated as whitespace here.
inline bool Kv1Lexer::isWhitespace(int c) {
  switch (c) {
    case ' ':
    case '\n':
    case '\r':
    case '\t':
    case '\v':
      return true;
    default:
      return false;
  }
}

// Reads a double-quoted cell that starts at slice[0] and appends it to
// `tokens` as a KV1_TOKEN_CELL.
//
// Inside the quotes, a doubled quote ("") stands for one literal quote
// character. After the closing quote only whitespace may follow until the
// next '|', CR, LF or the end of the input; `slice` is left positioned at
// that delimiter, which is not consumed.
//
// On failure (no opening quote, no closing quote, or stray characters after
// the closing quote) a message is appended to `errors` and no token is
// emitted.
void Kv1Lexer::readQuotedColumn() {
  Kv1Token token{ .type = KV1_TOKEN_CELL };

  if (slice.empty() || slice[0] != '"') {
    errors.push_back("(internal error) readQuotedColumn: slice[0] != '\"'");
    return;
  }
  slice = slice.substr(1);
  while (true) {
    const size_t quote = slice.find('"');
    if (quote == std::string_view::npos) {
      errors.push_back("readQuotedColumn: no matching closing quote found");
      return;
    }
    const bool escaped = quote + 1 < slice.size() && slice[quote + 1] == '"';
    if (!escaped) {
      token.data.append(slice.substr(0, quote));
      // Step past the closing quote. Without this, the check below would
      // rescan the cell's own content (and the quote) and reject the cell.
      slice = slice.substr(quote + 1);
      break;
    }
    // Keep one of the two quotes as literal data and skip the other.
    token.data.append(slice.substr(0, quote + 1));
    slice = slice.substr(quote + 2);
  }

  // Only whitespace is allowed between the closing quote and the delimiter.
  size_t end = slice.size();
  for (size_t i = 0; i < slice.size(); i++) {
    if (slice[i] == '|' || slice[i] == '\r' || slice[i] == '\n') {
      end = i;
      break;
    }
    if (!isWhitespace(slice[i])) {
      errors.push_back("readQuotedColumn: encountered non-whitespace character after closing quote");
      return;
    }
  }
  slice = slice.substr(end);

  tokens.push_back(std::move(token));
}

// Reads an unquoted cell from the front of `slice` and appends it to `tokens`
// as a KV1_TOKEN_CELL. The cell runs up to the next '|', CR, LF or the end of
// the input; trailing whitespace is dropped from the stored text, leading
// whitespace is kept. `slice` is left positioned at the delimiter, which is
// not consumed.
void Kv1Lexer::readUnquotedColumn() {
  size_t delimiter = 0;    // index of the delimiter, or slice.size() if none
  size_t trimmed_len = 0;  // one past the last non-whitespace character seen
  while (delimiter < slice.size()) {
    const char c = slice[delimiter];
    if (c == '|' || c == '\r' || c == '\n') break;
    if (!isWhitespace(c)) trimmed_len = delimiter + 1;
    delimiter++;
  }
  tokens.emplace_back(KV1_TOKEN_CELL, std::string(slice.substr(0, trimmed_len)));
  slice = slice.substr(delimiter);
}

void Kv1Lexer::lexRow() {
  size_t cols = 0;
  while (slice.size() > 0 && slice[0] != '\r' && slice[0] != '\n') {
    if (slice[0] == '"') readQuotedColumn();
    else readUnquotedColumn();
    if (!errors.empty()) return;
    cols++;
    if (slice.size() != 0) {
      if (slice[0] == '|') {
        slice = slice.substr(1);
        // A newline/eof right after pipe? That means an empty field at the end
        // of the record, we also want to emit that as a token.
        if (slice.size() == 0 || slice[0] == '\r' || slice[0] == '\n') {
          tokens.push_back({ .type = KV1_TOKEN_CELL });
        }
      } else if (slice[0] == '\r') {
        if (slice.size() > 1 && slice[1] == '\n') slice = slice.substr(2);
        else slice = slice.substr(1);
        break;
      } else if (slice[0] == '\n') {
        slice = slice.substr(1);
        break;
      } else {
        errors.push_back("lexRow: expected CR, LF or |");
        return;
      }
    }
  }
  tokens.push_back({ .type = KV1_TOKEN_ROW_END });
}

// Skips horizontal whitespace (space, tab, vertical tab, form feed) at the
// front of `slice`, followed by at most one line ending (CR, LF or CRLF).
//
// Returns true when a line ending was consumed. Returns false when a
// non-whitespace character was reached first (`slice` then starts at that
// character), or when the input ran out before any line ending was seen (in
// which case `slice` is left untouched).
bool Kv1Lexer::eatWhitespace() {
  for (size_t i = 0; i < slice.size(); i++) {
    const char c = slice[i];
    if (c == '\r') {
      slice = slice.substr(i + 1);
      // `slice` now starts right after the CR, so the LF of a CRLF pair (if
      // any) sits at index 0 -- not at the pre-substr offset.
      if (!slice.empty() && slice[0] == '\n')
        slice = slice.substr(1);
      return true;
    }
    if (c == '\n') {
      slice = slice.substr(i + 1);
      return true;
    }

    if (c != ' ' && c != '\f' && c != '\t' && c != '\v') {
      slice = slice.substr(i);
      return false;
    }
  }
  return false;
}

// Lexes the whole input: an optional header line first, then comment lines,
// blank lines and rows until the input is exhausted or an error has been
// recorded in `errors`.
void Kv1Lexer::lex() {
  lexOptionalHeader();
  eatWhitespace();

  for (;;) {
    if (!errors.empty() || slice.empty()) break;

    lexOptionalComment();
    const bool consumed_line_ending = eatWhitespace();
    if (consumed_line_ending) continue;

    // At this point we are either at the end of the file or at the start of
    // some column data.
    if (errors.empty()) {
      lexRow();
    }
  }
}