aboutsummaryrefslogtreecommitdiffstats
path: root/src/bundleparquet/spliturl.cpp
blob: f5328a9fdc409b671466e86e9333157777523b89 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
// vim:set sw=2 ts=2 sts et:

#include <cstring>
#include <iostream>
#include <memory>
#include <optional>
#include <sstream>
#include <string>
#include <string_view>

#include <curl/curl.h>

#include "spliturl.hpp"

// splitUrl takes a URL of the shape '[http[s]://]HOST[:PORT][/PATH]', and
// splits it into two URLs:
//   - scheme + host -> '[http[s]://]HOST'
//   - port   + path -> '[PORT][/PATH]'
// In case an IPv6 address is provided, the host must be enclosed in square
// brackets. The zone ID may also be indicated. Note that in the resulting
// parts, the colon preceding the port number is omitted. This is on purpose.
std::optional<SplitUrl> splitUrl(const std::string &url, std::string *error) {
  std::stringstream errs;
  std::optional<SplitUrl> result;
  char   *processed      = nullptr;
  char   *scheme         = nullptr;
  char   *user           = nullptr;
  char   *password       = nullptr;
  char   *zoneid         = nullptr;
  char   *query          = nullptr;
  char   *fragment       = nullptr;
  CURLU  *schemehost     = nullptr;
  char   *schemehost_url = nullptr;
  char   *portpath_url   = nullptr;

  // Parse the URL, allowing the user to omit the scheme. CURL will use 'https'
  // by default if no scheme is specified.

  CURLU *parsed = curl_url();
  CURLUcode rc = curl_url_set(parsed, CURLUPART_URL, url.c_str(), CURLU_DEFAULT_SCHEME);
  if (rc != CURLUE_OK) {
    errs << "Failed to parse URL: " << curl_url_strerror(rc);
    goto Exit;
  }

  // As we parse the URL with the option CURLU_DEFAULT_SCHEME, the CURL API
  // won't require the user to provide the scheme part of the URL. It will
  // automatically default the scheme to https. However, we do not usually want
  // it to default to HTTPS, but HTTP instead. (In this specific use case of
  // connecting to a PushGateway server, we assume that the PushGateway server
  // is available over a trusted network, and only using unsecured HTTP).
  // 
  // This is why we check if the scheme was put there by CURL and otherwise set
  // it to HTTP. We also check for any other schemes that the user may have
  // provided, and reject anything that is not http/https.
  if (!url.starts_with("http://") && !url.starts_with("https://")) {
    rc = curl_url_get(parsed, CURLUPART_SCHEME, &scheme, 0);
    if (rc != CURLUE_OK) {
      errs << "Could not get scheme from parsed URL: " << curl_url_strerror(rc);
      goto Exit;
    }
    if (strcmp(scheme, "https")) {
      errs << "Unexpected scheme" << scheme << "in provided URL (expected http or https)";
      goto Exit;
    }
    rc = curl_url_set(parsed, CURLUPART_SCHEME, "http", 0);
    if (rc != CURLUE_OK) {
      errs << "Could not set URL scheme to http: " << curl_url_strerror(rc);
      goto Exit;
    }
  }

  // Turn the parsed URL back into a string.
  rc = curl_url_get(parsed, CURLUPART_URL, &processed, 0);
  if (rc != CURLUE_OK) {
    errs << "Failed to output parsed URL: " << curl_url_strerror(rc);
    goto Exit;
  }

  // This part of the code checks if no prohibited parts are present in the URL
  // (basic auth: (user, password), query, fragment).

  rc = curl_url_get(parsed, CURLUPART_USER, &user, 0);
  if (rc == CURLUE_OK && strlen(user) != 0) {
    errs << "Provided URL should not contain a user part";
    goto Exit;
  } else if (rc != CURLUE_NO_USER && rc != CURLUE_OK) {
    errs << "Failed to get check user part existence in provided url: " << curl_url_strerror(rc);
    goto Exit;
  }

  rc = curl_url_get(parsed, CURLUPART_PASSWORD, &password, 0);
  if (rc == CURLUE_OK && strlen(password) != 0) {
    errs << "Provided URL should not contain a password part";
    goto Exit;
  } else if (rc != CURLUE_NO_PASSWORD && rc != CURLUE_OK) {
    errs << "Failed to get check password part existence in provided url: " << curl_url_strerror(rc);
    goto Exit;
  }

  rc = curl_url_get(parsed, CURLUPART_QUERY, &query, 0);
  if (rc == CURLUE_OK && strlen(query) != 0) {
    errs << "Provided URL should not contain a query part";
    goto Exit;
  } else if (rc != CURLUE_NO_QUERY && rc != CURLUE_OK) {
    errs << "Failed to get check query part existence in provided url: " << curl_url_strerror(rc);
    goto Exit;
  }

  rc = curl_url_get(parsed, CURLUPART_FRAGMENT, &fragment, 0);
  if (rc == CURLUE_OK && strlen(fragment) != 0) {
    errs << "Provided URL should not contain a fragment part";
    goto Exit;
  } else if (rc != CURLUE_NO_FRAGMENT && rc != CURLUE_OK) {
    errs << "Failed to get check fragment part existence in provided url: " << curl_url_strerror(rc);
    goto Exit;
  }

  // Now that we know that the provided URL makes sense, we can start doing
  // some arts and crafts. We get started by copying the parsed URL into
  // schemehost and simply delete all parts which are not scheme + host.

  schemehost = curl_url_dup(parsed);

  // CURL BUG WORKAROUND: CURLUPART_ZONEID is NOT copied by curl_url_dup!
  //      ^ fixed in CURL 8.3.0 after https://curl.se/mail/lib-2023-07/0047.html
  rc = curl_url_get(parsed, CURLUPART_ZONEID, &zoneid, 0);
  if (rc == CURLUE_OK) {
    rc = curl_url_set(schemehost, CURLUPART_ZONEID, zoneid, 0);
    if (rc != CURLUE_OK) {
      errs << "Could not copy zone ID to duplicated URL: " << curl_url_strerror(rc);
      goto Exit;
    }
  }
  rc = curl_url_set(schemehost, CURLUPART_PORT, nullptr, 0);
  if (rc != CURLUE_OK) {
    errs << "Could not unset port in duplicated URL: " << curl_url_strerror(rc);
    goto Exit;
  }
  rc = curl_url_set(schemehost, CURLUPART_PATH, nullptr, 0);
  if (rc != CURLUE_OK) {
    errs << "Could not unset path in duplicated URL: " << curl_url_strerror(rc);
    goto Exit;
  }

  // Okay, now we have the schemehost CURLU all ready to go. Note that a URL
  // only consisting of a scheme and host is considered valid, so CURL will be
  // more than happy to actually turn it into a string for us. Which is exactly
  // what we do here :)

  rc = curl_url_get(schemehost, CURLUPART_URL, &schemehost_url, 0);
  if (rc != CURLUE_OK) {
    errs << "Could not get scheme + host URL: " << curl_url_strerror(rc);
    goto Exit;
  }

  // Remove any trailing slash after the scheme + host URL that CURL might have
  // put there -- we still want to get a valid URL if we paste the port + path
  // part behind it.

  if (strlen(schemehost_url) > 0) {
    if (schemehost_url[strlen(schemehost_url) - 1] != '/') {
      errs << "Scheme + host URL does not end with a slash";
      goto Exit;
    }
    schemehost_url[strlen(schemehost_url) - 1] = '\0';
  }

  // Look, this is really gross. Because the port + path part of the URL is not
  // a valid URL itself, but the scheme + host should be a prefix of the full
  // URL containing the port + path, we can simply check if it is indeed a
  // prefix, and then strip it from the full URL, giving us the port + path
  // (after deleting the colon preceding the port).

  if (!std::string_view(processed).starts_with(schemehost_url)) {
    errs << "Scheme + host URL is not a prefix of the processed URL";
    goto Exit;
  }

  portpath_url = processed + strlen(schemehost_url);
  // We should not have the colon before the port, prometheus-cpp inserts it
  if (strlen(portpath_url) > 0 && portpath_url[0] == ':') portpath_url++;
  // We do not need a trailing slash
  if (strlen(portpath_url) > 0 && portpath_url[strlen(portpath_url)-1] == '/')
    portpath_url[strlen(portpath_url)-1] = '\0';

  // It has been done. BLECH
  result = std::make_optional<SplitUrl>(schemehost_url, portpath_url);

Exit:
  curl_free(processed);
  curl_free(scheme);
  curl_free(user);
  curl_free(password);
  curl_free(query);
  curl_free(fragment);
  curl_free(zoneid);
  curl_free(schemehost_url);
  curl_url_cleanup(schemehost);
  curl_url_cleanup(parsed);

  if (!result && error)
    *error = errs.str();

  return result;
}