aboutsummaryrefslogtreecommitdiffstats
path: root/src/bundleparquet/spliturl.cpp
diff options
context:
space:
mode:
authorLibravatar Rutger Broekhoff2024-05-02 20:27:40 +0200
committerLibravatar Rutger Broekhoff2024-05-02 20:27:40 +0200
commit17a3ea880402338420699e03bcb24181e4ff3924 (patch)
treeda666ef91e0b60d20aa0b01529644c136fd1f4ab /src/bundleparquet/spliturl.cpp
downloadoeuf-17a3ea880402338420699e03bcb24181e4ff3924.tar.gz
oeuf-17a3ea880402338420699e03bcb24181e4ff3924.zip
Initial commit
Based on dc4ba6a
Diffstat (limited to 'src/bundleparquet/spliturl.cpp')
-rw-r--r--src/bundleparquet/spliturl.cpp203
1 files changed, 203 insertions, 0 deletions
diff --git a/src/bundleparquet/spliturl.cpp b/src/bundleparquet/spliturl.cpp
new file mode 100644
index 0000000..90fd821
--- /dev/null
+++ b/src/bundleparquet/spliturl.cpp
@@ -0,0 +1,203 @@
1// vim:set sw=2 ts=2 sts et:
2
3#include <cstring>
4#include <iostream>
5#include <optional>
6#include <sstream>
7#include <string>
8
9#include <curl/curl.h>
10
11#include "spliturl.hpp"
12
13// splitUrl takes a URL of the shape '[http[s]://]HOST[:PORT][/PATH]', and
14// splits it into two URLs:
15// - scheme + host -> '[http[s]://]HOST'
16// - port + path -> '[PORT][/PATH]'
17// In case an IPv6 address is provided, the host must enclosed in square
18// brackets. The zone ID may also be indicated. Note that in the resulting
19// parts, the colon preceding the port number is omitted. This is on purpose.
20std::optional<SplitUrl> splitUrl(const std::string &url, std::string *error) {
21 std::stringstream errs;
22 std::optional<SplitUrl> result;
23 char *processed = nullptr;
24 char *scheme = nullptr;
25 char *user = nullptr;
26 char *password = nullptr;
27 char *zoneid = nullptr;
28 char *query = nullptr;
29 char *fragment = nullptr;
30 CURLU *schemehost = nullptr;
31 char *schemehost_url = nullptr;
32 char *portpath_url = nullptr;
33
34 // Parse the URL, allowing the user to omit the scheme. CURL will use 'https'
35 // by default if no scheme is specified.
36
37 CURLU *parsed = curl_url();
38 CURLUcode rc = curl_url_set(parsed, CURLUPART_URL, url.c_str(), CURLU_DEFAULT_SCHEME);
39 if (rc != CURLUE_OK) {
40 errs << "Failed to parse URL: " << curl_url_strerror(rc);
41 goto Exit;
42 }
43
44 // As we parse the URL with the option CURLU_DEFAULT_SCHEME, the CURL API
45 // won't require the user to provide the scheme part of the URL. It will
46 // automatically default the scheme to https. However, we do not usually want
47 // it to default to HTTPS, but HTTP instead (as the use case, connecting to a
48 // PushGateway server, usually is served over a private network via HTTP).
49 //
50 // This is why we check if the scheme was put there by CURL and otherwise set
51 // it to HTTP. We also check for any other schemes that the user may have
52 // provided, and reject anything that is not http/https.
53 if (!url.starts_with("http://") && !url.starts_with("https://")) {
54 rc = curl_url_get(parsed, CURLUPART_SCHEME, &scheme, 0);
55 if (rc != CURLUE_OK) {
56 errs << "Could not get scheme from parsed URL: " << curl_url_strerror(rc);
57 goto Exit;
58 }
59 if (strcmp(scheme, "https")) {
60 errs << "Unexpected scheme" << scheme << "in provided URL (expected http or https)";
61 goto Exit;
62 }
63 rc = curl_url_set(parsed, CURLUPART_SCHEME, "http", 0);
64 if (rc != CURLUE_OK) {
65 errs << "Could not set URL scheme to http: " << curl_url_strerror(rc);
66 goto Exit;
67 }
68 }
69
70 // Turn the parsed URL back into a string.
71 rc = curl_url_get(parsed, CURLUPART_URL, &processed, 0);
72 if (rc != CURLUE_OK) {
73 errs << "Failed to output parsed URL: " << curl_url_strerror(rc);
74 goto Exit;
75 }
76
77 // This part of the code checks if no prohibited parts are present in the URL
78 // (basic auth: (user, password), query, fragment).
79
80 rc = curl_url_get(parsed, CURLUPART_USER, &user, 0);
81 if (rc == CURLUE_OK && strlen(user) != 0) {
82 errs << "Provided URL should not contain a user part";
83 goto Exit;
84 } else if (rc != CURLUE_NO_USER && rc != CURLUE_OK) {
85 errs << "Failed to get check user part existence in provided url: " << curl_url_strerror(rc);
86 goto Exit;
87 }
88
89 rc = curl_url_get(parsed, CURLUPART_PASSWORD, &password, 0);
90 if (rc == CURLUE_OK && strlen(password) != 0) {
91 errs << "Provided URL should not contain a password part";
92 goto Exit;
93 } else if (rc != CURLUE_NO_PASSWORD && rc != CURLUE_OK) {
94 errs << "Failed to get check password part existence in provided url: " << curl_url_strerror(rc);
95 goto Exit;
96 }
97
98 rc = curl_url_get(parsed, CURLUPART_QUERY, &query, 0);
99 if (rc == CURLUE_OK && strlen(query) != 0) {
100 errs << "Provided URL should not contain a query part";
101 goto Exit;
102 } else if (rc != CURLUE_NO_QUERY && rc != CURLUE_OK) {
103 errs << "Failed to get check query part existence in provided url: " << curl_url_strerror(rc);
104 goto Exit;
105 }
106
107 rc = curl_url_get(parsed, CURLUPART_FRAGMENT, &fragment, 0);
108 if (rc == CURLUE_OK && strlen(fragment) != 0) {
109 errs << "Provided URL should not contain a fragment part";
110 goto Exit;
111 } else if (rc != CURLUE_NO_FRAGMENT && rc != CURLUE_OK) {
112 errs << "Failed to get check fragment part existence in provided url: " << curl_url_strerror(rc);
113 goto Exit;
114 }
115
116 // Now that we know that the provided URL makes sense, we can start doing
117 // some arts and crafts. We get started by copying the parsed URL into
118 // schemehost and simply delete all parts which are not scheme + host.
119
120 schemehost = curl_url_dup(parsed);
121
122 // CURL BUG WORKAROUND: CURLUPART_ZONEID is NOT copied by curl_url_dup!
123 // ^ fixed in CURL 8.3.0 after https://curl.se/mail/lib-2023-07/0047.html
124 rc = curl_url_get(parsed, CURLUPART_ZONEID, &zoneid, 0);
125 if (rc == CURLUE_OK) {
126 rc = curl_url_set(schemehost, CURLUPART_ZONEID, zoneid, 0);
127 if (rc != CURLUE_OK) {
128 errs << "Could not copy zone ID to duplicated URL: " << curl_url_strerror(rc);
129 goto Exit;
130 }
131 }
132 rc = curl_url_set(schemehost, CURLUPART_PORT, nullptr, 0);
133 if (rc != CURLUE_OK) {
134 errs << "Could not unset port in duplicated URL: " << curl_url_strerror(rc);
135 goto Exit;
136 }
137 rc = curl_url_set(schemehost, CURLUPART_PATH, nullptr, 0);
138 if (rc != CURLUE_OK) {
139 errs << "Could not unset path in duplicated URL: " << curl_url_strerror(rc);
140 goto Exit;
141 }
142
143 // Okay, now we have the schemehost CURLU all ready to go. Note that a URL
144 // only consisting of a scheme and host is considered valid, so CURL will be
145 // more than happy to actually turn it into a string for us. Which is exactly
146 // what we do here :)
147
148 rc = curl_url_get(schemehost, CURLUPART_URL, &schemehost_url, 0);
149 if (rc != CURLUE_OK) {
150 errs << "Could not get scheme + host URL: " << curl_url_strerror(rc);
151 goto Exit;
152 }
153
154 // Remove any trailing slash after the scheme + host URL that CURL might have
155 // put there -- we still want to get a valid URL if we paste the port + path
156 // part behind it.
157
158 if (strlen(schemehost_url) > 0) {
159 if (schemehost_url[strlen(schemehost_url) - 1] != '/') {
160 errs << "Scheme + host URL does not end with a slash";
161 goto Exit;
162 }
163 schemehost_url[strlen(schemehost_url) - 1] = '\0';
164 }
165
166 // Look, this is really gross. Because the port + path part of the URL is not
167 // a valid URL itself, but the scheme + host should be a prefix of the full
168 // URL containing the port + path, we can simply check if it is indeed a
169 // prefix, and then strip it from the full URL, giving us the port + path
170 // (after deleting the colon preceding the port).
171
172 if (!std::string_view(processed).starts_with(schemehost_url)) {
173 errs << "Scheme + host URL is not a prefix of the processed URL";
174 goto Exit;
175 }
176
177 portpath_url = processed + strlen(schemehost_url);
178 // We should not have the colon before the port, prometheus-cpp inserts it
179 if (strlen(portpath_url) > 0 && portpath_url[0] == ':') portpath_url++;
180 // We do not need a trailing slash
181 if (strlen(portpath_url) > 0 && portpath_url[strlen(portpath_url)-1] == '/')
182 portpath_url[strlen(portpath_url)-1] = '\0';
183
184 // It has been done. BLECH
185 result = std::make_optional<SplitUrl>(schemehost_url, portpath_url);
186
187Exit:
188 curl_free(processed);
189 curl_free(scheme);
190 curl_free(user);
191 curl_free(password);
192 curl_free(query);
193 curl_free(fragment);
194 curl_free(zoneid);
195 curl_free(schemehost_url);
196 curl_url_cleanup(schemehost);
197 curl_url_cleanup(parsed);
198
199 if (!result && error)
200 *error = errs.str();
201
202 return result;
203}