diff options
author | Rutger Broekhoff | 2024-05-02 20:27:40 +0200 |
---|---|---|
committer | Rutger Broekhoff | 2024-05-02 20:27:40 +0200 |
commit | 17a3ea880402338420699e03bcb24181e4ff3924 (patch) | |
tree | da666ef91e0b60d20aa0b01529644c136fd1f4ab /src/bundleparquet/spliturl.cpp | |
download | oeuf-17a3ea880402338420699e03bcb24181e4ff3924.tar.gz oeuf-17a3ea880402338420699e03bcb24181e4ff3924.zip |
Initial commit
Based on dc4ba6a
Diffstat (limited to 'src/bundleparquet/spliturl.cpp')
-rw-r--r-- | src/bundleparquet/spliturl.cpp | 203 |
1 files changed, 203 insertions, 0 deletions
diff --git a/src/bundleparquet/spliturl.cpp b/src/bundleparquet/spliturl.cpp new file mode 100644 index 0000000..90fd821 --- /dev/null +++ b/src/bundleparquet/spliturl.cpp | |||
@@ -0,0 +1,203 @@ | |||
1 | // vim:set sw=2 ts=2 sts et: | ||
2 | |||
3 | #include <cstring> | ||
4 | #include <iostream> | ||
5 | #include <optional> | ||
6 | #include <sstream> | ||
7 | #include <string> | ||
8 | |||
9 | #include <curl/curl.h> | ||
10 | |||
11 | #include "spliturl.hpp" | ||
12 | |||
13 | // splitUrl takes a URL of the shape '[http[s]://]HOST[:PORT][/PATH]', and | ||
14 | // splits it into two URLs: | ||
15 | // - scheme + host -> '[http[s]://]HOST' | ||
16 | // - port + path -> '[PORT][/PATH]' | ||
17 | // In case an IPv6 address is provided, the host must enclosed in square | ||
18 | // brackets. The zone ID may also be indicated. Note that in the resulting | ||
19 | // parts, the colon preceding the port number is omitted. This is on purpose. | ||
20 | std::optional<SplitUrl> splitUrl(const std::string &url, std::string *error) { | ||
21 | std::stringstream errs; | ||
22 | std::optional<SplitUrl> result; | ||
23 | char *processed = nullptr; | ||
24 | char *scheme = nullptr; | ||
25 | char *user = nullptr; | ||
26 | char *password = nullptr; | ||
27 | char *zoneid = nullptr; | ||
28 | char *query = nullptr; | ||
29 | char *fragment = nullptr; | ||
30 | CURLU *schemehost = nullptr; | ||
31 | char *schemehost_url = nullptr; | ||
32 | char *portpath_url = nullptr; | ||
33 | |||
34 | // Parse the URL, allowing the user to omit the scheme. CURL will use 'https' | ||
35 | // by default if no scheme is specified. | ||
36 | |||
37 | CURLU *parsed = curl_url(); | ||
38 | CURLUcode rc = curl_url_set(parsed, CURLUPART_URL, url.c_str(), CURLU_DEFAULT_SCHEME); | ||
39 | if (rc != CURLUE_OK) { | ||
40 | errs << "Failed to parse URL: " << curl_url_strerror(rc); | ||
41 | goto Exit; | ||
42 | } | ||
43 | |||
44 | // As we parse the URL with the option CURLU_DEFAULT_SCHEME, the CURL API | ||
45 | // won't require the user to provide the scheme part of the URL. It will | ||
46 | // automatically default the scheme to https. However, we do not usually want | ||
47 | // it to default to HTTPS, but HTTP instead (as the use case, connecting to a | ||
48 | // PushGateway server, usually is served over a private network via HTTP). | ||
49 | // | ||
50 | // This is why we check if the scheme was put there by CURL and otherwise set | ||
51 | // it to HTTP. We also check for any other schemes that the user may have | ||
52 | // provided, and reject anything that is not http/https. | ||
53 | if (!url.starts_with("http://") && !url.starts_with("https://")) { | ||
54 | rc = curl_url_get(parsed, CURLUPART_SCHEME, &scheme, 0); | ||
55 | if (rc != CURLUE_OK) { | ||
56 | errs << "Could not get scheme from parsed URL: " << curl_url_strerror(rc); | ||
57 | goto Exit; | ||
58 | } | ||
59 | if (strcmp(scheme, "https")) { | ||
60 | errs << "Unexpected scheme" << scheme << "in provided URL (expected http or https)"; | ||
61 | goto Exit; | ||
62 | } | ||
63 | rc = curl_url_set(parsed, CURLUPART_SCHEME, "http", 0); | ||
64 | if (rc != CURLUE_OK) { | ||
65 | errs << "Could not set URL scheme to http: " << curl_url_strerror(rc); | ||
66 | goto Exit; | ||
67 | } | ||
68 | } | ||
69 | |||
70 | // Turn the parsed URL back into a string. | ||
71 | rc = curl_url_get(parsed, CURLUPART_URL, &processed, 0); | ||
72 | if (rc != CURLUE_OK) { | ||
73 | errs << "Failed to output parsed URL: " << curl_url_strerror(rc); | ||
74 | goto Exit; | ||
75 | } | ||
76 | |||
77 | // This part of the code checks if no prohibited parts are present in the URL | ||
78 | // (basic auth: (user, password), query, fragment). | ||
79 | |||
80 | rc = curl_url_get(parsed, CURLUPART_USER, &user, 0); | ||
81 | if (rc == CURLUE_OK && strlen(user) != 0) { | ||
82 | errs << "Provided URL should not contain a user part"; | ||
83 | goto Exit; | ||
84 | } else if (rc != CURLUE_NO_USER && rc != CURLUE_OK) { | ||
85 | errs << "Failed to get check user part existence in provided url: " << curl_url_strerror(rc); | ||
86 | goto Exit; | ||
87 | } | ||
88 | |||
89 | rc = curl_url_get(parsed, CURLUPART_PASSWORD, &password, 0); | ||
90 | if (rc == CURLUE_OK && strlen(password) != 0) { | ||
91 | errs << "Provided URL should not contain a password part"; | ||
92 | goto Exit; | ||
93 | } else if (rc != CURLUE_NO_PASSWORD && rc != CURLUE_OK) { | ||
94 | errs << "Failed to get check password part existence in provided url: " << curl_url_strerror(rc); | ||
95 | goto Exit; | ||
96 | } | ||
97 | |||
98 | rc = curl_url_get(parsed, CURLUPART_QUERY, &query, 0); | ||
99 | if (rc == CURLUE_OK && strlen(query) != 0) { | ||
100 | errs << "Provided URL should not contain a query part"; | ||
101 | goto Exit; | ||
102 | } else if (rc != CURLUE_NO_QUERY && rc != CURLUE_OK) { | ||
103 | errs << "Failed to get check query part existence in provided url: " << curl_url_strerror(rc); | ||
104 | goto Exit; | ||
105 | } | ||
106 | |||
107 | rc = curl_url_get(parsed, CURLUPART_FRAGMENT, &fragment, 0); | ||
108 | if (rc == CURLUE_OK && strlen(fragment) != 0) { | ||
109 | errs << "Provided URL should not contain a fragment part"; | ||
110 | goto Exit; | ||
111 | } else if (rc != CURLUE_NO_FRAGMENT && rc != CURLUE_OK) { | ||
112 | errs << "Failed to get check fragment part existence in provided url: " << curl_url_strerror(rc); | ||
113 | goto Exit; | ||
114 | } | ||
115 | |||
116 | // Now that we know that the provided URL makes sense, we can start doing | ||
117 | // some arts and crafts. We get started by copying the parsed URL into | ||
118 | // schemehost and simply delete all parts which are not scheme + host. | ||
119 | |||
120 | schemehost = curl_url_dup(parsed); | ||
121 | |||
122 | // CURL BUG WORKAROUND: CURLUPART_ZONEID is NOT copied by curl_url_dup! | ||
123 | // ^ fixed in CURL 8.3.0 after https://curl.se/mail/lib-2023-07/0047.html | ||
124 | rc = curl_url_get(parsed, CURLUPART_ZONEID, &zoneid, 0); | ||
125 | if (rc == CURLUE_OK) { | ||
126 | rc = curl_url_set(schemehost, CURLUPART_ZONEID, zoneid, 0); | ||
127 | if (rc != CURLUE_OK) { | ||
128 | errs << "Could not copy zone ID to duplicated URL: " << curl_url_strerror(rc); | ||
129 | goto Exit; | ||
130 | } | ||
131 | } | ||
132 | rc = curl_url_set(schemehost, CURLUPART_PORT, nullptr, 0); | ||
133 | if (rc != CURLUE_OK) { | ||
134 | errs << "Could not unset port in duplicated URL: " << curl_url_strerror(rc); | ||
135 | goto Exit; | ||
136 | } | ||
137 | rc = curl_url_set(schemehost, CURLUPART_PATH, nullptr, 0); | ||
138 | if (rc != CURLUE_OK) { | ||
139 | errs << "Could not unset path in duplicated URL: " << curl_url_strerror(rc); | ||
140 | goto Exit; | ||
141 | } | ||
142 | |||
143 | // Okay, now we have the schemehost CURLU all ready to go. Note that a URL | ||
144 | // only consisting of a scheme and host is considered valid, so CURL will be | ||
145 | // more than happy to actually turn it into a string for us. Which is exactly | ||
146 | // what we do here :) | ||
147 | |||
148 | rc = curl_url_get(schemehost, CURLUPART_URL, &schemehost_url, 0); | ||
149 | if (rc != CURLUE_OK) { | ||
150 | errs << "Could not get scheme + host URL: " << curl_url_strerror(rc); | ||
151 | goto Exit; | ||
152 | } | ||
153 | |||
154 | // Remove any trailing slash after the scheme + host URL that CURL might have | ||
155 | // put there -- we still want to get a valid URL if we paste the port + path | ||
156 | // part behind it. | ||
157 | |||
158 | if (strlen(schemehost_url) > 0) { | ||
159 | if (schemehost_url[strlen(schemehost_url) - 1] != '/') { | ||
160 | errs << "Scheme + host URL does not end with a slash"; | ||
161 | goto Exit; | ||
162 | } | ||
163 | schemehost_url[strlen(schemehost_url) - 1] = '\0'; | ||
164 | } | ||
165 | |||
166 | // Look, this is really gross. Because the port + path part of the URL is not | ||
167 | // a valid URL itself, but the scheme + host should be a prefix of the full | ||
168 | // URL containing the port + path, we can simply check if it is indeed a | ||
169 | // prefix, and then strip it from the full URL, giving us the port + path | ||
170 | // (after deleting the colon preceding the port). | ||
171 | |||
172 | if (!std::string_view(processed).starts_with(schemehost_url)) { | ||
173 | errs << "Scheme + host URL is not a prefix of the processed URL"; | ||
174 | goto Exit; | ||
175 | } | ||
176 | |||
177 | portpath_url = processed + strlen(schemehost_url); | ||
178 | // We should not have the colon before the port, prometheus-cpp inserts it | ||
179 | if (strlen(portpath_url) > 0 && portpath_url[0] == ':') portpath_url++; | ||
180 | // We do not need a trailing slash | ||
181 | if (strlen(portpath_url) > 0 && portpath_url[strlen(portpath_url)-1] == '/') | ||
182 | portpath_url[strlen(portpath_url)-1] = '\0'; | ||
183 | |||
184 | // It has been done. BLECH | ||
185 | result = std::make_optional<SplitUrl>(schemehost_url, portpath_url); | ||
186 | |||
187 | Exit: | ||
188 | curl_free(processed); | ||
189 | curl_free(scheme); | ||
190 | curl_free(user); | ||
191 | curl_free(password); | ||
192 | curl_free(query); | ||
193 | curl_free(fragment); | ||
194 | curl_free(zoneid); | ||
195 | curl_free(schemehost_url); | ||
196 | curl_url_cleanup(schemehost); | ||
197 | curl_url_cleanup(parsed); | ||
198 | |||
199 | if (!result && error) | ||
200 | *error = errs.str(); | ||
201 | |||
202 | return result; | ||
203 | } | ||