diff options
| author | Rutger Broekhoff | 2024-05-02 20:27:40 +0200 | 
|---|---|---|
| committer | Rutger Broekhoff | 2024-05-02 20:27:40 +0200 | 
| commit | 17a3ea880402338420699e03bcb24181e4ff3924 (patch) | |
| tree | da666ef91e0b60d20aa0b01529644c136fd1f4ab /src/bundleparquet/spliturl.cpp | |
| download | oeuf-17a3ea880402338420699e03bcb24181e4ff3924.tar.gz oeuf-17a3ea880402338420699e03bcb24181e4ff3924.zip | |
Initial commit
Based on dc4ba6a
Diffstat (limited to 'src/bundleparquet/spliturl.cpp')
| -rw-r--r-- | src/bundleparquet/spliturl.cpp | 203 | 
1 files changed, 203 insertions, 0 deletions
| diff --git a/src/bundleparquet/spliturl.cpp b/src/bundleparquet/spliturl.cpp new file mode 100644 index 0000000..90fd821 --- /dev/null +++ b/src/bundleparquet/spliturl.cpp | |||
| @@ -0,0 +1,203 @@ | |||
| 1 | // vim:set sw=2 ts=2 sts et: | ||
| 2 | |||
| 3 | #include <cstring> | ||
| 4 | #include <iostream> | ||
| 5 | #include <optional> | ||
| 6 | #include <sstream> | ||
| 7 | #include <string> | ||
| 8 | |||
| 9 | #include <curl/curl.h> | ||
| 10 | |||
| 11 | #include "spliturl.hpp" | ||
| 12 | |||
| 13 | // splitUrl takes a URL of the shape '[http[s]://]HOST[:PORT][/PATH]', and | ||
| 14 | // splits it into two URLs: | ||
| 15 | // - scheme + host -> '[http[s]://]HOST' | ||
| 16 | // - port + path -> '[PORT][/PATH]' | ||
| 17 | // In case an IPv6 address is provided, the host must enclosed in square | ||
| 18 | // brackets. The zone ID may also be indicated. Note that in the resulting | ||
| 19 | // parts, the colon preceding the port number is omitted. This is on purpose. | ||
| 20 | std::optional<SplitUrl> splitUrl(const std::string &url, std::string *error) { | ||
| 21 | std::stringstream errs; | ||
| 22 | std::optional<SplitUrl> result; | ||
| 23 | char *processed = nullptr; | ||
| 24 | char *scheme = nullptr; | ||
| 25 | char *user = nullptr; | ||
| 26 | char *password = nullptr; | ||
| 27 | char *zoneid = nullptr; | ||
| 28 | char *query = nullptr; | ||
| 29 | char *fragment = nullptr; | ||
| 30 | CURLU *schemehost = nullptr; | ||
| 31 | char *schemehost_url = nullptr; | ||
| 32 | char *portpath_url = nullptr; | ||
| 33 | |||
| 34 | // Parse the URL, allowing the user to omit the scheme. CURL will use 'https' | ||
| 35 | // by default if no scheme is specified. | ||
| 36 | |||
| 37 | CURLU *parsed = curl_url(); | ||
| 38 | CURLUcode rc = curl_url_set(parsed, CURLUPART_URL, url.c_str(), CURLU_DEFAULT_SCHEME); | ||
| 39 | if (rc != CURLUE_OK) { | ||
| 40 | errs << "Failed to parse URL: " << curl_url_strerror(rc); | ||
| 41 | goto Exit; | ||
| 42 | } | ||
| 43 | |||
| 44 | // As we parse the URL with the option CURLU_DEFAULT_SCHEME, the CURL API | ||
| 45 | // won't require the user to provide the scheme part of the URL. It will | ||
| 46 | // automatically default the scheme to https. However, we do not usually want | ||
| 47 | // it to default to HTTPS, but HTTP instead (as the use case, connecting to a | ||
| 48 | // PushGateway server, usually is served over a private network via HTTP). | ||
| 49 | // | ||
| 50 | // This is why we check if the scheme was put there by CURL and otherwise set | ||
| 51 | // it to HTTP. We also check for any other schemes that the user may have | ||
| 52 | // provided, and reject anything that is not http/https. | ||
| 53 | if (!url.starts_with("http://") && !url.starts_with("https://")) { | ||
| 54 | rc = curl_url_get(parsed, CURLUPART_SCHEME, &scheme, 0); | ||
| 55 | if (rc != CURLUE_OK) { | ||
| 56 | errs << "Could not get scheme from parsed URL: " << curl_url_strerror(rc); | ||
| 57 | goto Exit; | ||
| 58 | } | ||
| 59 | if (strcmp(scheme, "https")) { | ||
| 60 | errs << "Unexpected scheme" << scheme << "in provided URL (expected http or https)"; | ||
| 61 | goto Exit; | ||
| 62 | } | ||
| 63 | rc = curl_url_set(parsed, CURLUPART_SCHEME, "http", 0); | ||
| 64 | if (rc != CURLUE_OK) { | ||
| 65 | errs << "Could not set URL scheme to http: " << curl_url_strerror(rc); | ||
| 66 | goto Exit; | ||
| 67 | } | ||
| 68 | } | ||
| 69 | |||
| 70 | // Turn the parsed URL back into a string. | ||
| 71 | rc = curl_url_get(parsed, CURLUPART_URL, &processed, 0); | ||
| 72 | if (rc != CURLUE_OK) { | ||
| 73 | errs << "Failed to output parsed URL: " << curl_url_strerror(rc); | ||
| 74 | goto Exit; | ||
| 75 | } | ||
| 76 | |||
| 77 | // This part of the code checks if no prohibited parts are present in the URL | ||
| 78 | // (basic auth: (user, password), query, fragment). | ||
| 79 | |||
| 80 | rc = curl_url_get(parsed, CURLUPART_USER, &user, 0); | ||
| 81 | if (rc == CURLUE_OK && strlen(user) != 0) { | ||
| 82 | errs << "Provided URL should not contain a user part"; | ||
| 83 | goto Exit; | ||
| 84 | } else if (rc != CURLUE_NO_USER && rc != CURLUE_OK) { | ||
| 85 | errs << "Failed to get check user part existence in provided url: " << curl_url_strerror(rc); | ||
| 86 | goto Exit; | ||
| 87 | } | ||
| 88 | |||
| 89 | rc = curl_url_get(parsed, CURLUPART_PASSWORD, &password, 0); | ||
| 90 | if (rc == CURLUE_OK && strlen(password) != 0) { | ||
| 91 | errs << "Provided URL should not contain a password part"; | ||
| 92 | goto Exit; | ||
| 93 | } else if (rc != CURLUE_NO_PASSWORD && rc != CURLUE_OK) { | ||
| 94 | errs << "Failed to get check password part existence in provided url: " << curl_url_strerror(rc); | ||
| 95 | goto Exit; | ||
| 96 | } | ||
| 97 | |||
| 98 | rc = curl_url_get(parsed, CURLUPART_QUERY, &query, 0); | ||
| 99 | if (rc == CURLUE_OK && strlen(query) != 0) { | ||
| 100 | errs << "Provided URL should not contain a query part"; | ||
| 101 | goto Exit; | ||
| 102 | } else if (rc != CURLUE_NO_QUERY && rc != CURLUE_OK) { | ||
| 103 | errs << "Failed to get check query part existence in provided url: " << curl_url_strerror(rc); | ||
| 104 | goto Exit; | ||
| 105 | } | ||
| 106 | |||
| 107 | rc = curl_url_get(parsed, CURLUPART_FRAGMENT, &fragment, 0); | ||
| 108 | if (rc == CURLUE_OK && strlen(fragment) != 0) { | ||
| 109 | errs << "Provided URL should not contain a fragment part"; | ||
| 110 | goto Exit; | ||
| 111 | } else if (rc != CURLUE_NO_FRAGMENT && rc != CURLUE_OK) { | ||
| 112 | errs << "Failed to get check fragment part existence in provided url: " << curl_url_strerror(rc); | ||
| 113 | goto Exit; | ||
| 114 | } | ||
| 115 | |||
| 116 | // Now that we know that the provided URL makes sense, we can start doing | ||
| 117 | // some arts and crafts. We get started by copying the parsed URL into | ||
| 118 | // schemehost and simply delete all parts which are not scheme + host. | ||
| 119 | |||
| 120 | schemehost = curl_url_dup(parsed); | ||
| 121 | |||
| 122 | // CURL BUG WORKAROUND: CURLUPART_ZONEID is NOT copied by curl_url_dup! | ||
| 123 | // ^ fixed in CURL 8.3.0 after https://curl.se/mail/lib-2023-07/0047.html | ||
| 124 | rc = curl_url_get(parsed, CURLUPART_ZONEID, &zoneid, 0); | ||
| 125 | if (rc == CURLUE_OK) { | ||
| 126 | rc = curl_url_set(schemehost, CURLUPART_ZONEID, zoneid, 0); | ||
| 127 | if (rc != CURLUE_OK) { | ||
| 128 | errs << "Could not copy zone ID to duplicated URL: " << curl_url_strerror(rc); | ||
| 129 | goto Exit; | ||
| 130 | } | ||
| 131 | } | ||
| 132 | rc = curl_url_set(schemehost, CURLUPART_PORT, nullptr, 0); | ||
| 133 | if (rc != CURLUE_OK) { | ||
| 134 | errs << "Could not unset port in duplicated URL: " << curl_url_strerror(rc); | ||
| 135 | goto Exit; | ||
| 136 | } | ||
| 137 | rc = curl_url_set(schemehost, CURLUPART_PATH, nullptr, 0); | ||
| 138 | if (rc != CURLUE_OK) { | ||
| 139 | errs << "Could not unset path in duplicated URL: " << curl_url_strerror(rc); | ||
| 140 | goto Exit; | ||
| 141 | } | ||
| 142 | |||
| 143 | // Okay, now we have the schemehost CURLU all ready to go. Note that a URL | ||
| 144 | // only consisting of a scheme and host is considered valid, so CURL will be | ||
| 145 | // more than happy to actually turn it into a string for us. Which is exactly | ||
| 146 | // what we do here :) | ||
| 147 | |||
| 148 | rc = curl_url_get(schemehost, CURLUPART_URL, &schemehost_url, 0); | ||
| 149 | if (rc != CURLUE_OK) { | ||
| 150 | errs << "Could not get scheme + host URL: " << curl_url_strerror(rc); | ||
| 151 | goto Exit; | ||
| 152 | } | ||
| 153 | |||
| 154 | // Remove any trailing slash after the scheme + host URL that CURL might have | ||
| 155 | // put there -- we still want to get a valid URL if we paste the port + path | ||
| 156 | // part behind it. | ||
| 157 | |||
| 158 | if (strlen(schemehost_url) > 0) { | ||
| 159 | if (schemehost_url[strlen(schemehost_url) - 1] != '/') { | ||
| 160 | errs << "Scheme + host URL does not end with a slash"; | ||
| 161 | goto Exit; | ||
| 162 | } | ||
| 163 | schemehost_url[strlen(schemehost_url) - 1] = '\0'; | ||
| 164 | } | ||
| 165 | |||
| 166 | // Look, this is really gross. Because the port + path part of the URL is not | ||
| 167 | // a valid URL itself, but the scheme + host should be a prefix of the full | ||
| 168 | // URL containing the port + path, we can simply check if it is indeed a | ||
| 169 | // prefix, and then strip it from the full URL, giving us the port + path | ||
| 170 | // (after deleting the colon preceding the port). | ||
| 171 | |||
| 172 | if (!std::string_view(processed).starts_with(schemehost_url)) { | ||
| 173 | errs << "Scheme + host URL is not a prefix of the processed URL"; | ||
| 174 | goto Exit; | ||
| 175 | } | ||
| 176 | |||
| 177 | portpath_url = processed + strlen(schemehost_url); | ||
| 178 | // We should not have the colon before the port, prometheus-cpp inserts it | ||
| 179 | if (strlen(portpath_url) > 0 && portpath_url[0] == ':') portpath_url++; | ||
| 180 | // We do not need a trailing slash | ||
| 181 | if (strlen(portpath_url) > 0 && portpath_url[strlen(portpath_url)-1] == '/') | ||
| 182 | portpath_url[strlen(portpath_url)-1] = '\0'; | ||
| 183 | |||
| 184 | // It has been done. BLECH | ||
| 185 | result = std::make_optional<SplitUrl>(schemehost_url, portpath_url); | ||
| 186 | |||
| 187 | Exit: | ||
| 188 | curl_free(processed); | ||
| 189 | curl_free(scheme); | ||
| 190 | curl_free(user); | ||
| 191 | curl_free(password); | ||
| 192 | curl_free(query); | ||
| 193 | curl_free(fragment); | ||
| 194 | curl_free(zoneid); | ||
| 195 | curl_free(schemehost_url); | ||
| 196 | curl_url_cleanup(schemehost); | ||
| 197 | curl_url_cleanup(parsed); | ||
| 198 | |||
| 199 | if (!result && error) | ||
| 200 | *error = errs.str(); | ||
| 201 | |||
| 202 | return result; | ||
| 203 | } | ||