// vim:set sw=2 ts=2 sts et: // // Copyright 2024 Rutger Broekhoff. Licensed under the EUPL. #include #include #include #include #include #include #include "spliturl.hpp" // splitUrl takes a URL of the shape '[http[s]://]HOST[:PORT][/PATH]', and // splits it into two URLs: // - scheme + host -> '[http[s]://]HOST' // - port + path -> '[PORT][/PATH]' // In case an IPv6 address is provided, the host must be enclosed in square // brackets. The zone ID may also be indicated. Note that in the resulting // parts, the colon preceding the port number is omitted. This is on purpose. std::optional splitUrl(const std::string &url, std::string *error) { std::stringstream errs; std::optional result; char *processed = nullptr; char *scheme = nullptr; char *user = nullptr; char *password = nullptr; char *zoneid = nullptr; char *query = nullptr; char *fragment = nullptr; CURLU *schemehost = nullptr; char *schemehost_url = nullptr; char *portpath_url = nullptr; // Parse the URL, allowing the user to omit the scheme. CURL will use 'https' // by default if no scheme is specified. CURLU *parsed = curl_url(); CURLUcode rc = curl_url_set(parsed, CURLUPART_URL, url.c_str(), CURLU_DEFAULT_SCHEME); if (rc != CURLUE_OK) { errs << "Failed to parse URL: " << curl_url_strerror(rc); goto Exit; } // As we parse the URL with the option CURLU_DEFAULT_SCHEME, the CURL API // won't require the user to provide the scheme part of the URL. It will // automatically default the scheme to https. However, we do not usually want // it to default to HTTPS, but HTTP instead. (In this specific use case of // connecting to a PushGateway server, we assume that the PushGateway server // is available over a trusted network, and only using unsecured HTTP). // // This is why we check if the scheme was put there by CURL and otherwise set // it to HTTP. We also check for any other schemes that the user may have // provided, and reject anything that is not http/https. if (!url.starts_with("http://") && !url.starts_with("https://")) { rc = curl_url_get(parsed, CURLUPART_SCHEME, &scheme, 0); if (rc != CURLUE_OK) { errs << "Could not get scheme from parsed URL: " << curl_url_strerror(rc); goto Exit; } if (strcmp(scheme, "https")) { errs << "Unexpected scheme" << scheme << "in provided URL (expected http or https)"; goto Exit; } rc = curl_url_set(parsed, CURLUPART_SCHEME, "http", 0); if (rc != CURLUE_OK) { errs << "Could not set URL scheme to http: " << curl_url_strerror(rc); goto Exit; } } // Turn the parsed URL back into a string. rc = curl_url_get(parsed, CURLUPART_URL, &processed, 0); if (rc != CURLUE_OK) { errs << "Failed to output parsed URL: " << curl_url_strerror(rc); goto Exit; } // This part of the code checks if no prohibited parts are present in the URL // (basic auth: (user, password), query, fragment). rc = curl_url_get(parsed, CURLUPART_USER, &user, 0); if (rc == CURLUE_OK && strlen(user) != 0) { errs << "Provided URL should not contain a user part"; goto Exit; } else if (rc != CURLUE_NO_USER && rc != CURLUE_OK) { errs << "Failed to get check user part existence in provided url: " << curl_url_strerror(rc); goto Exit; } rc = curl_url_get(parsed, CURLUPART_PASSWORD, &password, 0); if (rc == CURLUE_OK && strlen(password) != 0) { errs << "Provided URL should not contain a password part"; goto Exit; } else if (rc != CURLUE_NO_PASSWORD && rc != CURLUE_OK) { errs << "Failed to get check password part existence in provided url: " << curl_url_strerror(rc); goto Exit; } rc = curl_url_get(parsed, CURLUPART_QUERY, &query, 0); if (rc == CURLUE_OK && strlen(query) != 0) { errs << "Provided URL should not contain a query part"; goto Exit; } else if (rc != CURLUE_NO_QUERY && rc != CURLUE_OK) { errs << "Failed to get check query part existence in provided url: " << curl_url_strerror(rc); goto Exit; } rc = curl_url_get(parsed, CURLUPART_FRAGMENT, &fragment, 0); if (rc == CURLUE_OK && strlen(fragment) != 0) { errs << "Provided URL should not contain a fragment part"; goto Exit; } else if (rc != CURLUE_NO_FRAGMENT && rc != CURLUE_OK) { errs << "Failed to get check fragment part existence in provided url: " << curl_url_strerror(rc); goto Exit; } // Now that we know that the provided URL makes sense, we can start doing // some arts and crafts. We get started by copying the parsed URL into // schemehost and simply delete all parts which are not scheme + host. schemehost = curl_url_dup(parsed); rc = curl_url_set(schemehost, CURLUPART_PORT, nullptr, 0); if (rc != CURLUE_OK) { errs << "Could not unset port in duplicated URL: " << curl_url_strerror(rc); goto Exit; } rc = curl_url_set(schemehost, CURLUPART_PATH, nullptr, 0); if (rc != CURLUE_OK) { errs << "Could not unset path in duplicated URL: " << curl_url_strerror(rc); goto Exit; } // Okay, now we have the schemehost CURLU all ready to go. Note that a URL // only consisting of a scheme and host is considered valid, so CURL will be // more than happy to actually turn it into a string for us. Which is exactly // what we do here :) rc = curl_url_get(schemehost, CURLUPART_URL, &schemehost_url, 0); if (rc != CURLUE_OK) { errs << "Could not get scheme + host URL: " << curl_url_strerror(rc); goto Exit; } // Remove any trailing slash after the scheme + host URL that CURL might have // put there -- we still want to get a valid URL if we paste the port + path // part behind it. if (strlen(schemehost_url) > 0) { if (schemehost_url[strlen(schemehost_url) - 1] != '/') { errs << "Scheme + host URL does not end with a slash"; goto Exit; } schemehost_url[strlen(schemehost_url) - 1] = '\0'; } // Look, this is really gross. Because the port + path part of the URL is not // a valid URL itself, but the scheme + host should be a prefix of the full // URL containing the port + path, we can simply check if it is indeed a // prefix, and then strip it from the full URL, giving us the port + path // (after deleting the colon preceding the port). if (!std::string_view(processed).starts_with(schemehost_url)) { errs << "Scheme + host URL is not a prefix of the processed URL"; goto Exit; } portpath_url = processed + strlen(schemehost_url); // We should not have the colon before the port, prometheus-cpp inserts it if (strlen(portpath_url) > 0 && portpath_url[0] == ':') portpath_url++; // We do not need a trailing slash if (strlen(portpath_url) > 0 && portpath_url[strlen(portpath_url)-1] == '/') portpath_url[strlen(portpath_url)-1] = '\0'; // It has been done. BLECH result = std::make_optional(schemehost_url, portpath_url); Exit: curl_free(processed); curl_free(scheme); curl_free(user); curl_free(password); curl_free(query); curl_free(fragment); curl_free(zoneid); curl_free(schemehost_url); curl_url_cleanup(schemehost); curl_url_cleanup(parsed); if (!result && error) *error = errs.str(); return result; }