Initial commit

Based on dc4ba6a
author: Rutger Broekhoff 2024-05-02 20:27:40 +0200
committer: Rutger Broekhoff 2024-05-02 20:27:40 +0200
commit: 17a3ea880402338420699e03bcb24181e4ff3924 (patch)
tree: da666ef91e0b60d20aa0b01529644c136fd1f4ab /src/bundleparquet/spliturl.cpp
download: oeuf-17a3ea880402338420699e03bcb24181e4ff3924.tar.gz
oeuf-17a3ea880402338420699e03bcb24181e4ff3924.zip
1 files changed, 203 insertions, 0 deletions
diff --git a/src/bundleparquet/spliturl.cpp b/src/bundleparquet/spliturl.cpp
new file mode 100644
index 0000000..90fd821
--- /dev/null
+++ b/src/bundleparquet/spliturl.cpp
@@ -0,0 +1,203 @@
+// vim:set sw=2 ts=2 sts et:
+#include <cstring>
+#include <iostream>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <curl/curl.h>
+#include "spliturl.hpp"
+// splitUrl takes a URL of the shape '[http[s]://]HOST[:PORT][/PATH]', and
+// splits it into two URLs:
+//   - scheme + host -> '[http[s]://]HOST'
+//   - port   + path -> '[PORT][/PATH]'
+// In case an IPv6 address is provided, the host must be enclosed in square
+// brackets. The zone ID may also be indicated. Note that in the resulting
+// parts, the colon preceding the port number is omitted. This is on purpose.
+//
+// Parameters:
+//   url   - the URL to split; the scheme may be omitted, in which case http is
+//           used. Only http and https are accepted, and the URL must not
+//           contain a user, password, query or fragment part.
+//   error - optional (may be null); if non-null and splitting fails, receives a
+//           human-readable description of the failure.
+//
+// Returns the two parts on success, or an empty optional on failure.
+std::optional<SplitUrl> splitUrl(const std::string &url, std::string *error) {
+  std::stringstream errs;
+  std::optional<SplitUrl> result;
+  char   *processed      = nullptr;
+  char   *scheme         = nullptr;
+  char   *user           = nullptr;
+  char   *password       = nullptr;
+  char   *zoneid         = nullptr;
+  char   *query          = nullptr;
+  char   *fragment       = nullptr;
+  CURLU  *schemehost     = nullptr;
+  char   *schemehost_url = nullptr;
+  char   *portpath_url   = nullptr;
+  // Parse the URL, allowing the user to omit the scheme. CURL will use 'https'
+  // by default if no scheme is specified.
+  CURLU *parsed = curl_url();
+  CURLUcode rc = curl_url_set(parsed, CURLUPART_URL, url.c_str(), CURLU_DEFAULT_SCHEME);
+  if (rc != CURLUE_OK) {
+    errs << "Failed to parse URL: " << curl_url_strerror(rc);
+    goto Exit;
+  }
+  // As we parse the URL with the option CURLU_DEFAULT_SCHEME, the CURL API
+  // won't require the user to provide the scheme part of the URL. It will
+  // automatically default the scheme to https. However, we do not usually want
+  // it to default to HTTPS, but HTTP instead (as the use case, connecting to a
+  // PushGateway server, usually is served over a private network via HTTP).
+  //
+  // This is why we check if the scheme was put there by CURL and otherwise set
+  // it to HTTP. We also check for any other schemes that the user may have
+  // provided, and reject anything that is not http/https.
+  //
+  // The scheme prefix is matched case-insensitively (URL schemes are not
+  // case-sensitive), so that e.g. 'HTTPS://host' is recognised as an explicit
+  // scheme and is not downgraded to HTTP below.
+  if (!curl_strnequal(url.c_str(), "http://", 7) &&
+      !curl_strnequal(url.c_str(), "https://", 8)) {
+    rc = curl_url_get(parsed, CURLUPART_SCHEME, &scheme, 0);
+    if (rc != CURLUE_OK) {
+      errs << "Could not get scheme from parsed URL: " << curl_url_strerror(rc);
+      goto Exit;
+    }
+    // Anything other than CURL's default 'https' here means that the user
+    // supplied some other scheme explicitly.
+    if (strcmp(scheme, "https") != 0) {
+      errs << "Unexpected scheme '" << scheme << "' in provided URL (expected http or https)";
+      goto Exit;
+    }
+    rc = curl_url_set(parsed, CURLUPART_SCHEME, "http", 0);
+    if (rc != CURLUE_OK) {
+      errs << "Could not set URL scheme to http: " << curl_url_strerror(rc);
+      goto Exit;
+    }
+  }
+  // Turn the parsed URL back into a string.
+  rc = curl_url_get(parsed, CURLUPART_URL, &processed, 0);
+  if (rc != CURLUE_OK) {
+    errs << "Failed to output parsed URL: " << curl_url_strerror(rc);
+    goto Exit;
+  }
+  // This part of the code checks if no prohibited parts are present in the URL
+  // (basic auth: (user, password), query, fragment).
+  rc = curl_url_get(parsed, CURLUPART_USER, &user, 0);
+  if (rc == CURLUE_OK && strlen(user) != 0) {
+    errs << "Provided URL should not contain a user part";
+    goto Exit;
+  } else if (rc != CURLUE_NO_USER && rc != CURLUE_OK) {
+    errs << "Failed to get check user part existence in provided url: " << curl_url_strerror(rc);
+    goto Exit;
+  }
+  rc = curl_url_get(parsed, CURLUPART_PASSWORD, &password, 0);
+  if (rc == CURLUE_OK && strlen(password) != 0) {
+    errs << "Provided URL should not contain a password part";
+    goto Exit;
+  } else if (rc != CURLUE_NO_PASSWORD && rc != CURLUE_OK) {
+    errs << "Failed to get check password part existence in provided url: " << curl_url_strerror(rc);
+    goto Exit;
+  }
+  rc = curl_url_get(parsed, CURLUPART_QUERY, &query, 0);
+  if (rc == CURLUE_OK && strlen(query) != 0) {
+    errs << "Provided URL should not contain a query part";
+    goto Exit;
+  } else if (rc != CURLUE_NO_QUERY && rc != CURLUE_OK) {
+    errs << "Failed to get check query part existence in provided url: " << curl_url_strerror(rc);
+    goto Exit;
+  }
+  rc = curl_url_get(parsed, CURLUPART_FRAGMENT, &fragment, 0);
+  if (rc == CURLUE_OK && strlen(fragment) != 0) {
+    errs << "Provided URL should not contain a fragment part";
+    goto Exit;
+  } else if (rc != CURLUE_NO_FRAGMENT && rc != CURLUE_OK) {
+    errs << "Failed to get check fragment part existence in provided url: " << curl_url_strerror(rc);
+    goto Exit;
+  }
+  // Now that we know that the provided URL makes sense, we can start doing
+  // some arts and crafts. We get started by copying the parsed URL into
+  // schemehost and simply delete all parts which are not scheme + host.
+  schemehost = curl_url_dup(parsed);
+  // CURL BUG WORKAROUND: CURLUPART_ZONEID is NOT copied by curl_url_dup!
+  //      ^ fixed in CURL 8.3.0 after https://curl.se/mail/lib-2023-07/0047.html
+  rc = curl_url_get(parsed, CURLUPART_ZONEID, &zoneid, 0);
+  if (rc == CURLUE_OK) {
+    rc = curl_url_set(schemehost, CURLUPART_ZONEID, zoneid, 0);
+    if (rc != CURLUE_OK) {
+      errs << "Could not copy zone ID to duplicated URL: " << curl_url_strerror(rc);
+      goto Exit;
+    }
+  }
+  rc = curl_url_set(schemehost, CURLUPART_PORT, nullptr, 0);
+  if (rc != CURLUE_OK) {
+    errs << "Could not unset port in duplicated URL: " << curl_url_strerror(rc);
+    goto Exit;
+  }
+  rc = curl_url_set(schemehost, CURLUPART_PATH, nullptr, 0);
+  if (rc != CURLUE_OK) {
+    errs << "Could not unset path in duplicated URL: " << curl_url_strerror(rc);
+    goto Exit;
+  }
+  // Okay, now we have the schemehost CURLU all ready to go. Note that a URL
+  // only consisting of a scheme and host is considered valid, so CURL will be
+  // more than happy to actually turn it into a string for us. Which is exactly
+  // what we do here :)
+  rc = curl_url_get(schemehost, CURLUPART_URL, &schemehost_url, 0);
+  if (rc != CURLUE_OK) {
+    errs << "Could not get scheme + host URL: " << curl_url_strerror(rc);
+    goto Exit;
+  }
+  // Remove the trailing slash after the scheme + host URL (a non-empty URL
+  // without one is unexpected and treated as an error) -- we still want to get
+  // a valid URL if we paste the port + path part behind it.
+  if (strlen(schemehost_url) > 0) {
+    if (schemehost_url[strlen(schemehost_url) - 1] != '/') {
+      errs << "Scheme + host URL does not end with a slash";
+      goto Exit;
+    }
+    schemehost_url[strlen(schemehost_url) - 1] = '\0';
+  }
+  // Look, this is really gross. Because the port + path part of the URL is not
+  // a valid URL itself, but the scheme + host should be a prefix of the full
+  // URL containing the port + path, we can simply check if it is indeed a
+  // prefix, and then strip it from the full URL, giving us the port + path
+  // (after deleting the colon preceding the port).
+  if (!std::string_view(processed).starts_with(schemehost_url)) {
+    errs << "Scheme + host URL is not a prefix of the processed URL";
+    goto Exit;
+  }
+  // portpath_url points into the 'processed' buffer; it is not a separate
+  // allocation and must not be freed on its own.
+  portpath_url = processed + strlen(schemehost_url);
+  // We should not have the colon before the port, prometheus-cpp inserts it
+  if (strlen(portpath_url) > 0 && portpath_url[0] == ':') portpath_url++;
+  // We do not need a trailing slash
+  if (strlen(portpath_url) > 0 && portpath_url[strlen(portpath_url)-1] == '/')
+    portpath_url[strlen(portpath_url)-1] = '\0';
+  // It has been done. BLECH
+  result = std::make_optional<SplitUrl>(schemehost_url, portpath_url);
+Exit:
+  // Shared cleanup for every exit path; pointers never assigned are still null.
+  curl_free(processed);
+  curl_free(scheme);
+  curl_free(user);
+  curl_free(password);
+  curl_free(query);
+  curl_free(fragment);
+  curl_free(zoneid);
+  curl_free(schemehost_url);
+  curl_url_cleanup(schemehost);
+  curl_url_cleanup(parsed);
+  if (!result && error)
+    *error = errs.str();
+  return result;
+}
author	Rutger Broekhoff	2024-05-02 20:27:40 +0200
committer	Rutger Broekhoff	2024-05-02 20:27:40 +0200
commit	17a3ea880402338420699e03bcb24181e4ff3924 (patch)
tree	da666ef91e0b60d20aa0b01529644c136fd1f4ab /src/bundleparquet/spliturl.cpp
download	oeuf-17a3ea880402338420699e03bcb24181e4ff3924.tar.gz oeuf-17a3ea880402338420699e03bcb24181e4ff3924.zip

diff --git a/src/bundleparquet/spliturl.cpp b/src/bundleparquet/spliturl.cpp new file mode 100644 index 0000000..90fd821 --- /dev/null +++ b/src/bundleparquet/spliturl.cpp
@@ -0,0 +1,203 @@
	1	// vim:set sw=2 ts=2 sts et:
	2
	3	#include <cstring>
	4	#include <iostream>
	5	#include <optional>
	6	#include <sstream>
	7	#include <string>
	8
	9	#include <curl/curl.h>
	10
	11	#include "spliturl.hpp"
	12
	13	// splitUrl takes a URL of the shape '[http[s]://]HOST[:PORT][/PATH]', and
	14	// splits it into two URLs:
	15	//   - scheme + host -> '[http[s]://]HOST'
	16	//   - port   + path -> '[PORT][/PATH]'
	17	// In case an IPv6 address is provided, the host must be enclosed in square
	18	// brackets. The zone ID may also be indicated. Note that in the resulting
	19	// parts, the colon preceding the port number is omitted. This is on purpose.
	20	//
	21	// Parameters:
	22	//   url   - the URL to split; the scheme may be omitted, in which case http is
	23	//           used. Only http and https are accepted, and the URL must not
	24	//           contain a user, password, query or fragment part.
	25	//   error - optional (may be null); if non-null and splitting fails, receives a
	26	//           human-readable description of the failure.
	27	//
	28	// Returns the two parts on success, or an empty optional on failure.
	29	std::optional<SplitUrl> splitUrl(const std::string &url, std::string *error) {
	30	  std::stringstream errs;
	31	  std::optional<SplitUrl> result;
	32	  char   *processed      = nullptr;
	33	  char   *scheme         = nullptr;
	34	  char   *user           = nullptr;
	35	  char   *password       = nullptr;
	36	  char   *zoneid         = nullptr;
	37	  char   *query          = nullptr;
	38	  char   *fragment       = nullptr;
	39	  CURLU  *schemehost     = nullptr;
	40	  char   *schemehost_url = nullptr;
	41	  char   *portpath_url   = nullptr;
	42
	43	  // Parse the URL, allowing the user to omit the scheme. CURL will use 'https'
	44	  // by default if no scheme is specified.
	45
	46	  CURLU *parsed = curl_url();
	47	  CURLUcode rc = curl_url_set(parsed, CURLUPART_URL, url.c_str(), CURLU_DEFAULT_SCHEME);
	48	  if (rc != CURLUE_OK) {
	49	    errs << "Failed to parse URL: " << curl_url_strerror(rc);
	50	    goto Exit;
	51	  }
	52
	53	  // As we parse the URL with the option CURLU_DEFAULT_SCHEME, the CURL API
	54	  // won't require the user to provide the scheme part of the URL. It will
	55	  // automatically default the scheme to https. However, we do not usually want
	56	  // it to default to HTTPS, but HTTP instead (as the use case, connecting to a
	57	  // PushGateway server, usually is served over a private network via HTTP).
	58	  //
	59	  // This is why we check if the scheme was put there by CURL and otherwise set
	60	  // it to HTTP. We also check for any other schemes that the user may have
	61	  // provided, and reject anything that is not http/https.
	62	  //
	63	  // The scheme prefix is matched case-insensitively (URL schemes are not
	64	  // case-sensitive), so that e.g. 'HTTPS://host' is recognised as an explicit
	65	  // scheme and is not downgraded to HTTP below.
	66	  if (!curl_strnequal(url.c_str(), "http://", 7) &&
	67	      !curl_strnequal(url.c_str(), "https://", 8)) {
	68	    rc = curl_url_get(parsed, CURLUPART_SCHEME, &scheme, 0);
	69	    if (rc != CURLUE_OK) {
	70	      errs << "Could not get scheme from parsed URL: " << curl_url_strerror(rc);
	71	      goto Exit;
	72	    }
	73	    // Anything other than CURL's default 'https' here means that the user
	74	    // supplied some other scheme explicitly.
	75	    if (strcmp(scheme, "https") != 0) {
	76	      errs << "Unexpected scheme '" << scheme << "' in provided URL (expected http or https)";
	77	      goto Exit;
	78	    }
	79	    rc = curl_url_set(parsed, CURLUPART_SCHEME, "http", 0);
	80	    if (rc != CURLUE_OK) {
	81	      errs << "Could not set URL scheme to http: " << curl_url_strerror(rc);
	82	      goto Exit;
	83	    }
	84	  }
	85
	86	  // Turn the parsed URL back into a string.
	87	  rc = curl_url_get(parsed, CURLUPART_URL, &processed, 0);
	88	  if (rc != CURLUE_OK) {
	89	    errs << "Failed to output parsed URL: " << curl_url_strerror(rc);
	90	    goto Exit;
	91	  }
	92
	93	  // This part of the code checks if no prohibited parts are present in the URL
	94	  // (basic auth: (user, password), query, fragment).
	95
	96	  rc = curl_url_get(parsed, CURLUPART_USER, &user, 0);
	97	  if (rc == CURLUE_OK && strlen(user) != 0) {
	98	    errs << "Provided URL should not contain a user part";
	99	    goto Exit;
	100	  } else if (rc != CURLUE_NO_USER && rc != CURLUE_OK) {
	101	    errs << "Failed to get check user part existence in provided url: " << curl_url_strerror(rc);
	102	    goto Exit;
	103	  }
	104
	105	  rc = curl_url_get(parsed, CURLUPART_PASSWORD, &password, 0);
	106	  if (rc == CURLUE_OK && strlen(password) != 0) {
	107	    errs << "Provided URL should not contain a password part";
	108	    goto Exit;
	109	  } else if (rc != CURLUE_NO_PASSWORD && rc != CURLUE_OK) {
	110	    errs << "Failed to get check password part existence in provided url: " << curl_url_strerror(rc);
	111	    goto Exit;
	112	  }
	113
	114	  rc = curl_url_get(parsed, CURLUPART_QUERY, &query, 0);
	115	  if (rc == CURLUE_OK && strlen(query) != 0) {
	116	    errs << "Provided URL should not contain a query part";
	117	    goto Exit;
	118	  } else if (rc != CURLUE_NO_QUERY && rc != CURLUE_OK) {
	119	    errs << "Failed to get check query part existence in provided url: " << curl_url_strerror(rc);
	120	    goto Exit;
	121	  }
	122
	123	  rc = curl_url_get(parsed, CURLUPART_FRAGMENT, &fragment, 0);
	124	  if (rc == CURLUE_OK && strlen(fragment) != 0) {
	125	    errs << "Provided URL should not contain a fragment part";
	126	    goto Exit;
	127	  } else if (rc != CURLUE_NO_FRAGMENT && rc != CURLUE_OK) {
	128	    errs << "Failed to get check fragment part existence in provided url: " << curl_url_strerror(rc);
	129	    goto Exit;
	130	  }
	131
	132	  // Now that we know that the provided URL makes sense, we can start doing
	133	  // some arts and crafts. We get started by copying the parsed URL into
	134	  // schemehost and simply delete all parts which are not scheme + host.
	135
	136	  schemehost = curl_url_dup(parsed);
	137
	138	  // CURL BUG WORKAROUND: CURLUPART_ZONEID is NOT copied by curl_url_dup!
	139	  //      ^ fixed in CURL 8.3.0 after https://curl.se/mail/lib-2023-07/0047.html
	140	  rc = curl_url_get(parsed, CURLUPART_ZONEID, &zoneid, 0);
	141	  if (rc == CURLUE_OK) {
	142	    rc = curl_url_set(schemehost, CURLUPART_ZONEID, zoneid, 0);
	143	    if (rc != CURLUE_OK) {
	144	      errs << "Could not copy zone ID to duplicated URL: " << curl_url_strerror(rc);
	145	      goto Exit;
	146	    }
	147	  }
	148	  rc = curl_url_set(schemehost, CURLUPART_PORT, nullptr, 0);
	149	  if (rc != CURLUE_OK) {
	150	    errs << "Could not unset port in duplicated URL: " << curl_url_strerror(rc);
	151	    goto Exit;
	152	  }
	153	  rc = curl_url_set(schemehost, CURLUPART_PATH, nullptr, 0);
	154	  if (rc != CURLUE_OK) {
	155	    errs << "Could not unset path in duplicated URL: " << curl_url_strerror(rc);
	156	    goto Exit;
	157	  }
	158
	159	  // Okay, now we have the schemehost CURLU all ready to go. Note that a URL
	160	  // only consisting of a scheme and host is considered valid, so CURL will be
	161	  // more than happy to actually turn it into a string for us. Which is exactly
	162	  // what we do here :)
	163
	164	  rc = curl_url_get(schemehost, CURLUPART_URL, &schemehost_url, 0);
	165	  if (rc != CURLUE_OK) {
	166	    errs << "Could not get scheme + host URL: " << curl_url_strerror(rc);
	167	    goto Exit;
	168	  }
	169
	170	  // Remove the trailing slash after the scheme + host URL (a non-empty URL
	171	  // without one is unexpected and treated as an error) -- we still want to get
	172	  // a valid URL if we paste the port + path part behind it.
	173
	174	  if (strlen(schemehost_url) > 0) {
	175	    if (schemehost_url[strlen(schemehost_url) - 1] != '/') {
	176	      errs << "Scheme + host URL does not end with a slash";
	177	      goto Exit;
	178	    }
	179	    schemehost_url[strlen(schemehost_url) - 1] = '\0';
	180	  }
	181
	182	  // Look, this is really gross. Because the port + path part of the URL is not
	183	  // a valid URL itself, but the scheme + host should be a prefix of the full
	184	  // URL containing the port + path, we can simply check if it is indeed a
	185	  // prefix, and then strip it from the full URL, giving us the port + path
	186	  // (after deleting the colon preceding the port).
	187
	188	  if (!std::string_view(processed).starts_with(schemehost_url)) {
	189	    errs << "Scheme + host URL is not a prefix of the processed URL";
	190	    goto Exit;
	191	  }
	192
	193	  portpath_url = processed + strlen(schemehost_url);
	194	  // We should not have the colon before the port, prometheus-cpp inserts it
	195	  if (strlen(portpath_url) > 0 && portpath_url[0] == ':') portpath_url++;
	196	  // We do not need a trailing slash
	197	  if (strlen(portpath_url) > 0 && portpath_url[strlen(portpath_url)-1] == '/')
	198	    portpath_url[strlen(portpath_url)-1] = '\0';
	199
	200	  // It has been done. BLECH
	201	  result = std::make_optional<SplitUrl>(schemehost_url, portpath_url);
	202
	203	Exit:
	204	  // Shared cleanup for every exit path; pointers never assigned are still null.
	205	  curl_free(processed);
	206	  curl_free(scheme);
	207	  curl_free(user);
	208	  curl_free(password);
	209	  curl_free(query);
	210	  curl_free(fragment);
	211	  curl_free(zoneid);
	212	  curl_free(schemehost_url);
	213	  curl_url_cleanup(schemehost);
	214	  curl_url_cleanup(parsed);
	215
	216	  if (!result && error)
	217	    *error = errs.str();
	218
	219	  return result;
	220	}