// oeuf - Playground for public transit data

// vim:set sw=2 ts=2 sts et:
//
// Copyright 2024 Rutger Broekhoff. Licensed under the EUPL.

#include <cstring>
#include <iostream>
#include <optional>
#include <sstream>
#include <string>

#include <curl/curl.h>

#include "spliturl.hpp"

// splitUrl takes a URL of the shape '[http[s]://]HOST[:PORT][/PATH]', and
// splits it into two URLs:
//   - scheme + host -> '[http[s]://]HOST'
//   - port   + path -> '[PORT][/PATH]'
// In case an IPv6 address is provided, the host must be enclosed in square
// brackets. The zone ID may also be indicated. Note that in the resulting
// parts, the colon preceding the port number is omitted. This is on purpose.
std::optional<SplitUrl> splitUrl(const std::string &url, std::string *error) {
  std::stringstream errs;
  std::optional<SplitUrl> result;
  char   *processed      = nullptr;
  char   *scheme         = nullptr;
  char   *user           = nullptr;
  char   *password       = nullptr;
  char   *zoneid         = nullptr;
  char   *query          = nullptr;
  char   *fragment       = nullptr;
  CURLU  *schemehost     = nullptr;
  char   *schemehost_url = nullptr;
  char   *portpath_url   = nullptr;

  // Parse the URL, allowing the user to omit the scheme. CURL will use 'https'
  // by default if no scheme is specified.

  CURLU *parsed = curl_url();
  CURLUcode rc = curl_url_set(parsed, CURLUPART_URL, url.c_str(), CURLU_DEFAULT_SCHEME);
  if (rc != CURLUE_OK) {
    errs << "Failed to parse URL: " << curl_url_strerror(rc);
    goto Exit;
  }

  // As we parse the URL with the option CURLU_DEFAULT_SCHEME, the CURL API
  // won't require the user to provide the scheme part of the URL. It will
  // automatically default the scheme to https. However, we do not usually want
  // it to default to HTTPS, but HTTP instead. (In this specific use case of
  // connecting to a PushGateway server, we assume that the PushGateway server
  // is available over a trusted network, and only using unsecured HTTP).
  // 
  // This is why we check if the scheme was put there by CURL and otherwise set
  // it to HTTP. We also check for any other schemes that the user may have
  // provided, and reject anything that is not http/https.
  if (!url.starts_with("http://") && !url.starts_with("https://")) {
    rc = curl_url_get(parsed, CURLUPART_SCHEME, &scheme, 0);
    if (rc != CURLUE_OK) {
      errs << "Could not get scheme from parsed URL: " << curl_url_strerror(rc);
      goto Exit;
    }
    if (strcmp(scheme, "https")) {
      errs << "Unexpected scheme" << scheme << "in provided URL (expected http or https)";
      goto Exit;
    }
    rc = curl_url_set(parsed, CURLUPART_SCHEME, "http", 0);
    if (rc != CURLUE_OK) {
      errs << "Could not set URL scheme to http: " << curl_url_strerror(rc);
      goto Exit;
    }
  }

  // Turn the parsed URL back into a string.
  rc = curl_url_get(parsed, CURLUPART_URL, &processed, 0);
  if (rc != CURLUE_OK) {
    errs << "Failed to output parsed URL: " << curl_url_strerror(rc);
    goto Exit;
  }

  // This part of the code checks if no prohibited parts are present in the URL
  // (basic auth: (user, password), query, fragment).

  rc = curl_url_get(parsed, CURLUPART_USER, &user, 0);
  if (rc == CURLUE_OK && strlen(user) != 0) {
    errs << "Provided URL should not contain a user part";
    goto Exit;
  } else if (rc != CURLUE_NO_USER && rc != CURLUE_OK) {
    errs << "Failed to get check user part existence in provided url: " << curl_url_strerror(rc);
    goto Exit;
  }

  rc = curl_url_get(parsed, CURLUPART_PASSWORD, &password, 0);
  if (rc == CURLUE_OK && strlen(password) != 0) {
    errs << "Provided URL should not contain a password part";
    goto Exit;
  } else if (rc != CURLUE_NO_PASSWORD && rc != CURLUE_OK) {
    errs << "Failed to get check password part existence in provided url: " << curl_url_strerror(rc);
    goto Exit;
  }

  rc = curl_url_get(parsed, CURLUPART_QUERY, &query, 0);
  if (rc == CURLUE_OK && strlen(query) != 0) {
    errs << "Provided URL should not contain a query part";
    goto Exit;
  } else if (rc != CURLUE_NO_QUERY && rc != CURLUE_OK) {
    errs << "Failed to get check query part existence in provided url: " << curl_url_strerror(rc);
    goto Exit;
  }

  rc = curl_url_get(parsed, CURLUPART_FRAGMENT, &fragment, 0);
  if (rc == CURLUE_OK && strlen(fragment) != 0) {
    errs << "Provided URL should not contain a fragment part";
    goto Exit;
  } else if (rc != CURLUE_NO_FRAGMENT && rc != CURLUE_OK) {
    errs << "Failed to get check fragment part existence in provided url: " << curl_url_strerror(rc);
    goto Exit;
  }

  // Now that we know that the provided URL makes sense, we can start doing
  // some arts and crafts. We get started by copying the parsed URL into
  // schemehost and simply delete all parts which are not scheme + host.

  schemehost = curl_url_dup(parsed);

  rc = curl_url_set(schemehost, CURLUPART_PORT, nullptr, 0);
  if (rc != CURLUE_OK) {
    errs << "Could not unset port in duplicated URL: " << curl_url_strerror(rc);
    goto Exit;
  }
  rc = curl_url_set(schemehost, CURLUPART_PATH, nullptr, 0);
  if (rc != CURLUE_OK) {
    errs << "Could not unset path in duplicated URL: " << curl_url_strerror(rc);
    goto Exit;
  }

  // Okay, now we have the schemehost CURLU all ready to go. Note that a URL
  // only consisting of a scheme and host is considered valid, so CURL will be
  // more than happy to actually turn it into a string for us. Which is exactly
  // what we do here :)

  rc = curl_url_get(schemehost, CURLUPART_URL, &schemehost_url, 0);
  if (rc != CURLUE_OK) {
    errs << "Could not get scheme + host URL: " << curl_url_strerror(rc);
    goto Exit;
  }

  // Remove any trailing slash after the scheme + host URL that CURL might have
  // put there -- we still want to get a valid URL if we paste the port + path
  // part behind it.

  if (strlen(schemehost_url) > 0) {
    if (schemehost_url[strlen(schemehost_url) - 1] != '/') {
      errs << "Scheme + host URL does not end with a slash";
      goto Exit;
    }
    schemehost_url[strlen(schemehost_url) - 1] = '\0';
  }

  // Look, this is really gross. Because the port + path part of the URL is not
  // a valid URL itself, but the scheme + host should be a prefix of the full
  // URL containing the port + path, we can simply check if it is indeed a
  // prefix, and then strip it from the full URL, giving us the port + path
  // (after deleting the colon preceding the port).

  if (!std::string_view(processed).starts_with(schemehost_url)) {
    errs << "Scheme + host URL is not a prefix of the processed URL";
    goto Exit;
  }

  portpath_url = processed + strlen(schemehost_url);
  // We should not have the colon before the port, prometheus-cpp inserts it
  if (strlen(portpath_url) > 0 && portpath_url[0] == ':') portpath_url++;
  // We do not need a trailing slash
  if (strlen(portpath_url) > 0 && portpath_url[strlen(portpath_url)-1] == '/')
    portpath_url[strlen(portpath_url)-1] = '\0';

  // It has been done. BLECH
  result = std::make_optional<SplitUrl>(schemehost_url, portpath_url);

Exit:
  curl_free(processed);
  curl_free(scheme);
  curl_free(user);
  curl_free(password);
  curl_free(query);
  curl_free(fragment);
  curl_free(zoneid);
  curl_free(schemehost_url);
  curl_url_cleanup(schemehost);
  curl_url_cleanup(parsed);

  if (!result && error)
    *error = errs.str();

  return result;
}