Skip to content
254 changes: 249 additions & 5 deletions cpp/include/kvikio/detail/url.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ class UrlParser {
*
* @return UrlComponents structure containing the parsed URL components
*
* @throw std::runtime_error if the URL cannot be parsed or if component extraction fails
* @exception std::runtime_error if the URL cannot be parsed or if component extraction fails
*
* Example:
* @code{.cpp}
Expand All @@ -151,9 +151,10 @@ class UrlParser {
* CURLU_URLDECODE
* );
*
* // Allow non-standard schemes
* // Allow non-standard schemes, i.e. schemes not registered with Internet Assigned Numbers
* // Authority (IANA), such as AWS S3
* auto custom = UrlParser::parse(
* "myscheme://example.com",
* "s3://my-bucket/my-object.bin",
* CURLU_NON_SUPPORT_SCHEME
* );
* @endcode
Expand All @@ -170,7 +171,7 @@ class UrlParser {
* @param bitmask_component_flags Flags controlling extraction behavior
* @param allowed_err_code Optional error code to treat as valid (e.g., CURLUE_NO_SCHEME)
* @return The extracted component as a string, or std::nullopt if not present
* @throw std::runtime_error if extraction fails with an unexpected error
* @exception std::runtime_error if extraction fails with an unexpected error
*/
static std::optional<std::string> extract_component(
CurlUrlHandle const& handle,
Expand All @@ -187,7 +188,7 @@ class UrlParser {
* @param bitmask_component_flags Flags controlling extraction behavior
* @param allowed_err_code Optional error code to treat as valid
* @return The extracted component as a string, or std::nullopt if not present
* @throw std::runtime_error if extraction fails with an unexpected error
* @exception std::runtime_error if extraction fails with an unexpected error
*/
static std::optional<std::string> extract_component(
std::string const& url,
Expand All @@ -196,4 +197,247 @@ class UrlParser {
std::optional<unsigned int> bitmask_component_flags = std::nullopt,
std::optional<CURLUcode> allowed_err_code = std::nullopt);
};

/**
* @brief URL builder utility using libcurl's URL API
*
* This class provides methods for constructing and modifying URLs by setting individual components
* (scheme, host, port, path, query, fragment). Every public setter returns a reference to the
* builder so that calls can be chained and finished with build().
*
* @note This class uses libcurl's URL parsing which follows "RFC 3986 plus". See
* https://curl.se/docs/url-syntax.html
*
* Example:
* @code{.cpp}
* // Build from scratch
* auto url = UrlBuilder()
* .set_scheme("https")
* .set_host("witcher4.com")
* .set_path("/ciri")
* .set_query("occupation=witcher")
* .build();
*
* // Modify existing URL
* auto modified = UrlBuilder("https://witcher4.com/old/path/to/bestiary")
* .set_path("/new/path/to/bestiary")
* .set_port("8080")
* .build();
* @endcode
*/
class UrlBuilder {
private:
// libcurl URL handle that holds the components set so far.
CurlUrlHandle _handle;

/**
* @brief Internal helper to set a URL component
*
* Shared by the public `set_*` methods, which differ only in the URL part they target.
*
* @param part The URL part to set
* @param value The value to set. Use `nullptr` to clear
* @param flags Optional flags for the operation
* @return Reference to this builder for chaining
* @exception std::runtime_error if the operation fails
*/
UrlBuilder& set_component(CURLUPart part,
char const* value,
std::optional<unsigned int> flags = std::nullopt);

public:
/**
* @brief Construct an empty URL builder
*
* No component is set; populate the URL with the `set_*` methods before calling build().
*
* @exception std::runtime_error if initialization fails
*/
explicit UrlBuilder();

/**
* @brief Construct a URL builder from an existing URL string
*
* @param url The URL string to start with
* @param bitmask_url_flags Optional flags for URL parsing. Common flags include:
* - CURLU_DEFAULT_SCHEME: Allows URLs without schemes
* - CURLU_NON_SUPPORT_SCHEME: Accept non-supported schemes
* - CURLU_URLENCODE: URL encode the path
* @exception std::runtime_error if the URL cannot be parsed
*/
explicit UrlBuilder(std::string const& url,
std::optional<unsigned int> bitmask_url_flags = std::nullopt);

/**
* @brief Construct a URL builder from parsed URL components
*
* @param components The parsed URL components to start with
* @param bitmask_url_flags Optional flags for URL handling
* @exception std::runtime_error if the components cannot be set
*/
explicit UrlBuilder(UrlParser::UrlComponents const& components,
std::optional<unsigned int> bitmask_url_flags = std::nullopt);

/**
* @brief Set the URL scheme (e.g., "http", "https", "ftp")
*
* @param scheme The scheme to set. Use `std::nullopt` to clear
* @return Reference to this builder for chaining
* @exception std::runtime_error if the scheme is invalid
*
* Example:
* @code{.cpp}
* builder.set_scheme("https");
* @endcode
*/
UrlBuilder& set_scheme(std::optional<std::string> const& scheme);

/**
* @brief Set the hostname or IP address
*
* @param host The host to set. Use `std::nullopt` to clear
* @return Reference to this builder for chaining
* @exception std::runtime_error if the host is invalid
*
* Example:
* @code{.cpp}
* builder.set_host("api.example.com");
* @endcode
*/
UrlBuilder& set_host(std::optional<std::string> const& host);

/**
* @brief Set the port number
*
* @param port The port to set, given as a string (e.g. "8080"). Use `std::nullopt` to clear
* @return Reference to this builder for chaining
* @exception std::runtime_error if the port is invalid
*
* Example:
* @code{.cpp}
* builder.set_port("8080");
* @endcode
*/
UrlBuilder& set_port(std::optional<std::string> const& port);

/**
* @brief Set the path component
*
* @param path The path to set (should start with "/" for absolute paths). Use `std::nullopt` to
* clear
* @return Reference to this builder for chaining
* @exception std::runtime_error if the path is invalid
*
* Example:
* @code{.cpp}
* builder.set_path("/api/v1/users");
* @endcode
*/
UrlBuilder& set_path(std::optional<std::string> const& path);

/**
* @brief Set the entire query string
*
* The query is passed as one string; multiple key-value pairs must already be joined by the
* caller (e.g. with "&").
*
* @param query The query string (without leading "?"). Use `std::nullopt` to clear
* @return Reference to this builder for chaining
* @exception std::runtime_error if the query is invalid
*
* Example:
* @code{.cpp}
* builder.set_query("page=1&limit=10");
* @endcode
*/
UrlBuilder& set_query(std::optional<std::string> const& query);

/**
* @brief Set the fragment identifier
*
* @param fragment The fragment (without leading "#"). Use `std::nullopt` to clear
* @return Reference to this builder for chaining
* @exception std::runtime_error if the fragment is invalid
*
* Example:
* @code{.cpp}
* builder.set_fragment("section-2");
* @endcode
*/
UrlBuilder& set_fragment(std::optional<std::string> const& fragment);

/**
* @brief Build the final URL string
*
* The builder is not modified, so build() may be called repeatedly, also between setter calls.
*
* @param bitmask_component_flags Optional flags for URL formatting. Common flags:
* - CURLU_PUNYCODE: Convert host to punycode if needed
* - CURLU_NO_DEFAULT_PORT: Omit the port if it matches the default port for the scheme
* - CURLU_DEFAULT_PORT: Include the scheme's default port if no port has been set
* @return The complete URL string
* @exception std::runtime_error if the URL cannot be built
*
* Example:
* @code{.cpp}
* std::string url = builder.build();
* @endcode
*/
std::string build(std::optional<unsigned int> bitmask_component_flags = std::nullopt) const;

/**
* @brief Build a URL string directly from parsed URL components
*
* Static alternative to build(): it needs no UrlBuilder instance and takes no formatting flags.
*
* NOTE(review): the implementation is not visible in this header. Presumably the URL is assembled
* by plain string concatenation of the components, without libcurl validation or normalization --
* confirm against the definition, including which components are required and whether it throws.
*
* @param components The parsed URL components to assemble
* @return The assembled URL string
*/
static std::string build_manually(UrlParser::UrlComponents const& components);
};

/**
* @brief Provides URL encoding functionality
*
* The AWS object naming documentation
* (https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html) lists several
* types of special characters. In practice, handling them with libcurl is complex, as described
* below.
*
* - Special characters that are safe for use in key names: "!-_.*'()". KvikIO includes !*'() in
* `aws_special_chars`, because for private buckets they cause AWS authentication by libcurl to
* fail.
*
* - Characters that might require special handling: "&$@=;/:+ ,? and 0-31, 127 ASCII
* characters". For /, KvikIO does not include it in `aws_special_chars`, because it can be legally
* used as a path separator. For the space character and ?, although KvikIO has them in
* `aws_special_chars`, users must manually percent-encode them to %20 and %3F, respectively.
* Otherwise, the space character will be considered malformed by libcurl, and ? causes ambiguity
* with the query string. For the control characters, KvikIO includes them all in
* `aws_special_chars`.
*
* - Characters to avoid: "\{^}%`]">[~<#| and 128-255 non-ASCII characters". KvikIO recommends
* that users avoid these characters in the URL. They are not included in `aws_special_chars`.
*
*/
class UrlEncoder {
public:
/**
* @brief Default set of special characters requiring encoding in AWS URLs
*
* Contents: the punctuation characters !*'()&$@=;:+,? and the space character, followed by the
* ASCII control characters 0x00-0x1F and 0x7F.
*
* @note This is a plain character array, not a C string: it contains an embedded '\x00' and has
* no terminating null character. Always pair it with an explicit length, e.g.
* `std::string_view{aws_special_chars, sizeof(aws_special_chars)}`.
*/
static constexpr char aws_special_chars[] = {
'!', '*', '\'', '(', ')', '&', '$', '@', '=', ';', ':', '+',
' ', ',', '?', '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08',
'\x09', '\x0A', '\x0B', '\x0C', '\x0D', '\x0E', '\x0F', '\x10', '\x11', '\x12', '\x13', '\x14',
'\x15', '\x16', '\x17', '\x18', '\x19', '\x1A', '\x1B', '\x1C', '\x1D', '\x1E', '\x1F', '\x7F'};

/**
* @brief Percent-encodes specified characters in a URL path
*
* Performs percent-encoding (RFC 3986) on a given path string, encoding only the characters
* specified in the chars_to_encode parameter. Each encoded character is replaced with its
* percent-encoded equivalent (%XX where XX is the hexadecimal representation of the character).
*
* Only ASCII characters (0-127) are supported for encoding. Non-ASCII characters in
* chars_to_encode will be encoded to an empty string. Characters not in chars_to_encode are
* passed through unchanged.
*
* @param path The path string to encode
* @param chars_to_encode Set of characters that should be encoded (defaults to aws_special_chars).
* The default is built with an explicit length so that the embedded '\x00' in aws_special_chars
* does not truncate the set.
*
* @return A new string with specified characters percent-encoded
*
* @code{.cpp}
* // Example usage with default AWS special characters
* std::string encoded = UrlEncoder::encode_path("/path/ with spaces");
* // Result: "/path/%20with%20spaces"
*
* // Example with custom character set
* std::string encoded_custom = UrlEncoder::encode_path("hello/world", "/");
* // Result: "hello%2Fworld"
* @endcode
*/
static std::string encode_path(std::string_view path,
std::string_view chars_to_encode = std::string_view{
aws_special_chars, sizeof(aws_special_chars)});
};

} // namespace kvikio::detail
Loading
Loading