Skip to content

Jules #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 39 additions & 9 deletions src/functions/extract_domain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ namespace duckdb
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting domain: " + std::string (e.what ());
// Set NULL on error
FlatVector::SetNull (result, i, true);
}
}
}
Expand All @@ -45,18 +46,26 @@ namespace duckdb
Connection con (db);

// Extract the host from the URL
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/|mailto:)?((?:[^\/\?:#@]+@)?([^\/\?:#]+)))");
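// This regex captures the host: an optional http(s)/ftp/rsync scheme is skipped, and the
// capture stops at the first '/', whitespace, '#', '?' or ':' (so path, query, fragment
// and port are excluded).
// NOTE(review): unlike the previous pattern, this one neither accepts a "mailto:" prefix
// nor skips a leading "user@" part, so "user@example.com" is captured whole and
// "mailto:user@example.com" yields "mailto". Confirm that this is intended.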
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\s#?:]+))");
std::smatch host_match;
if (!std::regex_search (input, host_match, host_regex))
std::string host_str; // Use a separate string for the matched host

// Search for the host in the input string.
// regex_search is applied to `input` directly; no intermediate copy is needed.
if (std::regex_search (input, host_match, host_regex) && host_match.size () > 1)
{
return "";
host_str = host_match[1].str ();
}
else
{
return ""; // No host found
}

auto host = host_match[host_match.size () - 1].str ();

// Split the host into parts
std::vector<std::string> parts;
std::istringstream stream (host);
std::istringstream stream (host_str);
std::string part;
while (std::getline (stream, part, '.'))
{
Expand All @@ -65,8 +74,29 @@ namespace duckdb

// Find the longest matching public suffix
std::string public_suffix;
int public_suffix_index = -1;

int public_suffix_index = -1; // Using -1 to indicate no valid public suffix part found yet

// Goal: find the longest entry of the public suffix list that is a suffix of the host.
// For example, for 'a.b.c.co.uk' the candidate suffixes are:
// 1. uk
// 2. co.uk
// 3. c.co.uk
// 4. b.c.co.uk
// 5. a.b.c.co.uk
// If 'co.uk' is a public suffix, it can be matched. If a longer candidate such as
// 'c.co.uk' is also a public suffix (e.g. under a wildcard rule like *.sch.uk), the
// longer candidate is the one that should be chosen.
// The loop is described as stopping at the first candidate found in public_suffix_list;
// that first hit is the longest match only if candidates are tried longest-first.
// NOTE(review): confirm against the loop body that candidates are built from the full
// host down to the last label (a.b.c.co.uk first, uk last), not in the order listed above.
for (size_t j = 0; j < parts.size (); j++)
{
// Build the candidate suffix
Expand Down
3 changes: 2 additions & 1 deletion src/functions/extract_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ namespace duckdb
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting extension: " + std::string (e.what ());
// Set NULL on error
FlatVector::SetNull (result, i, true);
}
};
}
Expand Down
3 changes: 2 additions & 1 deletion src/functions/extract_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ namespace duckdb
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting host: " + std::string (e.what ());
// Set NULL on error
FlatVector::SetNull (result, i, true);
}
}
}
Expand Down
5 changes: 3 additions & 2 deletions src/functions/extract_path.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ namespace duckdb

for (idx_t i = 0; i < args.size (); i++)
{
// Paths are often case-sensitive, so we don't convert to lowercase.
auto input = input_vector.GetValue (i).ToString ();
std::transform (input.begin (), input.end (), input.begin (), ::tolower);

try
{
Expand All @@ -26,7 +26,8 @@ namespace duckdb
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting path: " + std::string (e.what ());
// Set NULL on error
FlatVector::SetNull (result, i, true);
}
};
}
Expand Down
3 changes: 2 additions & 1 deletion src/functions/extract_port.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ namespace duckdb
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting port: " + std::string (e.what ());
// Set NULL on error
FlatVector::SetNull (result, i, true);
}
};
}
Expand Down
13 changes: 9 additions & 4 deletions src/functions/extract_query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ namespace duckdb
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting query string: " + std::string (e.what ());
// Set NULL on error
FlatVector::SetNull (result, i, true);
}
};
}
Expand All @@ -36,9 +37,13 @@ namespace duckdb
{
// Regex to match the query string component of a URL
// Explanation:
// (?:\?|&) - Non-capturing group to match either "?" (start of query) or "&" (query parameter separator)
// ([^#]+) - Capturing group to match the query string (any characters except "#")
std::regex query_regex (R"((?:\?|&)([^#]+))");
// \? - Matches the literal '?' character.
// ([^#]*) - Capturing group:
// [^#] - Matches any character that is NOT a '#'.
// * - Repeats the preceding character class zero or more times (an empty query after '?' still matches).
// This regex captures content after the first '?' up to a '#' or end of string.
// Does not handle query parameters in fragments.
std::regex query_regex (R"(\?([^#]*))");
std::smatch query_match;

// Use regex_search to find the query string in the input
Expand Down
3 changes: 2 additions & 1 deletion src/functions/extract_schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ namespace duckdb
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting schema: " + std::string (e.what ());
// Set NULL on error
FlatVector::SetNull (result, i, true);
}
};
}
Expand Down
30 changes: 23 additions & 7 deletions src/functions/extract_subdomain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ namespace duckdb
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting subdomain: " + std::string (e.what ());
// Set NULL on error
FlatVector::SetNull (result, i, true);
}
}
}
Expand All @@ -43,18 +44,25 @@ namespace duckdb
Connection con (db);

// Extract the host from the URL
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\?:]+))");
// This regex captures the host, excluding protocol, path, query, fragment, and port.
// It explicitly excludes '/', '\s', '#', '?', ':' from the host.
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\s#?:]+))");
std::smatch host_match;
if (!std::regex_search (input, host_match, host_regex))
std::string host_str;

// regex_search is applied to `input` directly; no intermediate copy is needed.
if (std::regex_search (input, host_match, host_regex) && host_match.size () > 1)
{
return "";
host_str = host_match[1].str ();
}
else
{
return ""; // No host found
}

auto host = host_match[1].str ();

// Split the host into parts
std::vector<std::string> parts;
std::istringstream stream (host);
std::istringstream stream (host_str);
std::string part;
while (std::getline (stream, part, '.'))
{
Expand All @@ -65,6 +73,14 @@ namespace duckdb
std::string public_suffix;
int public_suffix_index = -1;

// Goal: find the longest known public suffix of the host.
// For example, for 'a.b.c.co.uk' the candidate suffixes are:
// uk, co.uk, c.co.uk, b.c.co.uk, a.b.c.co.uk
// If 'co.uk' is a public suffix, it can be matched.
// If a longer candidate such as 'c.co.uk' is also a public suffix (e.g. under a wildcard
// rule like *.sch.uk), the longer candidate is the one that should be chosen.
// NOTE(review): taking the first match found yields the longest suffix only if the loop
// tries candidates longest-first; confirm the candidate order against the loop body.
for (size_t j = 0; j < parts.size (); j++)
{
// Build the candidate suffix
Expand Down
30 changes: 23 additions & 7 deletions src/functions/extract_tld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ namespace duckdb
}
catch (const std::exception &e)
{
result_data[i] = "Error extracting tld: " + std::string (e.what ());
// Set NULL on error
FlatVector::SetNull (result, i, true);
}
}
}
Expand All @@ -43,18 +44,25 @@ namespace duckdb
Connection con (db);

// Extract the host from the URL
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\?:]+))");
// This regex captures the host, excluding protocol, path, query, fragment, and port.
// It explicitly excludes '/', '\s', '#', '?', ':' from the host.
std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\s#?:]+))");
std::smatch host_match;
if (!std::regex_search (input, host_match, host_regex))
std::string host_str;

// regex_search is applied to `input` directly; no intermediate copy is needed.
if (std::regex_search (input, host_match, host_regex) && host_match.size () > 1)
{
return "";
host_str = host_match[1].str ();
}
else
{
return ""; // No host found
}

auto host = host_match[1].str ();

// Split the host into parts
std::vector<std::string> parts;
std::istringstream stream (host);
std::istringstream stream (host_str);
std::string part;
while (std::getline (stream, part, '.'))
{
Expand All @@ -64,6 +72,14 @@ namespace duckdb
// Find the longest matching public suffix
std::string public_suffix;

// Goal: find the longest known public suffix of the host.
// For example, for 'a.b.c.co.uk' the candidate suffixes are:
// uk, co.uk, c.co.uk, b.c.co.uk, a.b.c.co.uk
// If 'co.uk' is a public suffix, it can be matched.
// If a longer candidate such as 'c.co.uk' is also a public suffix (e.g. under a wildcard
// rule like *.sch.uk), the longer candidate is the one that should be chosen.
// NOTE(review): taking the first match found yields the longest suffix only if the loop
// tries candidates longest-first; confirm the candidate order against the loop body.
for (size_t j = 0; j < parts.size (); j++)
{
// Build the candidate suffix
Expand Down
Loading
Loading