diff --git a/can_ada-stubs/__init__.pyi b/can_ada-stubs/__init__.pyi
index 389d4fa..242e40d 100644
--- a/can_ada-stubs/__init__.pyi
+++ b/can_ada-stubs/__init__.pyi
@@ -1,4 +1,7 @@
-from typing import Iterator, overload
+from typing import Iterator, overload, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from urllib.parse import ParseResult
 
 __version__: str
 
@@ -68,3 +71,4 @@ def can_parse(input: str, base_input: str | None = ...) -> bool: ...
 def idna_decode(arg0: str) -> str: ...
 def idna_encode(arg0: str) -> bytes: ...
 def parse(arg0: str) -> URL: ...
+def parse_compat(arg0: str) -> ParseResult: ...
diff --git a/src/binding.cpp b/src/binding.cpp
index 832fc0b..d1a4b8a 100644
--- a/src/binding.cpp
+++ b/src/binding.cpp
@@ -4,6 +4,21 @@
 
 namespace py = pybind11;
 
+// Return the urllib.parse.ParseResult class, importing it on first use.
+//
+// The reference is released into a plain py::handle and intentionally never
+// dropped. A function-local `static py::object` would run its destructor
+// during static destruction, i.e. after the interpreter may already have been
+// finalized, and decrementing a Python refcount at that point is unsafe.
+static py::handle get_parse_result_class() {
+    static py::handle cls = [] {
+        py::object parse_result =
+            py::module_::import("urllib.parse").attr("ParseResult");
+        return parse_result.release();
+    }();
+    return cls;
+}
+
 PYBIND11_MODULE(can_ada, m) {
 #ifdef VERSION_INFO
     m.attr("__version__") = Py_STRINGIFY(VERSION_INFO);
@@ -142,4 +157,66 @@ PYBIND11_MODULE(can_ada, m) {
         return url.value();
     });
 
+    // Compatibility shim returning a urllib.parse.ParseResult: the input is
+    // parsed by ada and split into scheme, netloc, path, params, query and
+    // fragment. Raises ValueError when ada cannot parse the input.
+    m.def("parse_compat", [](std::string_view input) {
+        auto result = ada::parse<ada::url_aggregator>(input);
+        if (!result) {
+            throw py::value_error("URL could not be parsed.");
+        }
+
+        const auto& url = result.value();
+
+        // ParseResult.scheme carries no trailing ':', so drop it if present.
+        std::string scheme(url.get_protocol());
+        if (!scheme.empty() && scheme.back() == ':') {
+            scheme.pop_back();
+        }
+
+        // Rebuild "user:password@host:port". The userinfo part is emitted when
+        // either credential is non-empty, so "http://:secret@host/" keeps its
+        // password even though the username is empty.
+        std::string netloc;
+        if (url.has_non_empty_username() || url.has_non_empty_password()) {
+            netloc += url.get_username();
+            if (url.has_password()) {
+                netloc += ':';
+                netloc += url.get_password();
+            }
+            netloc += '@';
+        }
+        netloc += url.get_host();
+        if (url.has_port()) {
+            netloc += ':';
+            netloc += url.get_port();
+        }
+
+        // Not really correct, but this is urllib.parse.urlparse behaviour:
+        // ";params" is split off the last path segment only.
+        std::string path(url.get_pathname());
+        std::string params;
+        const size_t last_slash = path.rfind('/');
+        const size_t segment_start =
+            (last_slash == std::string::npos) ? 0 : last_slash + 1;
+        const size_t semi = path.find(';', segment_start);
+        if (semi != std::string::npos) {
+            params = path.substr(semi + 1);
+            path.erase(semi);
+        }
+
+        // Strip the leading '?' / '#' delimiters when present.
+        std::string query(url.get_search());
+        if (!query.empty() && query.front() == '?') {
+            query.erase(0, 1);
+        }
+
+        std::string fragment(url.get_hash());
+        if (!fragment.empty() && fragment.front() == '#') {
+            fragment.erase(0, 1);
+        }
+
+        return get_parse_result_class()(scheme, netloc, path, params, query, fragment);
+    });
+
 }
\ No newline at end of file
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 448221f..6f382ed 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -39,6 +39,17 @@ def can_ada_parse():
             # not valid WHATWG URLs.
             pass
 
+
+def can_ada_parse_compat():
+    """Run can_ada.parse_compat over every sample line, ignoring ValueError."""
+    for line in data():
+        try:
+            can_ada.parse_compat(line)
+        except ValueError:
+            # There are a small number of URLs in the sample data that are
+            # not valid WHATWG URLs.
+            pass
+
 
 @pytest.mark.slow
 def test_urllib_parse(benchmark):
@@ -53,3 +64,8 @@ def test_ada_python_parse(benchmark):
 @pytest.mark.slow
 def test_can_ada_parse(benchmark):
     benchmark(can_ada_parse)
+
+
+@pytest.mark.slow
+def test_can_ada_parse_compat(benchmark):
+    benchmark(can_ada_parse_compat)