Skip to content

Commit e0b3152

Browse files
authored
fix: HTTP 308 Permanent Redirect status code handling (#2389)
Change the handling of HTTP status code 308 to behave more like `urllib.request.HTTPRedirectHandler`, most critically, the new 308 handling will create a new `urllib.request.Request` object with the new URL, which will prevent state from being carried over from the original request. One case where this is important is when the domain name changes, for example, when the original URL is `http://www.w3.org/ns/adms.ttl` and the redirect URL is `https://uri.semic.eu/w3c/ns/adms.ttl`. With the previous behaviour, the redirect would contain a `Host` header with the value `www.w3.org` instead of `uri.semic.eu` because the `Host` header is placed in `Request.unredirected_hdrs` and takes precedence over the `Host` header in `Request.headers`. Other changes: - Only handle HTTP status code 308 on Python versions before 3.11 as Python 3.11 will handle 308 by default [[ref](https://docs.python.org/3.11/whatsnew/changelog.html#id128)]. - Move code which uses `http://www.w3.org/ns/adms.ttl` and `http://www.w3.org/ns/adms.rdf` out of `test_guess_format_for_parse` into a separate parameterized test, which instead uses the embedded http server. This allows the test to fully control the `Content-Type` header in the response instead of relying on the value that the server is sending. This is needed because the server is sending `Content-Type: text/plain` for the `adms.ttl` file, which is not a valid RDF format, and the test is expecting `Content-Type: text/turtle`. Fixes: - <#2382>.
1 parent 2c8d1e1 commit e0b3152

File tree

15 files changed

+1121
-52
lines changed

15 files changed

+1121
-52
lines changed

rdflib/_networking.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
from __future__ import annotations
2+
3+
import string
4+
import sys
5+
from typing import Dict
6+
from urllib.error import HTTPError
7+
from urllib.parse import quote as urlquote
8+
from urllib.parse import urljoin, urlsplit
9+
from urllib.request import HTTPRedirectHandler, Request, urlopen
10+
from urllib.response import addinfourl
11+
12+
13+
def _make_redirect_request(request: Request, http_error: HTTPError) -> Request:
14+
"""
15+
Create a new request object for a redirected request.
16+
17+
The logic is based on `urllib.request.HTTPRedirectHandler` from `this commit <https://github.com/python/cpython/blob/b58bc8c2a9a316891a5ea1a0487aebfc86c2793a/Lib/urllib/request.py#L641-L751>_`.
18+
19+
:param request: The original request that resulted in the redirect.
20+
:param http_error: The response to the original request that indicates a
21+
redirect should occur and contains the new location.
22+
:return: A new request object to the location indicated by the response.
23+
:raises HTTPError: the supplied ``http_error`` if the redirect request
24+
cannot be created.
25+
:raises ValueError: If the response code is `None`.
26+
:raises ValueError: If the response does not contain a ``Location`` header
27+
or the ``Location`` header is not a string.
28+
:raises HTTPError: If the scheme of the new location is not ``http``,
29+
``https``, or ``ftp``.
30+
:raises HTTPError: If there are too many redirects or a redirect loop.
31+
"""
32+
new_url = http_error.headers.get("Location")
33+
if new_url is None:
34+
raise http_error
35+
if not isinstance(new_url, str):
36+
raise ValueError(f"Location header {new_url!r} is not a string")
37+
38+
new_url_parts = urlsplit(new_url)
39+
40+
# For security reasons don't allow redirection to anything other than http,
41+
# https or ftp.
42+
if new_url_parts.scheme not in ("http", "https", "ftp", ""):
43+
raise HTTPError(
44+
new_url,
45+
http_error.code,
46+
f"{http_error.reason} - Redirection to url {new_url!r} is not allowed",
47+
http_error.headers,
48+
http_error.fp,
49+
)
50+
51+
# http.client.parse_headers() decodes as ISO-8859-1. Recover the original
52+
# bytes and percent-encode non-ASCII bytes, and any special characters such
53+
# as the space.
54+
new_url = urlquote(new_url, encoding="iso-8859-1", safe=string.punctuation)
55+
new_url = urljoin(request.full_url, new_url)
56+
57+
# XXX Probably want to forget about the state of the current
58+
# request, although that might interact poorly with other
59+
# handlers that also use handler-specific request attributes
60+
content_headers = ("content-length", "content-type")
61+
newheaders = {
62+
k: v for k, v in request.headers.items() if k.lower() not in content_headers
63+
}
64+
new_request = Request(
65+
new_url,
66+
headers=newheaders,
67+
origin_req_host=request.origin_req_host,
68+
unverifiable=True,
69+
)
70+
71+
visited: Dict[str, int]
72+
if hasattr(request, "redirect_dict"):
73+
visited = request.redirect_dict
74+
if (
75+
visited.get(new_url, 0) >= HTTPRedirectHandler.max_repeats
76+
or len(visited) >= HTTPRedirectHandler.max_redirections
77+
):
78+
raise HTTPError(
79+
request.full_url,
80+
http_error.code,
81+
HTTPRedirectHandler.inf_msg + http_error.reason,
82+
http_error.headers,
83+
http_error.fp,
84+
)
85+
else:
86+
visited = {}
87+
setattr(request, "redirect_dict", visited)
88+
89+
setattr(new_request, "redirect_dict", visited)
90+
visited[new_url] = visited.get(new_url, 0) + 1
91+
return new_request
92+
93+
94+
def _urlopen(request: Request) -> addinfourl:
95+
"""
96+
This is a shim for `urlopen` that handles HTTP redirects with status code
97+
308 (Permanent Redirect).
98+
99+
This function should be removed once all supported versions of Python
100+
handles the 308 HTTP status code.
101+
102+
:param request: The request to open.
103+
:return: The response to the request.
104+
"""
105+
try:
106+
return urlopen(request)
107+
except HTTPError as error:
108+
if error.code == 308 and sys.version_info < (3, 11):
109+
# HTTP response code 308 (Permanent Redirect) is not supported by python
110+
# versions older than 3.11. See <https://bugs.python.org/issue40321> and
111+
# <https://github.com/python/cpython/issues/84501> for more details.
112+
# This custom error handling should be removed once all supported
113+
# versions of Python handles 308.
114+
new_request = _make_redirect_request(request, error)
115+
return _urlopen(new_request)
116+
else:
117+
raise

rdflib/parser.py

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@
2727
Tuple,
2828
Union,
2929
)
30-
from urllib.error import HTTPError
3130
from urllib.parse import urljoin
32-
from urllib.request import Request, url2pathname, urlopen
31+
from urllib.request import Request, url2pathname
3332
from xml.sax import xmlreader
3433

3534
import rdflib.util
3635
from rdflib import __version__
36+
from rdflib._networking import _urlopen
3737
from rdflib.namespace import Namespace
3838
from rdflib.term import URIRef
3939

@@ -267,21 +267,6 @@ def __init__(self, system_id: Optional[str] = None, format: Optional[str] = None
267267

268268
req = Request(system_id, None, myheaders) # type: ignore[arg-type]
269269

270-
def _urlopen(req: Request) -> Any:
271-
try:
272-
return urlopen(req)
273-
except HTTPError as ex:
274-
# 308 (Permanent Redirect) is not supported by current python version(s)
275-
# See https://bugs.python.org/issue40321
276-
# This custom error handling should be removed once all
277-
# supported versions of python support 308.
278-
if ex.code == 308:
279-
# type error: Incompatible types in assignment (expression has type "Optional[Any]", variable has type "str")
280-
req.full_url = ex.headers.get("Location") # type: ignore[assignment]
281-
return _urlopen(req)
282-
else:
283-
raise
284-
285270
response: addinfourl = _urlopen(req)
286271
self.url = response.geturl() # in case redirections took place
287272
self.links = self.get_links(response)

test/conftest.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,22 +44,44 @@ def rdfs_graph() -> Graph:
4444
return Graph().parse(TEST_DATA_DIR / "defined_namespaces/rdfs.ttl", format="turtle")
4545

4646

47+
_ServedBaseHTTPServerMocks = Tuple[ServedBaseHTTPServerMock, ServedBaseHTTPServerMock]
48+
49+
4750
@pytest.fixture(scope="session")
48-
def _session_function_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
51+
def _session_function_httpmocks() -> Generator[_ServedBaseHTTPServerMocks, None, None]:
4952
"""
5053
This fixture is session scoped, but it is reset for each function in
5154
:func:`function_httpmock`. This should not be used directly.
5255
"""
53-
with ServedBaseHTTPServerMock() as httpmock:
54-
yield httpmock
56+
with ServedBaseHTTPServerMock() as httpmock_a, ServedBaseHTTPServerMock() as httpmock_b:
57+
yield httpmock_a, httpmock_b
5558

5659

5760
@pytest.fixture(scope="function")
5861
def function_httpmock(
59-
_session_function_httpmock: ServedBaseHTTPServerMock,
62+
_session_function_httpmocks: _ServedBaseHTTPServerMocks,
6063
) -> Generator[ServedBaseHTTPServerMock, None, None]:
61-
_session_function_httpmock.reset()
62-
yield _session_function_httpmock
64+
"""
65+
HTTP server mock that is reset for each test function.
66+
"""
67+
(mock, _) = _session_function_httpmocks
68+
mock.reset()
69+
yield mock
70+
71+
72+
@pytest.fixture(scope="function")
73+
def function_httpmocks(
74+
_session_function_httpmocks: _ServedBaseHTTPServerMocks,
75+
) -> Generator[Tuple[ServedBaseHTTPServerMock, ServedBaseHTTPServerMock], None, None]:
76+
"""
77+
Alternative HTTP server mock that is reset for each test function.
78+
79+
This exists in case a tests needs to work with two different HTTP servers.
80+
"""
81+
(mock_a, mock_b) = _session_function_httpmocks
82+
mock_a.reset()
83+
mock_b.reset()
84+
yield mock_a, mock_b
6385

6486

6587
@pytest.fixture(scope="session", autouse=True)

test/data.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from pathlib import Path
22

33
from rdflib import URIRef
4+
from rdflib.graph import Graph
45

56
TEST_DIR = Path(__file__).parent
67
TEST_DATA_DIR = TEST_DIR / "data"
@@ -19,3 +20,20 @@
1920
context0 = URIRef("urn:example:context-0")
2021
context1 = URIRef("urn:example:context-1")
2122
context2 = URIRef("urn:example:context-2")
23+
24+
25+
simple_triple_graph = Graph().add(
26+
(
27+
URIRef("http://example.org/subject"),
28+
URIRef("http://example.org/predicate"),
29+
URIRef("http://example.org/object"),
30+
)
31+
)
32+
"""
33+
A simple graph with a single triple. This is equivalent to the following RDF files:
34+
35+
* ``test/data/variants/simple_triple.nq``
36+
* ``test/data/variants/simple_triple.nt``
37+
* ``test/data/variants/simple_triple.ttl``
38+
* ``test/data/variants/simple_triple.xml``
39+
"""

0 commit comments

Comments
 (0)