Skip to content

Added ability to use LF, not only CRLF, as the delimiter for response headers and body #115

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion h11/_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
# Remember that this has to run in O(n) time -- so e.g. the bytearray cast is
# critical.
# Whitespace that continues an obsolete folded header line (obs-fold);
# consumed by _obsolete_line_fold below.
obs_fold_re = re.compile(br"[ \t]+")
# Strict CRLF-only line terminator — used for chunk headers, which (unlike
# response headers/body below) are not given lenient bare-LF treatment.
# NOTE(review): re.MULTILINE has no effect here — the pattern has no anchors.
strict_line_delimiter_regex = re.compile(b"\r\n", re.MULTILINE)


def _obsolete_line_fold(lines):
Expand Down Expand Up @@ -153,7 +154,7 @@ def __call__(self, buf):
assert self._bytes_to_discard == 0
if self._bytes_in_chunk == 0:
# We need to refill our chunk count
chunk_header = buf.maybe_extract_until_next(b"\r\n")
chunk_header = buf.maybe_extract_until_next(strict_line_delimiter_regex, 2)
if chunk_header is None:
return None
matches = validate(
Expand Down
54 changes: 38 additions & 16 deletions h11/_receivebuffer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import re
import sys

__all__ = ["ReceiveBuffer"]


# Operations we want to support:
# - find next \r\n or \r\n\r\n, or wait until there is one
# - find next \r\n or \r\n\r\n (\n or \n\n are also acceptable),
# or wait until there is one
# - read at-most-N bytes
# Goals:
# - on average, do this fast
Expand Down Expand Up @@ -38,13 +40,22 @@
# slightly clever thing where we delay calling compress() until we've
# processed a whole event, which could in theory be slightly more efficient
# than the internal bytearray support.)

# The blank line that terminates a header block: matches b"\n\n" or
# b"\n\r\n", i.e. accepts bare-LF line endings as well as CRLF.
# NOTE(review): re.MULTILINE has no effect here — the pattern has no anchors.
blank_line_delimiter_regex = re.compile(b"\n\r?\n", re.MULTILINE)


def rstrip_line(line):
    """Return *line* with every trailing b"\\r" byte removed."""
    while line.endswith(b"\r"):
        line = line[:-1]
    return line


class ReceiveBuffer(object):
def __init__(self):
self._data = bytearray()
# These are both absolute offsets into self._data:
self._start = 0
self._looked_at = 0
self._looked_for = b""

self._looked_for_regex = blank_line_delimiter_regex

def __bool__(self):
return bool(len(self))
Expand Down Expand Up @@ -79,21 +90,29 @@ def maybe_extract_at_most(self, count):
self._start += len(out)
return out

def maybe_extract_until_next(self, needle):
def maybe_extract_until_next(self, needle_regex, max_needle_length):
Copy link
Contributor

@tomchristie tomchristie Nov 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this actually be min_needle_length, or have I got that back-to-front?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ie. always resume searching from the first possible point where there could still be a match.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yeah, I see — it's max_needle_length, because that pushes the restart point back as far as possible.

# Returns extracted bytes on success (advancing offset), or None on
# failure
if self._looked_for == needle:
search_start = max(self._start, self._looked_at - len(needle) + 1)
if self._looked_for_regex == needle_regex:
looked_at = max(self._start, self._looked_at - max_needle_length)
else:
search_start = self._start
offset = self._data.find(needle, search_start)
if offset == -1:
looked_at = self._start
self._looked_for_regex = needle_regex

delimiter_match = next(
self._looked_for_regex.finditer(self._data, looked_at), None
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you help explain why this is next(finditer(...)) rather than search(...) here?
What am I missing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, you are absolutely right — I missed that Pattern.search exists and used Pattern.finditer instead. I'm going to fix it to use search, as it's clearer.


if delimiter_match is None:
self._looked_at = len(self._data)
self._looked_for = needle
return None
new_start = offset + len(needle)
out = self._data[self._start : new_start]
self._start = new_start

_, end = delimiter_match.span(0)

out = self._data[self._start : end]

self._start = end

return out

# HTTP/1.1 has a number of constructs where you keep reading lines until
Expand All @@ -102,11 +121,14 @@ def maybe_extract_lines(self):
if self._data[self._start : self._start + 2] == b"\r\n":
self._start += 2
return []
elif self._data[self._start : self._start + 1] == b"\n":
self._start += 1
return []
else:
data = self.maybe_extract_until_next(b"\r\n\r\n")
data = self.maybe_extract_until_next(blank_line_delimiter_regex, 3)
if data is None:
return None
lines = data.split(b"\r\n")
assert lines[-2] == lines[-1] == b""
del lines[-2:]

lines = list(map(rstrip_line, data.rstrip(b"\r\n").split(b"\n")))

return lines
65 changes: 60 additions & 5 deletions h11/tests/test_receivebuffer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import re

import pytest

from .._receivebuffer import ReceiveBuffer


Expand Down Expand Up @@ -35,23 +39,23 @@ def test_receivebuffer():

b += b"12345a6789aa"

assert b.maybe_extract_until_next(b"a") == b"12345a"
assert b.maybe_extract_until_next(re.compile(b"a"), 1) == b"12345a"
assert bytes(b) == b"6789aa"

assert b.maybe_extract_until_next(b"aaa") is None
assert b.maybe_extract_until_next(re.compile(b"aaa"), 3) is None
assert bytes(b) == b"6789aa"

b += b"a12"
assert b.maybe_extract_until_next(b"aaa") == b"6789aaa"
assert b.maybe_extract_until_next(re.compile(b"aaa"), 3) == b"6789aaa"
assert bytes(b) == b"12"

# check repeated searches for the same needle, triggering the
# pickup-where-we-left-off logic
b += b"345"
assert b.maybe_extract_until_next(b"aaa") is None
assert b.maybe_extract_until_next(re.compile(b"aaa"), 3) is None

b += b"6789aaa123"
assert b.maybe_extract_until_next(b"aaa") == b"123456789aaa"
assert b.maybe_extract_until_next(re.compile(b"aaa"), 3) == b"123456789aaa"
assert bytes(b) == b"123"

################################################################
Expand All @@ -76,3 +80,54 @@ def test_receivebuffer():
b += b"\r\ntrailing"
assert b.maybe_extract_lines() == []
assert bytes(b) == b"trailing"


@pytest.mark.parametrize(
    "data",
    [
        pytest.param(
            (
                b"HTTP/1.1 200 OK\r\n",
                b"Content-type: text/plain\r\n",
                b"Connection: close\r\n",
                b"\r\n",
                b"Some body",
            ),
            id="with_crlf_delimiter",
        ),
        pytest.param(
            (
                b"HTTP/1.1 200 OK\n",
                b"Content-type: text/plain\n",
                b"Connection: close\n",
                b"\n",
                b"Some body",
            ),
            id="with_lf_only_delimiter",
        ),
        pytest.param(
            (
                b"HTTP/1.1 200 OK\n",
                b"Content-type: text/plain\r\n",
                b"Connection: close\n",
                b"\n",
                b"Some body",
            ),
            id="with_mixed_crlf_and_lf",
        ),
    ],
)
def test_receivebuffer_for_invalid_delimiter(data):
    # Feed the response in one chunk at a time, then extract the header
    # lines in a single call; line terminators must be stripped regardless
    # of whether CRLF or bare LF was used.
    buf = ReceiveBuffer()
    for chunk in data:
        buf += chunk

    extracted = buf.maybe_extract_lines()

    expected = [
        b"HTTP/1.1 200 OK",
        b"Content-type: text/plain",
        b"Connection: close",
    ]
    assert extracted == expected
    # The body is left untouched in the buffer.
    assert bytes(buf) == b"Some body"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also add a few similar tests to test_readers_unusual in test_io.py? These tests are good for the basic ReceiveBuffer functionality, but the test harness there sets up a more "end-to-end" setup that runs our full http parsing pipeline, so it would give us more confidence that everything is wired up correctly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added similar tests to test_readers_unusual