Skip to content

Commit f196d40

Browse files
committed
Improve performance by using built-in bytes.find.
The Boyer-Moore-Horspool algorithm was removed and replaced with Python's built-in `find` method. This appears to be faster, sometimes by an order of magnitude.
1 parent 473f23c commit f196d40

File tree

1 file changed

+21
-28
lines changed

1 file changed

+21
-28
lines changed

multipart/multipart.py

Lines changed: 21 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -980,23 +980,11 @@ def __init__(
980980
# Setup marks. These are used to track the state of data received.
981981
self.marks: dict[str, int] = {}
982982

983-
# TODO: Actually use this rather than the dumb version we currently use
984-
# # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
985-
# skip = [len(boundary) for x in range(256)]
986-
# for i in range(len(boundary) - 1):
987-
# skip[ord_char(boundary[i])] = len(boundary) - i - 1
988-
#
989-
# # We use a tuple since it's a constant, and marginally faster.
990-
# self.skip = tuple(skip)
991-
992983
# Save our boundary.
993984
if isinstance(boundary, str): # pragma: no cover
994985
boundary = boundary.encode("latin-1")
995986
self.boundary = b"\r\n--" + boundary
996987

997-
# Get a set of characters that belong to our boundary.
998-
self.boundary_chars = frozenset(self.boundary)
999-
1000988
def write(self, data: bytes) -> int:
1001989
"""Write some data to the parser, which will perform size verification,
1002990
and then parse the data into the appropriate location (e.g. header,
@@ -1276,34 +1264,39 @@ def data_callback(name: str, end_i: int, remaining: bool = False) -> None:
12761264
# We're processing our part data right now. During this, we
12771265
# need to efficiently search for our boundary, since any data
12781266
# on any number of lines can be a part of the current data.
1279-
# We use the Boyer-Moore-Horspool algorithm to efficiently
1280-
# search through the remainder of the buffer looking for our
1281-
# boundary.
12821267

12831268
# Save the current value of our index. We use this in case we
12841269
# find part of a boundary, but it doesn't match fully.
12851270
prev_index = index
12861271

12871272
# Set up variables.
12881273
boundary_length = len(boundary)
1289-
boundary_end = boundary_length - 1
12901274
data_length = length
1291-
boundary_chars = self.boundary_chars
12921275

12931276
# If our index is 0, we're starting a new part, so start our
12941277
# search.
12951278
if index == 0:
1296-
# Search forward until we either hit the end of our buffer,
1297-
# or reach a character that's in our boundary.
1298-
i += boundary_end
1299-
while i < data_length - 1 and data[i] not in boundary_chars:
1300-
i += boundary_length
1301-
1302-
# Reset i back the length of our boundary, which is the
1303-
# earliest possible location that could be our match (i.e.
1304-
# if we've just broken out of our loop since we saw the
1305-
# last character in our boundary)
1306-
i -= boundary_end
1279+
# The most common case is likely to be that the whole
1280+
# boundary is present in the buffer.
1281+
# Calling `find` is much faster than iterating here.
1282+
i0 = data.find(boundary, i, data_length)
1283+
if i0 >= 0:
1284+
# We matched the whole boundary string.
1285+
index = boundary_length - 1
1286+
i = i0 + boundary_length - 1
1287+
else:
1288+
# No match found for whole string.
1289+
# There may be a partial boundary at the end of the
1290+
# data, which the find will not match.
1291+
# Since the length that needs to be searched is limited to
1292+
# the boundary length, just perform a naive search.
1293+
i = max(i, data_length - boundary_length)
1294+
1295+
# Search forward until we either hit the end of our buffer,
1296+
# or reach a potential start of the boundary.
1297+
while i < data_length - 1 and data[i] != boundary[0]:
1298+
i += 1
1299+
13071300
c = data[i]
13081301

13091302
# Now, we have a couple of cases here. If our index is before

0 commit comments

Comments
 (0)