Skip to content

Commit 0d19c08

Browse files
committed
Improve performance by using built-in bytes.find.
The Boyer-Moore-Horspool algorithm was removed and replaced with Python's built-in `find` method. This appears to be faster, sometimes by an order of magnitude.
1 parent ff06ea5 commit 0d19c08

File tree

1 file changed

+21
-28
lines changed

1 file changed

+21
-28
lines changed

multipart/multipart.py

Lines changed: 21 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -979,23 +979,11 @@ def __init__(
979979
# Setup marks. These are used to track the state of data received.
980980
self.marks: dict[str, int] = {}
981981

982-
# TODO: Actually use this rather than the dumb version we currently use
983-
# # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
984-
# skip = [len(boundary) for x in range(256)]
985-
# for i in range(len(boundary) - 1):
986-
# skip[ord_char(boundary[i])] = len(boundary) - i - 1
987-
#
988-
# # We use a tuple since it's a constant, and marginally faster.
989-
# self.skip = tuple(skip)
990-
991982
# Save our boundary.
992983
if isinstance(boundary, str): # pragma: no cover
993984
boundary = boundary.encode("latin-1")
994985
self.boundary = b"\r\n--" + boundary
995986

996-
# Get a set of characters that belong to our boundary.
997-
self.boundary_chars = frozenset(self.boundary)
998-
999987
def write(self, data: bytes) -> int:
1000988
"""Write some data to the parser, which will perform size verification,
1001989
and then parse the data into the appropriate location (e.g. header,
@@ -1283,34 +1271,39 @@ def data_callback(name: str, end_i: int, remaining: bool = False) -> None:
12831271
# We're processing our part data right now. During this, we
12841272
# need to efficiently search for our boundary, since any data
12851273
# on any number of lines can be a part of the current data.
1286-
# We use the Boyer-Moore-Horspool algorithm to efficiently
1287-
# search through the remainder of the buffer looking for our
1288-
# boundary.
12891274

12901275
# Save the current value of our index. We use this in case we
12911276
# find part of a boundary, but it doesn't match fully.
12921277
prev_index = index
12931278

12941279
# Set up variables.
12951280
boundary_length = len(boundary)
1296-
boundary_end = boundary_length - 1
12971281
data_length = length
1298-
boundary_chars = self.boundary_chars
12991282

13001283
# If our index is 0, we're starting a new part, so start our
13011284
# search.
13021285
if index == 0:
1303-
# Search forward until we either hit the end of our buffer,
1304-
# or reach a character that's in our boundary.
1305-
i += boundary_end
1306-
while i < data_length - 1 and data[i] not in boundary_chars:
1307-
i += boundary_length
1308-
1309-
# Reset i back the length of our boundary, which is the
1310-
# earliest possible location that could be our match (i.e.
1311-
# if we've just broken out of our loop since we saw the
1312-
# last character in our boundary)
1313-
i -= boundary_end
1286+
# The most common case is likely to be that the whole
1287+
# boundary is present in the buffer.
1288+
# Calling `find` is much faster than iterating here.
1289+
i0 = data.find(boundary, i, data_length)
1290+
if i0 >= 0:
1291+
# We matched the whole boundary string.
1292+
index = boundary_length - 1
1293+
i = i0 + boundary_length - 1
1294+
else:
1295+
# No match found for whole string.
1296+
# There may be a partial boundary at the end of the
1297+
# data, which `find` will not match.
1298+
# Since the length that needs to be searched is limited to
1299+
# the boundary length, just perform a naive search.
1300+
i = max(i, data_length - boundary_length)
1301+
1302+
# Search forward until we either hit the end of our buffer,
1303+
# or reach a potential start of the boundary.
1304+
while i < data_length - 1 and data[i] != boundary[0]:
1305+
i += 1
1306+
13141307
c = data[i]
13151308

13161309
# Now, we have a couple of cases here. If our index is before

0 commit comments

Comments
 (0)