@@ -980,23 +980,11 @@ def __init__(
980
980
# Set up marks. These are used to track the state of data received.
981
981
self .marks : dict [str , int ] = {}
982
982
983
- # TODO: Actually use this rather than the dumb version we currently use
984
- # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
985
- # skip = [len(boundary) for x in range(256)]
986
- # for i in range(len(boundary) - 1):
987
- # skip[ord_char(boundary[i])] = len(boundary) - i - 1
988
- #
989
- # # We use a tuple since it's a constant, and marginally faster.
990
- # self.skip = tuple(skip)
991
-
992
983
# Save our boundary.
993
984
if isinstance (boundary , str ): # pragma: no cover
994
985
boundary = boundary .encode ("latin-1" )
995
986
self .boundary = b"\r \n --" + boundary
996
987
997
- # Get a set of characters that belong to our boundary.
998
- self .boundary_chars = frozenset (self .boundary )
999
-
1000
988
def write (self , data : bytes ) -> int :
1001
989
"""Write some data to the parser, which will perform size verification,
1002
990
and then parse the data into the appropriate location (e.g. header,
@@ -1276,34 +1264,39 @@ def data_callback(name: str, end_i: int, remaining: bool = False) -> None:
1276
1264
# We're processing our part data right now. During this, we
1277
1265
# need to efficiently search for our boundary, since any data
1278
1266
# on any number of lines can be a part of the current data.
1279
- # We use the Boyer-Moore-Horspool algorithm to efficiently
1280
- # search through the remainder of the buffer looking for our
1281
- # boundary.
1282
1267
1283
1268
# Save the current value of our index. We use this in case we
1284
1269
# find part of a boundary, but it doesn't match fully.
1285
1270
prev_index = index
1286
1271
1287
1272
# Set up variables.
1288
1273
boundary_length = len (boundary )
1289
- boundary_end = boundary_length - 1
1290
1274
data_length = length
1291
- boundary_chars = self .boundary_chars
1292
1275
1293
1276
# If our index is 0, we're starting a new part, so start our
1294
1277
# search.
1295
1278
if index == 0 :
1296
- # Search forward until we either hit the end of our buffer,
1297
- # or reach a character that's in our boundary.
1298
- i += boundary_end
1299
- while i < data_length - 1 and data [i ] not in boundary_chars :
1300
- i += boundary_length
1301
-
1302
- # Reset i back the length of our boundary, which is the
1303
- # earliest possible location that could be our match (i.e.
1304
- # if we've just broken out of our loop since we saw the
1305
- # last character in our boundary)
1306
- i -= boundary_end
1279
+ # The most common case is likely to be that the whole
1280
+ # boundary is present in the buffer.
1281
+ # Calling `find` is much faster than iterating here.
1282
+ i0 = data .find (boundary , i , data_length )
1283
+ if i0 >= 0 :
1284
+ # We matched the whole boundary string.
1285
+ index = boundary_length - 1
1286
+ i = i0 + boundary_length - 1
1287
+ else :
1288
+ # No match found for whole string.
1289
+ # There may be a partial boundary at the end of the
1290
+ # data, which the find will not match.
1291
+ # Since the length that needs to be searched is limited to
1292
+ # the boundary length, just perform a naive search.
1293
+ i = max (i , data_length - boundary_length )
1294
+
1295
+ # Search forward until we either hit the end of our buffer,
1296
+ # or reach a potential start of the boundary.
1297
+ while i < data_length - 1 and data [i ] != boundary [0 ]:
1298
+ i += 1
1299
+
1307
1300
c = data [i ]
1308
1301
1309
1302
# Now, we have a couple of cases here. If our index is before
0 commit comments