@@ -979,23 +979,11 @@ def __init__(
979
979
# Setup marks. These are used to track the state of data received.
980
980
self .marks : dict [str , int ] = {}
981
981
982
- # TODO: Actually use this rather than the dumb version we currently use
983
- # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
984
- # skip = [len(boundary) for x in range(256)]
985
- # for i in range(len(boundary) - 1):
986
- # skip[ord_char(boundary[i])] = len(boundary) - i - 1
987
- #
988
- # # We use a tuple since it's a constant, and marginally faster.
989
- # self.skip = tuple(skip)
990
-
991
982
# Save our boundary.
992
983
if isinstance (boundary , str ): # pragma: no cover
993
984
boundary = boundary .encode ("latin-1" )
994
985
self .boundary = b"\r \n --" + boundary
995
986
996
- # Get a set of characters that belong to our boundary.
997
- self .boundary_chars = frozenset (self .boundary )
998
-
999
987
def write (self , data : bytes ) -> int :
1000
988
"""Write some data to the parser, which will perform size verification,
1001
989
and then parse the data into the appropriate location (e.g. header,
@@ -1283,34 +1271,39 @@ def data_callback(name: str, end_i: int, remaining: bool = False) -> None:
1283
1271
# We're processing our part data right now. During this, we
1284
1272
# need to efficiently search for our boundary, since any data
1285
1273
# on any number of lines can be a part of the current data.
1286
- # We use the Boyer-Moore-Horspool algorithm to efficiently
1287
- # search through the remainder of the buffer looking for our
1288
- # boundary.
1289
1274
1290
1275
# Save the current value of our index. We use this in case we
1291
1276
# find part of a boundary, but it doesn't match fully.
1292
1277
prev_index = index
1293
1278
1294
1279
# Set up variables.
1295
1280
boundary_length = len (boundary )
1296
- boundary_end = boundary_length - 1
1297
1281
data_length = length
1298
- boundary_chars = self .boundary_chars
1299
1282
1300
1283
# If our index is 0, we're starting a new part, so start our
1301
1284
# search.
1302
1285
if index == 0 :
1303
- # Search forward until we either hit the end of our buffer,
1304
- # or reach a character that's in our boundary.
1305
- i += boundary_end
1306
- while i < data_length - 1 and data [i ] not in boundary_chars :
1307
- i += boundary_length
1308
-
1309
- # Reset i back the length of our boundary, which is the
1310
- # earliest possible location that could be our match (i.e.
1311
- # if we've just broken out of our loop since we saw the
1312
- # last character in our boundary)
1313
- i -= boundary_end
1286
+ # The most common case is likely to be that the whole
1287
+ # boundary is present in the buffer.
1288
+ # Calling `find` is much faster than iterating here.
1289
+ i0 = data .find (boundary , i , data_length )
1290
+ if i0 >= 0 :
1291
+ # We matched the whole boundary string.
1292
+ index = boundary_length - 1
1293
+ i = i0 + boundary_length - 1
1294
+ else :
1295
+ # No match found for whole string.
1296
+ # There may be a partial boundary at the end of the
1297
+ # data, which the find will not match.
1298
+ # Since the length that needs to be searched is limited to
1299
+ # the boundary length, just perform a naive search.
1300
+ i = max (i , data_length - boundary_length )
1301
+
1302
+ # Search forward until we either hit the end of our buffer,
1303
+ # or reach a potential start of the boundary.
1304
+ while i < data_length - 1 and data [i ] != boundary [0 ]:
1305
+ i += 1
1306
+
1314
1307
c = data [i ]
1315
1308
1316
1309
# Now, we have a couple of cases here. If our index is before
0 commit comments