@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url"]
+__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url", "LazyHTTPFile"]
 
 import io
 import logging
@@ -22,6 +22,7 @@
 from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution
 from pip._internal.network.session import PipSession as Session
 from pip._internal.network.utils import HEADERS
+from pip._internal.utils.logging import indent_log
 
 logger = logging.getLogger(__name__)
 
@@ -40,6 +41,11 @@ def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistribution:
     """
     try:
         with LazyHTTPFile(url, session) as lazy_file:
+            with indent_log():
+                logger.debug("begin prefetching for %s", name)
+                lazy_file.prefetch_contiguous_dist_info(name)
+                logger.debug("done prefetching for %s", name)
+
             # For read-only ZIP files, ZipFile only needs methods read,
             # seek, seekable and tell, not the whole IO protocol.
             wheel = MemoryWheel(lazy_file.name, lazy_file)
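
For context, a minimal sketch of how this entry point can be exercised; the URL below is a placeholder, and the session setup is simplified compared to pip's internal wiring:

```python
from pip._internal.network.lazy_wheel import dist_from_wheel_url
from pip._internal.network.session import PipSession

session = PipSession()
# Placeholder URL; any server supporting HTTP range requests works.
url = "https://files.pythonhosted.org/packages/.../example-1.0-py3-none-any.whl"
# Fetches only the ZIP central directory and the .dist-info/ entries via
# range requests, rather than downloading the whole wheel.
dist = dist_from_wheel_url("example", url, session)
print(dist.metadata["Name"])
```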
@@ -145,6 +151,11 @@ def __next__(self) -> bytes:
         raise NotImplementedError
 
 
+# The central directory for tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is
+# 944931 bytes, for a 459424488 byte file (about 486x as large).
+_DEFAULT_INITIAL_FETCH = 1_000_000
+
+
 class LazyHTTPFile(ReadOnlyIOWrapper):
     """File-like object mapped to a ZIP file over HTTP.
 
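
For scale: 944931 × 486 ≈ 459424488, so even that outlier central directory is just under 1 MB. A flat `_DEFAULT_INITIAL_FETCH` of 1_000_000 bytes therefore covers it in a single trailing range request, replacing the per-file `self._length // 400` heuristic removed from `__init__` below.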
@@ -159,7 +170,10 @@ class LazyHTTPFile(ReadOnlyIOWrapper):
     _domains_without_negative_range: ClassVar[set[str]] = set()
 
     def __init__(
-        self, url: str, session: Session, initial_chunk_size: int = CONTENT_CHUNK_SIZE
+        self,
+        url: str,
+        session: Session,
+        initial_chunk_size: int = _DEFAULT_INITIAL_FETCH,
     ) -> None:
         # Add delete=False and print the file's `.name` to debug invalid virtual zips.
         super().__init__(cast(BinaryIO, NamedTemporaryFile()))
@@ -172,21 +186,20 @@ def __init__(
 
         self._length, initial_chunk = self._extract_content_length(initial_chunk_size)
         self.truncate(self._length)
-        # The central directory for
-        # tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is 944931 bytes, for
-        # a 459424488 byte file (about 486x as large).
-        self._minimum_fetch_granularity = max(initial_chunk_size, self._length // 400)
         if initial_chunk is None:
             # If we could not download any file contents yet (e.g. if negative byte
             # ranges were not supported), then download all of this at once, hopefully
             # pulling in the entire central directory.
-            initial_start = max(0, self._length - self._minimum_fetch_granularity)
+            initial_start = max(0, self._length - initial_chunk_size)
             self._download(initial_start, self._length)
         else:
-            self.seek(-len(initial_chunk), io.SEEK_END)
-            self._file.write(initial_chunk)
-            self._left.append(self._length - len(initial_chunk))
-            self._right.append(self._length - 1)
+            # If we could download file contents, then write them to the end of the
+            # file and set up our bisect boundaries by hand.
+            with self._stay():
+                self.seek(-len(initial_chunk), io.SEEK_END)
+                self._file.write(initial_chunk)
+                self._left.append(self._length - len(initial_chunk))
+                self._right.append(self._length - 1)
 
     def read(self, size: int = -1) -> bytes:
         """Read up to size bytes from the object and return them.
@@ -195,17 +208,17 @@ def read(self, size: int = -1) -> bytes:
         all bytes until EOF are returned. Fewer than
         size bytes may be returned if EOF is reached.
         """
-        # BUG does not download correctly if size is unspecified
         cur = self.tell()
+        logger.debug("read size %d at %d", size, cur)
         if size < 0:
             assert cur <= self._length
             download_size = self._length - cur
         elif size == 0:
-            return b''
+            return b""
         else:
-            download_size = max(size, self._minimum_fetch_granularity)
+            download_size = size
         stop = min(cur + download_size, self._length)
-        self._download(cur, stop - 1)
+        self._download(cur, stop)
         return self._file.read(size)
 
     def __enter__(self) -> LazyHTTPFile:
@@ -221,18 +234,20 @@ def _content_length_from_head(self) -> int:
         head = self._session.head(self._url, headers=HEADERS)
         head.raise_for_status()
         assert head.status_code == codes.ok
-        return int(head.headers["content-length"])
+        return int(head.headers["Content-Length"])
 
     @staticmethod
     def _parse_full_length_from_content_range(arg: str) -> Optional[int]:
-        if m := re.match(r"bytes [^/]+/([0-9]+)", arg):
+        m = re.match(r"bytes [^/]+/([0-9]+)", arg)
+        if m is not None:
             return int(m.group(1))
         return None
 
     def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]:
         headers = HEADERS.copy()
         # Perform a negative range index, which is not supported by some servers.
         headers["Range"] = f"bytes=-{initial_chunk_size}"
+        logger.debug("initial bytes request: %s", headers["Range"])
         # TODO: Get range requests to be correctly cached
         headers["Cache-Control"] = "no-cache"
         # TODO: If-Match (etag) to detect file changed during fetch would be a
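
As background on the negative range trick: RFC 9110 allows a suffix byte range that asks for the last N bytes of a resource. A standalone sketch with the `requests` library (the URL is a placeholder; some servers reject this form, which is exactly what `_domains_without_negative_range` tracks):

```python
import requests

resp = requests.get(
    "https://example.com/pkg/example-1.0-py3-none-any.whl",  # placeholder
    headers={"Range": "bytes=-1000000"},  # ask for the last 1,000,000 bytes
)
if resp.status_code == 206:  # codes.partial_content
    # Format: "bytes <start>-<end>/<total>"; the total after "/" is the full
    # file length that _parse_full_length_from_content_range() extracts.
    print(resp.headers["Content-Range"])
```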
@@ -242,7 +257,7 @@ def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]:
         tail = self._session.get(self._url, headers=headers)
         tail.raise_for_status()
 
-        response_length = int(tail.headers["content-length"])
+        response_length = int(tail.headers["Content-Length"])
         assert response_length == len(tail.content)
 
         code = tail.status_code
@@ -255,12 +270,15 @@ def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]:
         elif code != codes.partial_content:
             raise HTTPRangeRequestUnsupported("did not receive partial content or ok")
 
-        range_arg = tail.headers["content-range"]
-        if file_length := self._parse_full_length_from_content_range(range_arg):
+        range_arg = tail.headers["Content-Range"]
+        file_length = self._parse_full_length_from_content_range(range_arg)
+        if file_length is not None:
             return (file_length, tail.content)
         raise HTTPRangeRequestUnsupported(f"could not parse content-range: {range_arg}")
 
-    def _extract_content_length(self, initial_chunk_size: int) -> tuple[int, Optional[bytes]]:
+    def _extract_content_length(
+        self, initial_chunk_size: int
+    ) -> tuple[int, Optional[bytes]]:
         domain = urlparse(self._url).netloc
         if domain in self._domains_without_negative_range:
             return (self._content_length_from_head(), None)
@@ -287,8 +305,9 @@ def _extract_content_length(self, initial_chunk_size: int) -> tuple[int, Optional[bytes]]:
         if code == codes.requested_range_not_satisfiable:
             # In this case, we don't have any file content yet, but we do know the
             # size the file will be, so we can return that and exit here.
-            range_arg = resp.headers["content-range"]
-            if length := self._parse_full_length_from_content_range(range_arg):
+            range_arg = resp.headers["Content-Range"]
+            length = self._parse_full_length_from_content_range(range_arg)
+            if length is not None:
                 return (length, None)
             raise HTTPRangeRequestUnsupported(
                 f"could not parse content-range: {range_arg}"
@@ -330,7 +349,7 @@ def _stream_response(self, start: int, end: int) -> Response:
         # https://www.rfc-editor.org/rfc/rfc9110#field.content-range
         headers = HEADERS.copy()
         headers["Range"] = f"bytes={start}-{end}"
-        logger.debug("%s", headers["Range"])
+        logger.debug("streamed bytes request: %s", headers["Range"])
         # TODO: Get range requests to be correctly cached
         headers["Cache-Control"] = "no-cache"
         # TODO: If-Match (etag) to detect file changed during fetch would be a
@@ -364,6 +383,8 @@ def _merge(
 
     def _download(self, start: int, end: int) -> None:
         """Download bytes from start to end inclusively."""
+        # Reducing by 1 to get an inclusive end range.
+        end -= 1
         with self._stay():
             left = bisect_left(self._right, start)
             right = bisect_right(self._left, end)
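
For context on the two bisect calls: `_left`/`_right` hold the starts and inclusive ends of already-downloaded runs in sorted order, and the bisects bracket every run overlapping the requested range so `_merge` can fetch only the gaps. A simplified standalone sketch (not pip's exact `_merge` logic):

```python
from bisect import bisect_left, bisect_right

# Starts and inclusive ends of runs already downloaded, kept sorted.
lefts = [10, 50]
rights = [19, 59]

start, end = 15, 54  # inclusive byte range we now need
i = bisect_left(rights, start)  # first run whose end >= start
j = bisect_right(lefts, end)    # one past the last run whose start <= end
print(list(zip(lefts[i:j], rights[i:j])))  # overlapping runs: [(10, 19), (50, 59)]
```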
@@ -372,3 +393,35 @@ def _download(self, start: int, end: int) -> None:
                 self.seek(start)
                 for chunk in response.iter_content(CONTENT_CHUNK_SIZE):
                     self._file.write(chunk)
+
+    def prefetch_contiguous_dist_info(self, name: str) -> None:
+        """
+        Read contents of entire dist-info section of wheel.
+
+        pip will read every entry in this directory when generating a dist from a wheel,
+        so prepopulating the file contents avoids waiting for multiple range requests.
+        """
+        dist_info_prefix = re.compile(r"^[^/]*\.dist-info/")
+        start: Optional[int] = None
+        end: Optional[int] = None
+
+        zf = ZipFile(self)
+
+        for info in zf.infolist():
+            if start is None:
+                if dist_info_prefix.search(info.filename):
+                    start = info.header_offset
+                    continue
+            else:
+                if not dist_info_prefix.search(info.filename):
+                    end = info.header_offset
+                    break
+        if start is None:
+            raise UnsupportedWheel(
+                f"no {dist_info_prefix} directory found for {name} in {self.name}"
+            )
+        # If the last entries of the zip are the .dist-info/ dir (as usual), then give
+        # us everything until the start of the central directory.
+        if end is None:
+            end = zf.start_dir
+        self._download(start, end)
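
A standalone look at the `zipfile` attributes this method relies on: `header_offset` is each member's local file header position within the archive, and `start_dir` is where the central directory begins, so `[start, end)` spans the contiguous `.dist-info/` member data (the wheel path below is a placeholder):

```python
from zipfile import ZipFile

with ZipFile("example-1.0-py3-none-any.whl") as zf:  # placeholder path
    for info in zf.infolist():
        # Byte offset of this member's local header within the archive.
        print(info.filename, info.header_offset)
    # Offset of the central directory; all member data precedes it, so
    # [start, zf.start_dir) covers the trailing .dist-info/ entries.
    print("central directory at", zf.start_dir)
```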