@@ -39,12 +39,10 @@ def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistributi
39
39
is raised.
40
40
"""
41
41
try :
42
- with LazyZipOverHTTP (url , session ) as zf :
43
- zf .prefetch_dist_info ()
44
-
42
+ with LazyHTTPFile (url , session ) as lazy_file :
45
43
# For read-only ZIP files, ZipFile only needs methods read,
46
44
# seek, seekable and tell, not the whole IO protocol.
47
- wheel = MemoryWheel (zf .name , zf )
45
+ wheel = MemoryWheel (lazy_file .name , lazy_file )
48
46
# After context manager exit, wheel.name is an invalid file by intention.
49
47
return get_wheel_distribution (wheel , canonicalize_name (name ))
50
48
except (BadZipFile , UnsupportedWheel ):
@@ -147,7 +145,7 @@ def __next__(self) -> bytes:
147
145
raise NotImplementedError
148
146
149
147
150
- class LazyZipOverHTTP (ReadOnlyIOWrapper ):
148
+ class LazyHTTPFile (ReadOnlyIOWrapper ):
151
149
"""File-like object mapped to a ZIP file over HTTP.
152
150
153
151
This uses HTTP range requests to lazily fetch the file's content,
@@ -161,20 +159,30 @@ class LazyZipOverHTTP(ReadOnlyIOWrapper):
161
159
_domains_without_negative_range : ClassVar [set [str ]] = set ()
162
160
163
161
def __init__ (
164
- self , url : str , session : Session , chunk_size : int = CONTENT_CHUNK_SIZE
162
+ self , url : str , session : Session , initial_chunk_size : int = CONTENT_CHUNK_SIZE
165
163
) -> None :
164
+ # To debug invalid virtual zips, pass delete=False below and print the file's `.name`.
166
165
super ().__init__ (cast (BinaryIO , NamedTemporaryFile ()))
167
166
168
167
self ._request_count = 0
169
168
self ._session = session
170
169
self ._url = url
171
- self ._chunk_size = chunk_size
172
170
self ._left : list [int ] = []
173
171
self ._right : list [int ] = []
174
172
175
- self ._length , initial_chunk = self ._extract_content_length ()
173
+ self ._length , initial_chunk = self ._extract_content_length (initial_chunk_size )
176
174
self .truncate (self ._length )
177
- if initial_chunk is not None :
175
+ # The central directory for
176
+ # tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is 944931 bytes, for
177
+ # a 459424488 byte file (about 486x as large as its central directory).
178
+ self ._minimum_fetch_granularity = max (initial_chunk_size , self ._length // 400 )
179
+ if initial_chunk is None :
180
+ # If we could not download any file contents yet (e.g. if negative byte
181
+ # ranges were not supported), then download the file's tail at once, hopefully
182
+ # pulling in the entire central directory.
183
+ initial_start = max (0 , self ._length - self ._minimum_fetch_granularity )
184
+ self ._download (initial_start , self ._length )
185
+ else :
178
186
self .seek (- len (initial_chunk ), io .SEEK_END )
179
187
self ._file .write (initial_chunk )
180
188
self ._left .append (self ._length - len (initial_chunk ))
@@ -192,28 +200,27 @@ def read(self, size: int = -1) -> bytes:
192
200
if size < 0 :
193
201
assert cur <= self ._length
194
202
download_size = self ._length - cur
203
+ elif size == 0 :
204
+ return b''
195
205
else :
196
- download_size = max (size , self ._chunk_size )
206
+ download_size = max (size , self ._minimum_fetch_granularity )
197
207
stop = min (cur + download_size , self ._length )
198
208
self ._download (cur , stop - 1 )
199
209
return self ._file .read (size )
200
210
201
- def __enter__ (self ) -> LazyZipOverHTTP :
211
+ def __enter__ (self ) -> LazyHTTPFile :
202
212
super ().__enter__ ()
203
213
return self
204
214
205
215
def __exit__ (self , * exc : Any ) -> None :
206
- logger .debug ("requests for url %s: %s " , self ._url , self ._request_count )
216
+ logger .debug ("%d requests for url %s" , self ._request_count , self ._url )
207
217
super ().__exit__ (* exc )
208
218
209
219
def _content_length_from_head (self ) -> int :
210
220
self ._request_count += 1
211
221
head = self ._session .head (self ._url , headers = HEADERS )
212
222
head .raise_for_status ()
213
223
assert head .status_code == codes .ok
214
- # S3 provides lowercased headers, and in the normal case these will return the
215
- # same as 'Content-Length'.
216
- # FIXME: provide documentation for this?
217
224
return int (head .headers ["content-length" ])
218
225
219
226
@staticmethod
@@ -222,10 +229,10 @@ def _parse_full_length_from_content_range(arg: str) -> Optional[int]:
222
229
return int (m .group (1 ))
223
230
return None
224
231
225
- def _try_initial_chunk_request (self ) -> tuple [int , bytes ]:
232
+ def _try_initial_chunk_request (self , initial_chunk_size : int ) -> tuple [int , bytes ]:
226
233
headers = HEADERS .copy ()
227
234
# Perform a negative range index, which is not supported by some servers.
228
- headers ["Range" ] = f"bytes=-{ self . _chunk_size } "
235
+ headers ["Range" ] = f"bytes=-{ initial_chunk_size } "
229
236
# TODO: Get range requests to be correctly cached
230
237
headers ["Cache-Control" ] = "no-cache"
231
238
# TODO: If-Match (etag) to detect file changed during fetch would be a
@@ -243,7 +250,7 @@ def _try_initial_chunk_request(self) -> tuple[int, bytes]:
243
250
if code == codes .ok :
244
251
# If this was done despite a smaller requested byte range, then we assume
245
252
# the server does not support range requests.
246
- if len (tail ) > self . _chunk_size :
253
+ if len (tail ) > initial_chunk_size :
247
254
raise HTTPRangeRequestUnsupported ("returned complete file contents" )
248
255
elif code != codes .partial_content :
249
256
raise HTTPRangeRequestUnsupported ("did not receive partial content or ok" )
@@ -253,14 +260,14 @@ def _try_initial_chunk_request(self) -> tuple[int, bytes]:
253
260
return (file_length , tail .content )
254
261
raise HTTPRangeRequestUnsupported (f"could not parse content-range: { range_arg } " )
255
262
256
- def _extract_content_length (self ) -> tuple [int , Optional [bytes ]]:
263
+ def _extract_content_length (self , initial_chunk_size : int ) -> tuple [int , Optional [bytes ]]:
257
264
domain = urlparse (self ._url ).netloc
258
265
if domain in self ._domains_without_negative_range :
259
266
return (self ._content_length_from_head (), None )
260
267
261
268
# Initial range request for just the end of the file.
262
269
try :
263
- return self ._try_initial_chunk_request ()
270
+ return self ._try_initial_chunk_request (initial_chunk_size )
264
271
except HTTPError as e :
265
272
resp = e .response
266
273
code = resp .status_code
@@ -306,7 +313,7 @@ def _stay(self) -> Iterator[None]:
306
313
def _check_zip (self ) -> None :
307
314
"""Check and download until the file is a valid ZIP."""
308
315
end = self ._length - 1
309
- for start in reversed (range (0 , end , self . _chunk_size )):
316
+ for start in reversed (range (0 , end , CONTENT_CHUNK_SIZE )):
310
317
self ._download (start , end )
311
318
with self ._stay ():
312
319
try :
@@ -363,23 +370,5 @@ def _download(self, start: int, end: int) -> None:
363
370
for start , end in self ._merge (start , end , left , right ):
364
371
response = self ._stream_response (start , end )
365
372
self .seek (start )
366
- for chunk in response .iter_content (self . _chunk_size ):
373
+ for chunk in response .iter_content (CONTENT_CHUNK_SIZE ):
367
374
self ._file .write (chunk )
368
-
369
- def prefetch_dist_info (self ) -> None :
370
- """
371
- Read contents of entire dist-info section of wheel.
372
-
373
- pip wants to read WHEEL and METADATA.
374
- """
375
- with self ._stay ():
376
- zf = ZipFile (self )
377
- infolist = zf .infolist ()
378
- for info in infolist :
379
- # should be (wheel filename without extension etc) + (.dist-info/)
380
- if ".dist-info/" in info .filename :
381
- start = info .header_offset
382
- end = zf .start_dir
383
- self .seek (start )
384
- self .read (end - start )
385
- break
0 commit comments