Skip to content

Commit 95cfe93

Browse files
rewrite some parts of lazy wheel
1 parent 1263f8d commit 95cfe93

File tree

2 files changed

+30
-40
lines changed

2 files changed

+30
-40
lines changed

src/pip/_internal/network/lazy_wheel.py

Lines changed: 29 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,10 @@ def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistributi
3939
is raised.
4040
"""
4141
try:
42-
with LazyZipOverHTTP(url, session) as zf:
43-
zf.prefetch_dist_info()
44-
42+
with LazyHTTPFile(url, session) as lazy_file:
4543
# For read-only ZIP files, ZipFile only needs methods read,
4644
# seek, seekable and tell, not the whole IO protocol.
47-
wheel = MemoryWheel(zf.name, zf)
45+
wheel = MemoryWheel(lazy_file.name, lazy_file)
4846
# After the context manager exits, wheel.name is intentionally an invalid file.
4947
return get_wheel_distribution(wheel, canonicalize_name(name))
5048
except (BadZipFile, UnsupportedWheel):
@@ -147,7 +145,7 @@ def __next__(self) -> bytes:
147145
raise NotImplementedError
148146

149147

150-
class LazyZipOverHTTP(ReadOnlyIOWrapper):
148+
class LazyHTTPFile(ReadOnlyIOWrapper):
151149
"""File-like object mapped to a ZIP file over HTTP.
152150
153151
This uses HTTP range requests to lazily fetch the file's content,
@@ -161,20 +159,30 @@ class LazyZipOverHTTP(ReadOnlyIOWrapper):
161159
_domains_without_negative_range: ClassVar[set[str]] = set()
162160

163161
def __init__(
164-
self, url: str, session: Session, chunk_size: int = CONTENT_CHUNK_SIZE
162+
self, url: str, session: Session, initial_chunk_size: int = CONTENT_CHUNK_SIZE
165163
) -> None:
164+
# Add delete=False and print the file's `.name` to debug invalid virtual zips.
166165
super().__init__(cast(BinaryIO, NamedTemporaryFile()))
167166

168167
self._request_count = 0
169168
self._session = session
170169
self._url = url
171-
self._chunk_size = chunk_size
172170
self._left: list[int] = []
173171
self._right: list[int] = []
174172

175-
self._length, initial_chunk = self._extract_content_length()
173+
self._length, initial_chunk = self._extract_content_length(initial_chunk_size)
176174
self.truncate(self._length)
177-
if initial_chunk is not None:
175+
# The central directory for
176+
# tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is 944931 bytes, for
177+
# a 459424488 byte file (the file is about 486x as large as its central directory).
178+
self._minimum_fetch_granularity = max(initial_chunk_size, self._length // 400)
179+
if initial_chunk is None:
180+
# If we could not download any file contents yet (e.g. if negative byte
181+
# ranges were not supported), then download all of this at once, hopefully
182+
# pulling in the entire central directory.
183+
initial_start = max(0, self._length - self._minimum_fetch_granularity)
184+
self._download(initial_start, self._length)
185+
else:
178186
self.seek(-len(initial_chunk), io.SEEK_END)
179187
self._file.write(initial_chunk)
180188
self._left.append(self._length - len(initial_chunk))
@@ -192,28 +200,27 @@ def read(self, size: int = -1) -> bytes:
192200
if size < 0:
193201
assert cur <= self._length
194202
download_size = self._length - cur
203+
elif size == 0:
204+
return b''
195205
else:
196-
download_size = max(size, self._chunk_size)
206+
download_size = max(size, self._minimum_fetch_granularity)
197207
stop = min(cur + download_size, self._length)
198208
self._download(cur, stop - 1)
199209
return self._file.read(size)
200210

201-
def __enter__(self) -> LazyZipOverHTTP:
211+
def __enter__(self) -> LazyHTTPFile:
202212
super().__enter__()
203213
return self
204214

205215
def __exit__(self, *exc: Any) -> None:
206-
logger.debug("requests for url %s: %s", self._url, self._request_count)
216+
logger.debug("%d requests for url %s", self._request_count, self._url)
207217
super().__exit__(*exc)
208218

209219
def _content_length_from_head(self) -> int:
210220
self._request_count += 1
211221
head = self._session.head(self._url, headers=HEADERS)
212222
head.raise_for_status()
213223
assert head.status_code == codes.ok
214-
# S3 provides lowercased headers, and in the normal case these will return the
215-
# same as 'Content-Length'.
216-
# FIXME: provide documentation for this?
217224
return int(head.headers["content-length"])
218225

219226
@staticmethod
@@ -222,10 +229,10 @@ def _parse_full_length_from_content_range(arg: str) -> Optional[int]:
222229
return int(m.group(1))
223230
return None
224231

225-
def _try_initial_chunk_request(self) -> tuple[int, bytes]:
232+
def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]:
226233
headers = HEADERS.copy()
227234
# Perform a negative range index, which is not supported by some servers.
228-
headers["Range"] = f"bytes=-{self._chunk_size}"
235+
headers["Range"] = f"bytes=-{initial_chunk_size}"
229236
# TODO: Get range requests to be correctly cached
230237
headers["Cache-Control"] = "no-cache"
231238
# TODO: If-Match (etag) to detect file changed during fetch would be a
@@ -243,7 +250,7 @@ def _try_initial_chunk_request(self) -> tuple[int, bytes]:
243250
if code == codes.ok:
244251
# If this was done despite a smaller requested byte range, then we assume
245252
# the server does not support range requests.
246-
if len(tail) > self._chunk_size:
253+
if len(tail) > initial_chunk_size:
247254
raise HTTPRangeRequestUnsupported("returned complete file contents")
248255
elif code != codes.partial_content:
249256
raise HTTPRangeRequestUnsupported("did not receive partial content or ok")
@@ -253,14 +260,14 @@ def _try_initial_chunk_request(self) -> tuple[int, bytes]:
253260
return (file_length, tail.content)
254261
raise HTTPRangeRequestUnsupported(f"could not parse content-range: {range_arg}")
255262

256-
def _extract_content_length(self) -> tuple[int, Optional[bytes]]:
263+
def _extract_content_length(self, initial_chunk_size: int) -> tuple[int, Optional[bytes]]:
257264
domain = urlparse(self._url).netloc
258265
if domain in self._domains_without_negative_range:
259266
return (self._content_length_from_head(), None)
260267

261268
# Initial range request for just the end of the file.
262269
try:
263-
return self._try_initial_chunk_request()
270+
return self._try_initial_chunk_request(initial_chunk_size)
264271
except HTTPError as e:
265272
resp = e.response
266273
code = resp.status_code
@@ -306,7 +313,7 @@ def _stay(self) -> Iterator[None]:
306313
def _check_zip(self) -> None:
307314
"""Check and download until the file is a valid ZIP."""
308315
end = self._length - 1
309-
for start in reversed(range(0, end, self._chunk_size)):
316+
for start in reversed(range(0, end, CONTENT_CHUNK_SIZE)):
310317
self._download(start, end)
311318
with self._stay():
312319
try:
@@ -363,23 +370,5 @@ def _download(self, start: int, end: int) -> None:
363370
for start, end in self._merge(start, end, left, right):
364371
response = self._stream_response(start, end)
365372
self.seek(start)
366-
for chunk in response.iter_content(self._chunk_size):
373+
for chunk in response.iter_content(CONTENT_CHUNK_SIZE):
367374
self._file.write(chunk)
368-
369-
def prefetch_dist_info(self) -> None:
370-
"""
371-
Read contents of entire dist-info section of wheel.
372-
373-
pip wants to read WHEEL and METADATA.
374-
"""
375-
with self._stay():
376-
zf = ZipFile(self)
377-
infolist = zf.infolist()
378-
for info in infolist:
379-
# should be (wheel filename without extension etc) + (.dist-info/)
380-
if ".dist-info/" in info.filename:
381-
start = info.header_offset
382-
end = zf.start_dir
383-
self.seek(start)
384-
self.read(end - start)
385-
break

src/pip/_internal/utils/wheel.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def wheel_dist_info_dir(source: ZipFile, name: str) -> str:
7070

7171
def read_wheel_metadata_file(source: ZipFile, path: str) -> bytes:
7272
try:
73+
logger.debug("extracting entry '%s' from zip '%s'", path, source.fp.name)
7374
return source.read(path)
7475
# BadZipFile for general corruption, KeyError for missing entry,
7576
# and RuntimeError for password-protected files

0 commit comments

Comments
 (0)