Commit 182b351

make lazy wheel work against tensorflow-gpu
1 parent 8c35424 commit 182b351
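For context: pip's "lazy wheel" resolves a wheel's metadata over HTTP range requests instead of downloading the whole file, and this commit tunes that machinery to cope with very large wheels such as tensorflow-gpu. Below is a minimal standalone sketch of the underlying trick, assuming a hypothetical wheel URL and using plain requests rather than pip's internal PipSession:

# Standalone illustration only (not pip's code): ask the server for just the
# *last* N bytes of a wheel with a negative Range header. For a well-formed
# wheel that tail usually contains the ZIP central directory plus the
# .dist-info/ metadata entries, so metadata can be read without downloading
# hundreds of megabytes of payload.
import requests  # pip itself goes through its vendored PipSession

WHEEL_URL = "https://example.com/pkg-1.0-py3-none-any.whl"  # hypothetical URL

resp = requests.get(WHEEL_URL, headers={"Range": "bytes=-1000000"})
if resp.status_code == 206:  # Partial Content: negative ranges supported
    # e.g. "bytes 458424488-459424487/459424488"; the number after "/" is the
    # full file length, which LazyHTTPFile parses to size its virtual file.
    print(resp.headers["Content-Range"], len(resp.content))
else:
    # Some servers ignore the Range header and reply 200 with the whole body;
    # pip remembers such domains and falls back to HEAD plus positive ranges.
    print("negative range not honored:", resp.status_code)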

2 files changed: +75, −22 lines


src/pip/_internal/network/lazy_wheel.py

Lines changed: 71 additions & 21 deletions
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url"]
+__all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url", "LazyHTTPFile"]
 
 import io
 import logging
@@ -22,6 +22,7 @@
 from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution
 from pip._internal.network.session import PipSession as Session
 from pip._internal.network.utils import HEADERS
+from pip._internal.utils.logging import indent_log
 
 logger = logging.getLogger(__name__)
 
@@ -40,6 +41,11 @@ def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistribution:
     """
     try:
         with LazyHTTPFile(url, session) as lazy_file:
+            with indent_log():
+                logger.debug("begin prefetching for %s", name)
+                lazy_file.prefetch_contiguous_dist_info(name)
+                logger.debug("done prefetching for %s", name)
+
             # For read-only ZIP files, ZipFile only needs methods read,
             # seek, seekable and tell, not the whole IO protocol.
             wheel = MemoryWheel(lazy_file.name, lazy_file)
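For reference, a sketch of driving this entry point (pip-internal API, so names may shift between releases; the URL is hypothetical):

# Sketch of calling the entry point above via pip's internal API; the names
# come straight from this diff, but internal interfaces are subject to change.
from pip._internal.network.lazy_wheel import (
    HTTPRangeRequestUnsupported,
    dist_from_wheel_url,
)
from pip._internal.network.session import PipSession

url = "https://example.com/tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl"  # hypothetical mirror

try:
    # Triggers the prefetch_contiguous_dist_info() call added above before any
    # metadata entry is read out of the lazy zip.
    dist = dist_from_wheel_url("tensorflow-gpu", url, PipSession())
    print(dist.metadata["Name"], dist.version)
except HTTPRangeRequestUnsupported:
    # Raised when the server honors neither negative nor positive ranges;
    # callers fall back to downloading the whole wheel.
    pass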
@@ -145,6 +151,11 @@ def __next__(self) -> bytes:
         raise NotImplementedError
 
 
+# The central directory for tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is
+# 944931 bytes, for a 459424488 byte file (about 486x as large).
+_DEFAULT_INITIAL_FETCH = 1_000_000
+
+
 class LazyHTTPFile(ReadOnlyIOWrapper):
     """File-like object mapped to a ZIP file over HTTP.
 
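A quick check that the comment's arithmetic holds, and why 1_000_000 is a comfortable default:

# Sanity-checking the numbers in the comment above: the wheel is ~486x the
# size of its central directory, so a fixed 1 MB initial fetch comfortably
# covers the ~945 KB directory in a single request.
wheel_size, central_dir_size = 459424488, 944931
print(wheel_size / central_dir_size)  # ~486.2, the "about 486x" above
print(central_dir_size < 1_000_000)   # True: one initial fetch suffices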
@@ -159,7 +170,10 @@ class LazyHTTPFile(ReadOnlyIOWrapper):
     _domains_without_negative_range: ClassVar[set[str]] = set()
 
     def __init__(
-        self, url: str, session: Session, initial_chunk_size: int = CONTENT_CHUNK_SIZE
+        self,
+        url: str,
+        session: Session,
+        initial_chunk_size: int = _DEFAULT_INITIAL_FETCH,
     ) -> None:
         # Add delete=False and print the file's `.name` to debug invalid virtual zips.
         super().__init__(cast(BinaryIO, NamedTemporaryFile()))
@@ -172,21 +186,20 @@ def __init__(
 
         self._length, initial_chunk = self._extract_content_length(initial_chunk_size)
         self.truncate(self._length)
-        # The central directory for
-        # tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is 944931 bytes, for
-        # a 459424488 byte file (about 486x as large).
-        self._minimum_fetch_granularity = max(initial_chunk_size, self._length // 400)
         if initial_chunk is None:
             # If we could not download any file contents yet (e.g. if negative byte
             # ranges were not supported), then download all of this at once, hopefully
             # pulling in the entire central directory.
-            initial_start = max(0, self._length - self._minimum_fetch_granularity)
+            initial_start = max(0, self._length - initial_chunk_size)
             self._download(initial_start, self._length)
         else:
-            self.seek(-len(initial_chunk), io.SEEK_END)
-            self._file.write(initial_chunk)
-            self._left.append(self._length - len(initial_chunk))
-            self._right.append(self._length - 1)
+            # If we could download file contents, then write them to the end of the
+            # file and set up our bisect boundaries by hand.
+            with self._stay():
+                self.seek(-len(initial_chunk), io.SEEK_END)
+                self._file.write(initial_chunk)
+                self._left.append(self._length - len(initial_chunk))
+                self._right.append(self._length - 1)
 
     def read(self, size: int = -1) -> bytes:
         """Read up to size bytes from the object and return them.
@@ -195,17 +208,17 @@
         all bytes until EOF are returned. Fewer than
         size bytes may be returned if EOF is reached.
         """
-        # BUG does not download correctly if size is unspecified
         cur = self.tell()
+        logger.debug("read size %d at %d", size, cur)
         if size < 0:
             assert cur <= self._length
             download_size = self._length - cur
         elif size == 0:
-            return b''
+            return b""
         else:
-            download_size = max(size, self._minimum_fetch_granularity)
+            download_size = size
         stop = min(cur + download_size, self._length)
-        self._download(cur, stop - 1)
+        self._download(cur, stop)
         return self._file.read(size)
 
     def __enter__(self) -> LazyHTTPFile:
@@ -221,7 +234,7 @@ def _content_length_from_head(self) -> int:
         head = self._session.head(self._url, headers=HEADERS)
         head.raise_for_status()
         assert head.status_code == codes.ok
-        return int(head.headers["content-length"])
+        return int(head.headers["Content-Length"])
 
     @staticmethod
     def _parse_full_length_from_content_range(arg: str) -> Optional[int]:
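The body of _parse_full_length_from_content_range is outside this diff. Per RFC 9110, a Content-Range value carries the complete length after the slash, so a minimal stand-in parser could look like this (hypothetical helper, not pip's exact code):

import re
from typing import Optional

def parse_full_length(arg: str) -> Optional[int]:
    # Per RFC 9110, "Content-Range: bytes <start>-<end>/<complete-length>"
    # (or "bytes */<complete-length>" on a 416 response) carries the total
    # file size after the "/".
    if m := re.fullmatch(r"bytes (?:\d+-\d+|\*)/(\d+)", arg):
        return int(m.group(1))
    return None  # unparsable, or "bytes .../*" (length unknown)

assert parse_full_length("bytes 458424488-459424487/459424488") == 459424488
assert parse_full_length("bytes */459424488") == 459424488  # 416 responses
assert parse_full_length("garbage") is None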
@@ -233,6 +246,7 @@ def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]:
         headers = HEADERS.copy()
         # Perform a negative range index, which is not supported by some servers.
         headers["Range"] = f"bytes=-{initial_chunk_size}"
+        logger.debug("initial bytes request: %s", headers["Range"])
         # TODO: Get range requests to be correctly cached
         headers["Cache-Control"] = "no-cache"
         # TODO: If-Match (etag) to detect file changed during fetch would be a
@@ -242,7 +256,7 @@ def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]:
         tail = self._session.get(self._url, headers=headers)
         tail.raise_for_status()
 
-        response_length = int(tail.headers["content-length"])
+        response_length = int(tail.headers["Content-Length"])
         assert response_length == len(tail.content)
 
         code = tail.status_code
@@ -255,12 +269,14 @@ def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, bytes]:
         elif code != codes.partial_content:
             raise HTTPRangeRequestUnsupported("did not receive partial content or ok")
 
-        range_arg = tail.headers["content-range"]
+        range_arg = tail.headers["Content-Range"]
         if file_length := self._parse_full_length_from_content_range(range_arg):
             return (file_length, tail.content)
         raise HTTPRangeRequestUnsupported(f"could not parse content-range: {range_arg}")
 
-    def _extract_content_length(self, initial_chunk_size: int) -> tuple[int, Optional[bytes]]:
+    def _extract_content_length(
+        self, initial_chunk_size: int
+    ) -> tuple[int, Optional[bytes]]:
         domain = urlparse(self._url).netloc
         if domain in self._domains_without_negative_range:
             return (self._content_length_from_head(), None)
@@ -287,7 +303,7 @@ def _extract_content_length(self, initial_chunk_size: int) -> tuple[int, Optional[bytes]]:
         if code == codes.requested_range_not_satisfiable:
             # In this case, we don't have any file content yet, but we do know the
             # size the file will be, so we can return that and exit here.
-            range_arg = resp.headers["content-range"]
+            range_arg = resp.headers["Content-Range"]
             if length := self._parse_full_length_from_content_range(range_arg):
                 return (length, None)
             raise HTTPRangeRequestUnsupported(
@@ -330,7 +346,7 @@ def _stream_response(self, start: int, end: int) -> Response:
         # https://www.rfc-editor.org/rfc/rfc9110#field.content-range
         headers = HEADERS.copy()
         headers["Range"] = f"bytes={start}-{end}"
-        logger.debug("%s", headers["Range"])
+        logger.debug("streamed bytes request: %s", headers["Range"])
         # TODO: Get range requests to be correctly cached
         headers["Cache-Control"] = "no-cache"
         # TODO: If-Match (etag) to detect file changed during fetch would be a
@@ -364,6 +380,8 @@ def _merge(
 
     def _download(self, start: int, end: int) -> None:
         """Download bytes from start to end inclusively."""
+        # Reducing by 1 to get an inclusive end range.
+        end -= 1
         with self._stay():
             left = bisect_left(self._right, start)
             right = bisect_right(self._left, end)
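_download's bisect calls work against two sorted lists (self._left, self._right) that record which byte intervals are already present in the backing temp file. A self-contained sketch of that bookkeeping pattern (not pip's exact _merge implementation):

# _left/_right hold the sorted start/end offsets of byte ranges already
# written to the backing temporary file; bisect finds which existing ranges
# overlap a requested inclusive [start, end] span, so only the gaps between
# them need fresh range requests.
from bisect import bisect_left, bisect_right

left_ends: list[int] = []    # stands in for self._left
right_ends: list[int] = []   # stands in for self._right

def missing_runs(start: int, end: int) -> list[tuple[int, int]]:
    """Return the sub-ranges of [start, end] not yet downloaded, then record
    the whole merged interval as downloaded."""
    lo = bisect_left(right_ends, start)  # first cached range ending at/after start
    hi = bisect_right(left_ends, end)    # one past last range starting at/before end
    gaps, cursor = [], start
    for l, r in zip(left_ends[lo:hi], right_ends[lo:hi]):
        if l > cursor:
            gaps.append((cursor, l - 1))  # uncached bytes before this range
        cursor = max(cursor, r + 1)       # skip over the cached range
    if cursor <= end:
        gaps.append((cursor, end))        # uncached tail
    # Collapse everything touched into one interval, keeping the lists sorted.
    merged_l = min([start] + left_ends[lo:hi])
    merged_r = max([end] + right_ends[lo:hi])
    left_ends[lo:hi], right_ends[lo:hi] = [merged_l], [merged_r]
    return gaps

print(missing_runs(0, 9))   # [(0, 9)]: nothing cached yet, fetch it all
print(missing_runs(5, 19))  # [(10, 19)]: only the uncached tail is fetched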
@@ -372,3 +390,35 @@
             self.seek(start)
             for chunk in response.iter_content(CONTENT_CHUNK_SIZE):
                 self._file.write(chunk)
+
+    def prefetch_contiguous_dist_info(self, name: str) -> None:
+        """
+        Read contents of entire dist-info section of wheel.
+
+        pip will read every entry in this directory when generating a dist from a wheel,
+        so prepopulating the file contents avoids waiting for multiple range requests.
+        """
+        dist_info_prefix = re.compile(r"^[^/]*\.dist-info/")
+        start: Optional[int] = None
+        end: Optional[int] = None
+
+        zf = ZipFile(self)
+
+        for info in zf.infolist():
+            if start is None:
+                if dist_info_prefix.search(info.filename):
+                    start = info.header_offset
+                    continue
+            else:
+                if not dist_info_prefix.search(info.filename):
+                    end = info.header_offset
+                    break
+        if start is None:
+            raise UnsupportedWheel(
+                f"no {dist_info_prefix} directory found for {name} in {self.name}"
+            )
+        # If the last entries of the zip are the .dist-info/ dir (as usual), then give
+        # us everything until the start of the central directory.
+        if end is None:
+            end = zf.start_dir
+        self._download(start, end)
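prefetch_contiguous_dist_info leans on two zipfile facts: ZipInfo.header_offset is each entry's local-header position (documented), and ZipFile.start_dir marks where the central directory begins (internal but long-stable in CPython). Inspecting a wheel on disk shows the contiguous layout being exploited (path hypothetical):

# Inspecting the layout prefetch_contiguous_dist_info() exploits, on a local
# wheel file. In most wheels the .dist-info/ entries sit last, directly before
# the central directory, so [first .dist-info offset, zf.start_dir) is one
# contiguous byte range -- exactly what the new method downloads.
from zipfile import ZipFile

with ZipFile("tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl") as zf:
    for info in zf.infolist():
        if ".dist-info/" in info.filename:
            print(f"{info.header_offset:>12} {info.filename}")
    print(f"{zf.start_dir:>12} <central directory>")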

src/pip/_internal/utils/wheel.py

Lines changed: 4 additions & 1 deletion
@@ -10,6 +10,7 @@
 from pip._vendor.packaging.utils import canonicalize_name
 
 from pip._internal.exceptions import UnsupportedWheel
+from pip._internal.network.lazy_wheel import LazyHTTPFile
 
 VERSION_COMPATIBLE = (1, 0)
 
@@ -69,8 +70,10 @@ def wheel_dist_info_dir(source: ZipFile, name: str) -> str:
 
 
 def read_wheel_metadata_file(source: ZipFile, path: str) -> bytes:
+    if isinstance(source.fp, LazyHTTPFile):
+        logger.debug("extracting entry '%s' from lazy zip '%s'", path, source.fp.name)
+
     try:
-        logger.debug("extracting entry '%s' from zip '%s'", path, source.fp.name)
         return source.read(path)
     # BadZipFile for general corruption, KeyError for missing entry,
     # and RuntimeError for password-protected files
