Skip to content
This repository was archived by the owner on Sep 5, 2020. It is now read-only.

Commit 1e8b470

Browse files
committed
Draft the initial version
This is inspired by Daniel Holth's module httpfile and is based on the discussion over pypa/pip#7819
1 parent 52cdebd commit 1e8b470

File tree

4 files changed

+238
-0
lines changed

4 files changed

+238
-0
lines changed

.travis.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
branches:
2+
only:
3+
- master
4+
- /^\d+(\.\d+)+((a|b|rc)\d+)?(\.post\d+)?(\.dev\d+)?$/
5+
6+
language: python
7+
8+
install: pip install tox
9+
script: tox

lazip.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
# Lazy ZIP over HTTP
2+
# Copyright (C) 2020 Nguyễn Gia Phong
3+
#
4+
# This file is part of lazip.
5+
#
6+
# lazip is free software: you can redistribute it and/or modify it
7+
# under the terms of the GNU Lesser General Public License as published
8+
# by the Free Software Foundation, either version 3 of the License,
9+
# or (at your option) any later version.
10+
#
11+
# lazip is distributed in the hope that it will be useful,
12+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
# GNU Lesser General Public License for more details.
15+
#
16+
# You should have received a copy of the GNU Lesser General Public License
17+
# along with lazip. If not, see <https://www.gnu.org/licenses/>.
18+
19+
"""Lazy ZIP over HTTP"""
20+
21+
__version__ = '0.0.1'
22+
__all__ = ['Lazip']
23+
24+
from bisect import bisect_left, bisect_right
25+
from contextlib import contextmanager
26+
from tempfile import NamedTemporaryFile
27+
from typing import Dict, Iterator, List, Optional, Tuple
28+
from zipfile import BadZipFile, ZipFile
29+
30+
from pip._internal.network.utils import response_chunks
31+
from pip._internal.utils.wheel import pkg_resources_distribution_for_wheel
32+
from requests import Session
33+
from requests.models import CONTENT_CHUNK_SIZE, Response
34+
35+
HEADERS: Dict[str, str] = {'Accept-Encoding': 'identity'}
36+
37+
38+
def init_range(stop: int, size: int) -> Iterator[Tuple[int, int]]:
    """Return an iterator of intervals to fetch a file in reverse order.

    The intervals are inclusive (start, end) pairs, each spanning
    at most size bytes.  They are yielded from the end of the file
    toward its beginning and together cover offsets 0 through stop-1.

    Args:
        stop (int): Offset right after the last byte to be covered
        size (int): Maximum number of bytes per interval

    Raises:
        ValueError: If size is not positive (raised upon iteration)
    """
    if size <= 0:
        # A non-positive size would never move start toward 0,
        # making the loop below run forever.
        raise ValueError(f'size must be positive, got {size}')
    start = stop - size
    while start > 0:
        yield start, stop-1
        stop = start
        start -= size
    # The remaining head of the file, possibly shorter than size.
    yield 0, stop-1
46+
47+
48+
class Lazip:
    """File-like object mapped to a ZIP file over HTTP.

    This uses HTTP range requests to lazily fetch the file's content,
    which is supposed to be fed to ZipFile.  Fetched data are stored
    in a temporary file truncated to the remote file's length, and
    the downloaded intervals are tracked by the sorted lists left
    and right, holding their inclusive start and end offsets.

    Args:
        session (Session): Session used to make the HTTP requests
        url (str): Location of the remote ZIP file
        chunk_size (int): Size in bytes of each initial range request
            and of each chunk written to the temporary file
    """

    def __init__(self, session: Session, url: str,
                 chunk_size: int = CONTENT_CHUNK_SIZE) -> None:
        head = session.head(url, headers=HEADERS)
        head.raise_for_status()
        # NOTE(review): raise_for_status() only rejects error responses;
        # any other non-200 status is caught by this assertion,
        # which is stripped when Python runs with -O.
        assert head.status_code == 200
        self.session, self.url, self.chunk_size = session, url, chunk_size
        self.length = int(head.headers['Content-Length'])
        self.file = NamedTemporaryFile()
        # Reserve the full size so that any offset can be written to.
        self.file.truncate(self.length)
        self.left: List[int] = []
        self.right: List[int] = []
        accept_ranges = head.headers.get('Accept-Ranges', 'none')
        try:
            self.check_zip('bytes' in accept_ranges)
        except BaseException:
            # Do not leave the temporary file open if the initial
            # download fails.
            self.file.close()
            raise

    def __enter__(self) -> 'Lazip':
        self.file.__enter__()
        return self

    def __exit__(self, *exc) -> Optional[bool]:
        return self.file.__exit__(*exc)

    @property
    def name(self):
        """File name of the underlying temporary file."""
        return self.file.name

    def seekable(self):
        """Return whether random access is supported, which is True."""
        return True

    @contextmanager
    def stay(self) -> Iterator[None]:
        """Return a context manager keeping the position.

        At the end of the block, seek back to original position.
        """
        pos = self.tell()
        try:
            yield
        finally:
            self.seek(pos)

    def check_zip(self, range_request: bool) -> None:
        """Check and download until the file is a valid ZIP.

        Args:
            range_request (bool): Whether the server advertises
                support for byte range requests
        """
        if not range_request:
            # Without range support, fetch the whole file at once.
            end = self.length - 1
            self.download(0, end)
            self.left, self.right = [0], [end]
            return
        # Fetch chunks from the end of the file backward until ZipFile
        # accepts what has been downloaded.
        for start, end in init_range(self.length, self.chunk_size):
            self.download(start, end)
            with self.stay():
                try:
                    ZipFile(self)
                except BadZipFile:
                    pass
                else:
                    break

    def stream_response(self, start: int, end: int,
                        base_headers: Dict[str, str] = HEADERS) -> Response:
        """Return HTTP response to a range request from start to end."""
        headers = {'Range': f'bytes={start}-{end}'}
        headers.update(base_headers)
        return self.session.get(self.url, headers=headers, stream=True)

    def merge(self, start: int, end: int,
              left: int, right: int) -> Iterator[Tuple[int, int]]:
        """Return an iterator of intervals to be fetched.

        Once the iterator is exhausted, the overlapping downloaded
        intervals are replaced by the single interval covering both
        them and the needed one.

        Args:
            start (int): Start of needed interval
            end (int): End of needed interval
            left (int): Index of first overlapping downloaded data
            right (int): Index after last overlapping downloaded data
        """
        lslice, rslice = self.left[left:right], self.right[left:right]
        # The merged interval spans from the lowest start
        # to the highest end among all intervals involved.
        i = start = min(start, min(lslice, default=start))
        end = max(end, max(rslice, default=end))
        for j, k in zip(lslice, rslice):
            # Yield the gap before each already downloaded interval.
            if j > i:
                yield i, j-1
            i = k + 1
        # Yield what remains after the last downloaded interval.
        if i <= end:
            yield i, end
        self.left[left:right], self.right[left:right] = [start], [end]

    def download(self, start: int, end: int) -> None:
        """Download bytes from start to end inclusively.

        Only the parts which have not been fetched are requested.
        Nothing is done if the interval is empty (start > end).
        """
        if start > end:
            # Empty interval, e.g. reading at EOF: avoid recording
            # a degenerate interval in left and right.
            return
        with self.stay():
            i = bisect_left(self.right, start)
            j = bisect_right(self.left, end)
            for first, last in self.merge(start, end, i, j):
                response = self.stream_response(first, last)
                response.raise_for_status()
                self.seek(first)
                for chunk in response_chunks(response, self.chunk_size):
                    self.file.write(chunk)

    def read(self, size: int = -1) -> bytes:
        """Read up to size bytes from the object and return them.

        As a convenience, if size is unspecified or -1,
        all bytes until EOF are returned.  Fewer than
        size bytes may be returned if EOF is reached.
        """
        start = self.tell()
        stop = start + size if 0 <= size <= self.length-start else self.length
        # Make sure the requested bytes are available locally.
        self.download(start, stop-1)
        return self.file.read(size)

    def seek(self, offset: int, whence: int = 0) -> int:
        """Change stream position and return the new absolute position.

        Seek to offset relative to the position indicated by whence:
        * 0: Start of stream (the default).  offset should be >= 0;
        * 1: Current position - offset may be negative;
        * 2: End of stream - offset usually negative.
        """
        return self.file.seek(offset, whence)

    def tell(self) -> int:
        """Return the current position."""
        return self.file.tell()

    def close(self) -> None:
        """Close the file."""
        self.file.close()
179+
180+
181+
if __name__ == '__main__':
    # Demonstration: print the requirements declared by a wheel
    # hosted on PyPI, reading it through Lazip.
    url = ('https://files.pythonhosted.org/packages/17/d9/'
           'ff8955ce17c080c956cd5eed9c2da4de139d5eeabb9f9ebf2d981acef31d/'
           'brutalmaze-0.9.2-py3-none-any.whl')
    with Lazip(Session(), url) as wheel:
        archive = ZipFile(wheel)
        distribution = pkg_resources_distribution_for_wheel(
            archive, 'brutalmaze', wheel.name)
        print(distribution.requires())

pyproject.toml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
[build-system]
2+
requires = ['flit_core >=2,<3']
3+
build-backend = 'flit_core.buildapi'
4+
5+
[tool.flit.metadata]
6+
module = 'lazip'
7+
author = 'Nguyễn Gia Phong'
8+
author-email = '[email protected]'
9+
home-page = 'https://github.com/McSinyx/lazip'
10+
requires = ['pip', 'requests']
11+
description-file = 'README.md'
12+
classifiers = [
13+
'Development Status :: 1 - Planning',
14+
'Intended Audience :: Developers',
15+
'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)',
16+
'Natural Language :: English',
17+
'Operating System :: OS Independent',
18+
'Programming Language :: Python',
19+
'Programming Language :: Python :: 3 :: Only',
20+
'Topic :: Internet :: WWW/HTTP',
21+
'Typing :: Typed']
22+
requires-python = '>=3.6'
23+
keywords = 'zip,http,range,lazy'
24+
license = 'LGPLv3+'

tox.ini

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[tox]
2+
envlist = py
3+
minversion = 3.3
4+
isolated_build = true
5+
6+
[testenv]
7+
deps =
8+
flake8-builtins
9+
isort[requirements]
10+
commands =
11+
flake8
12+
isort -c --diff
13+
14+
[flake8]
15+
hang-closing = True
16+
ignore = W503, E125, E225, E226, E227, E701, E704
17+
; See https://github.com/PyCQA/pycodestyle/issues/906
18+
;max-doc-length = 72

0 commit comments

Comments
 (0)