|
| 1 | +"""Download files with progress indicators. |
| 2 | +""" |
| 3 | +import cgi |
| 4 | +import logging |
| 5 | +import mimetypes |
| 6 | +import os |
| 7 | + |
| 8 | +from pip._vendor import requests |
| 9 | +from pip._vendor.requests.models import CONTENT_CHUNK_SIZE |
| 10 | + |
| 11 | +from pip._internal.models.index import PyPI |
| 12 | +from pip._internal.network.cache import is_from_cache |
| 13 | +from pip._internal.network.utils import response_chunks |
| 14 | +from pip._internal.utils.misc import ( |
| 15 | + format_size, |
| 16 | + redact_auth_from_url, |
| 17 | + splitext, |
| 18 | +) |
| 19 | +from pip._internal.utils.typing import MYPY_CHECK_RUNNING |
| 20 | +from pip._internal.utils.ui import DownloadProgressProvider |
| 21 | + |
| 22 | +if MYPY_CHECK_RUNNING: |
| 23 | + from typing import Iterable, Optional |
| 24 | + |
| 25 | + from pip._vendor.requests.models import Response |
| 26 | + |
| 27 | + from pip._internal.models.link import Link |
| 28 | + from pip._internal.network.session import PipSession |
| 29 | + |
| 30 | +logger = logging.getLogger(__name__) |
| 31 | + |
| 32 | + |
| 33 | +def _get_http_response_size(resp): |
| 34 | + # type: (Response) -> Optional[int] |
| 35 | + try: |
| 36 | + return int(resp.headers['content-length']) |
| 37 | + except (ValueError, KeyError, TypeError): |
| 38 | + return None |
| 39 | + |
| 40 | + |
def _prepare_download(
    resp,  # type: Response
    link,  # type: Link
    progress_bar  # type: str
):
    # type: (...) -> Iterable[bytes]
    """Log the start of a download and return its iterable of chunks.

    An info message ("Using cached ..." or "Downloading ...") is logged
    with the auth-redacted URL and, when the size is known, the
    formatted size.  The returned chunks are wrapped in a progress
    indicator only when info-level logging is enabled, the response
    does not come from the cache, and the size is either unknown or
    larger than 40 * 1000.
    """
    size = _get_http_response_size(resp)

    is_pypi_file = link.netloc == PyPI.file_storage_domain
    shown_url = link.show_url if is_pypi_file else link.url_without_fragment
    description = redact_auth_from_url(shown_url)
    if size:
        description = '{} ({})'.format(description, format_size(size))

    message = "Using cached %s" if is_from_cache(resp) else "Downloading %s"
    logger.info(message, description)

    # No progress output when logging is quieter than INFO or when the
    # response is served from the cache.
    quiet = logger.getEffectiveLevel() > logging.INFO
    if quiet or is_from_cache(resp):
        show_progress = False
    else:
        # Unknown sizes always get an indicator; known sizes only when
        # they exceed the threshold.
        show_progress = not size or size > (40 * 1000)

    chunks = response_chunks(resp, CONTENT_CHUNK_SIZE)
    if show_progress:
        return DownloadProgressProvider(progress_bar, max=size)(chunks)
    return chunks
| 83 | + |
| 84 | + |
def sanitize_content_filename(filename):
    # type: (str) -> str
    """
    Sanitize the "filename" value from a Content-Disposition header.

    Only the final path component is kept, so directory parts in the
    header value are discarded.  The special names "." and ".." pass
    through ``os.path.basename`` unchanged (e.g. "../.." yields "..")
    but name directories, not files, so they are replaced by an empty
    string.
    """
    basename = os.path.basename(filename)
    # "." and ".." are never valid file names; returning them would keep
    # a ".." path part in the result.
    if basename in (os.curdir, os.pardir):
        return ''
    return basename
| 91 | + |
| 92 | + |
def parse_content_disposition(content_disposition, default_filename):
    # type: (str, str) -> str
    """
    Extract the "filename" parameter from a Content-Disposition header.

    The default filename is returned when the header has no usable
    "filename" parameter or when sanitizing it leaves nothing.
    """
    _disposition, header_params = cgi.parse_header(content_disposition)
    raw_name = header_params.get('filename')
    if not raw_name:
        return default_filename
    # The value comes from the server, so it is sanitized to prevent
    # directory traversal through ".." path parts.
    return sanitize_content_filename(raw_name) or default_filename
| 106 | + |
| 107 | + |
def _get_http_response_filename(resp, link):
    # type: (Response, Link) -> str
    """Pick a filename for the downloaded response.

    The link's filename is the starting point and is replaced by the
    Content-Disposition filename when that header is present.  If the
    resulting name has no extension, one is guessed from the
    Content-Type header and, failing that, taken from the response URL
    when it differs from the link URL.
    """
    name = link.filename  # fallback
    disposition = resp.headers.get('content-disposition')
    if disposition:
        # The header may offer a better guess than the link itself.
        name = parse_content_disposition(disposition, name)

    suffix = splitext(name)[1]  # type: Optional[str]
    if suffix:
        return name

    # First fallback: derive the extension from the declared MIME type.
    suffix = mimetypes.guess_extension(resp.headers.get('content-type', ''))
    if suffix:
        return name + suffix

    # Second fallback: use the extension of the URL the response came
    # from, but only if it is not the link URL.
    if link.url != resp.url:
        suffix = os.path.splitext(resp.url)[1]
        if suffix:
            name += suffix
    return name
| 130 | + |
| 131 | + |
| 132 | +def _http_get_download(session, link): |
| 133 | + # type: (PipSession, Link) -> Response |
| 134 | + target_url = link.url.split('#', 1)[0] |
| 135 | + resp = session.get( |
| 136 | + target_url, |
| 137 | + # We use Accept-Encoding: identity here because requests |
| 138 | + # defaults to accepting compressed responses. This breaks in |
| 139 | + # a variety of ways depending on how the server is configured. |
| 140 | + # - Some servers will notice that the file isn't a compressible |
| 141 | + # file and will leave the file alone and with an empty |
| 142 | + # Content-Encoding |
| 143 | + # - Some servers will notice that the file is already |
| 144 | + # compressed and will leave the file alone and will add a |
| 145 | + # Content-Encoding: gzip header |
| 146 | + # - Some servers won't notice anything at all and will take |
| 147 | + # a file that's already been compressed and compress it again |
| 148 | + # and set the Content-Encoding: gzip header |
| 149 | + # By setting this to request only the identity encoding We're |
| 150 | + # hoping to eliminate the third case. Hopefully there does not |
| 151 | + # exist a server which when given a file will notice it is |
| 152 | + # already compressed and that you're not asking for a |
| 153 | + # compressed file and will then decompress it before sending |
| 154 | + # because if that's the case I don't think it'll ever be |
| 155 | + # possible to make this work. |
| 156 | + headers={"Accept-Encoding": "identity"}, |
| 157 | + stream=True, |
| 158 | + ) |
| 159 | + resp.raise_for_status() |
| 160 | + return resp |
| 161 | + |
| 162 | + |
class Download(object):
    """Plain container for a response, a filename and a chunk iterable."""

    def __init__(
        self,
        response,  # type: Response
        filename,  # type: str
        chunks,  # type: Iterable[bytes]
    ):
        # type: (...) -> None
        """Store the three values as same-named attributes."""
        self.chunks = chunks
        self.filename = filename
        self.response = response
| 174 | + |
| 175 | + |
class Downloader(object):
    """Callable that turns a link into a ``Download``."""

    def __init__(
        self,
        session,  # type: PipSession
        progress_bar,  # type: str
    ):
        # type: (...) -> None
        """Remember the session and progress bar setting for later calls."""
        self._progress_bar = progress_bar
        self._session = session

    def __call__(self, link):
        # type: (Link) -> Download
        """Request the link and package the response as a ``Download``.

        An HTTP error is logged at critical level together with its
        status code and then re-raised unchanged.
        """
        try:
            response = _http_get_download(self._session, link)
        except requests.HTTPError as exc:
            status_code = exc.response.status_code
            logger.critical(
                "HTTP error %s while getting %s", status_code, link
            )
            raise

        filename = _get_http_response_filename(response, link)
        chunks = _prepare_download(response, link, self._progress_bar)
        return Download(response, filename, chunks)
0 commit comments