Commit ec32ee4

NivekT authored and facebook-github-bot committed
Adding timeout option for GDriveReader (#153)
Summary:
Pull Request resolved: #153

Fixes #132
Fixes #137

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D33370399

Pulled By: NivekT

fbshipit-source-id: d20d6b3b17103bbeee79c788732c76bfb4f29f76
1 parent 7899c86 · commit ec32ee4
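For context (not part of this commit), a minimal usage sketch of the new option. It assumes torchdata is installed, that IterableWrapper and GDriveReaderDataPipe are importable from the paths below, and uses a placeholder Google Drive URL and an illustrative timeout value.

from torchdata.datapipes.iter import IterableWrapper
from torchdata.datapipes.iter.load.online import GDriveReaderDataPipe

# A source datapipe of Google Drive file URLs (placeholder file ID).
source_dp = IterableWrapper(["https://drive.google.com/uc?export=download&id=SOME_FILE_ID"])

# timeout is the new keyword-only option; it is forwarded to requests.Session.get.
gdrive_dp = GDriveReaderDataPipe(source_dp, timeout=30.0)

for file_name, stream in gdrive_dp:
    print(file_name)         # file name parsed from the content-disposition header
    payload = stream.read()  # StreamWrapper around the raw HTTP response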

File tree

2 files changed: +44 -40 lines


torchdata/datapipes/iter/load/online.py

Lines changed: 41 additions & 36 deletions
@@ -1,6 +1,6 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 import re
-from typing import Iterator, Tuple
+from typing import Iterator, Optional, Tuple
 from urllib.parse import urlparse

 import requests
@@ -10,7 +10,7 @@
 from torchdata.datapipes.utils import StreamWrapper


-def _get_response_from_http(url: str, *, timeout: float) -> Tuple[str, StreamWrapper]:
+def _get_response_from_http(url: str, *, timeout: Optional[float]) -> Tuple[str, StreamWrapper]:
     try:
         with requests.Session() as session:
             if timeout is None:
@@ -29,15 +29,14 @@ def _get_response_from_http(url: str, *, timeout: float) -> Tuple[str, StreamWra
 class HTTPReaderIterDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]):
     r""":class:`HTTPReaderIterDataPipe`

-    Iterable DataPipe that takes file URLs (http URLs pointing to files), and
-    yields tuples of file URL and IO stream
+    Iterable DataPipe that takes file URLs (http URLs pointing to files), and yields tuples of file URL and IO stream.

     Args:
         source_datapipe: a DataPipe that contains URLs
-        timeout : timeout in seconds for http request
+        timeout: timeout in seconds for http request
     """

-    def __init__(self, source_datapipe: IterDataPipe[str], timeout=None) -> None:
+    def __init__(self, source_datapipe: IterDataPipe[str], timeout: Optional[float] = None) -> None:
         self.source_datapipe: IterDataPipe[str] = source_datapipe
         self.timeout = timeout

@@ -49,47 +48,54 @@ def __len__(self) -> int:
         return len(self.source_datapipe)


-def _get_response_from_google_drive(url: str) -> Tuple[str, StreamWrapper]:
+def _get_response_from_google_drive(url: str, *, timeout: Optional[float]) -> Tuple[str, StreamWrapper]:
     confirm_token = None
-    session = requests.Session()
-    response = session.get(url, stream=True)
-    for k, v in response.cookies.items():
-        if k.startswith("download_warning"):
-            confirm_token = v
-    if confirm_token is None:
-        if "Quota exceeded" in str(response.content):
-            raise RuntimeError(f"Google drive link {url} is currently unavailable, because the quota was exceeded.")
-
-    if confirm_token:
-        url = url + "&confirm=" + confirm_token
-
-    response = session.get(url, stream=True)
-
-    if "content-disposition" not in response.headers:
-        raise RuntimeError("Internal error: headers don't contain content-disposition.")
-
-    filename = re.findall('filename="(.+)"', response.headers["content-disposition"])
-    if filename is None:
-        raise RuntimeError("Filename could not be autodetected")
+    with requests.Session() as session:
+        if timeout is None:
+            response = session.get(url, stream=True)
+        else:
+            response = session.get(url, timeout=timeout, stream=True)
+        for k, v in response.cookies.items():
+            if k.startswith("download_warning"):
+                confirm_token = v
+        if confirm_token is None:
+            if "Quota exceeded" in str(response.content):
+                raise RuntimeError(f"Google drive link {url} is currently unavailable, because the quota was exceeded.")
+
+        if confirm_token:
+            url = url + "&confirm=" + confirm_token
+
+        if timeout is None:
+            response = session.get(url, stream=True)
+        else:
+            response = session.get(url, timeout=timeout, stream=True)
+
+        if "content-disposition" not in response.headers:
+            raise RuntimeError("Internal error: headers don't contain content-disposition.")
+
+        filename = re.findall('filename="(.+)"', response.headers["content-disposition"])
+        if filename is None:
+            raise RuntimeError("Filename could not be autodetected")
     return filename[0], StreamWrapper(response.raw)


 class GDriveReaderDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]):
     r"""
-    Iterable DataPipe that takes URLs point at GDrive files, and
-    yields tuples of file name and IO stream
+    Iterable DataPipe that takes URLs point at GDrive files, and yields tuples of file name and IO stream.

     Args:
         source_datapipe: a DataPipe that contains URLs to GDrive files
+        timeout: timeout in seconds for http request
     """
     source_datapipe: IterDataPipe[str]

-    def __init__(self, source_datapipe: IterDataPipe[str]) -> None:
+    def __init__(self, source_datapipe: IterDataPipe[str], *, timeout: Optional[float] = None) -> None:
         self.source_datapipe = source_datapipe
+        self.timeout = timeout

     def __iter__(self) -> Iterator[Tuple[str, StreamWrapper]]:
         for url in self.source_datapipe:
-            yield _get_response_from_google_drive(url)
+            yield _get_response_from_google_drive(url, timeout=self.timeout)

     def __len__(self) -> int:
         return len(self.source_datapipe)
@@ -98,15 +104,15 @@ def __len__(self) -> int:
 class OnlineReaderIterDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]):
     r""":class:
     Iterable DataPipe that takes file URLs (can be http URLs pointing to files or URLs to GDrive files), and
-    yields tuples of file URL and IO stream
+    yields tuples of file URL and IO stream.

     Args:
         source_datapipe: a DataPipe that contains URLs
-        timeout : timeout in seconds for http request
+        timeout: timeout in seconds for http request
     """
     source_datapipe: IterDataPipe[str]

-    def __init__(self, source_datapipe: IterDataPipe[str], *, timeout=None) -> None:
+    def __init__(self, source_datapipe: IterDataPipe[str], *, timeout: Optional[float] = None) -> None:
         self.source_datapipe = source_datapipe
         self.timeout = timeout

@@ -115,8 +121,7 @@ def __iter__(self) -> Iterator[Tuple[str, StreamWrapper]]:
             parts = urlparse(url)

             if re.match(r"(drive|docs)[.]google[.]com", parts.netloc):
-                # TODO(137): can this also have a timeout?
-                yield _get_response_from_google_drive(url)
+                yield _get_response_from_google_drive(url, timeout=self.timeout)
             else:
                 yield _get_response_from_http(url, timeout=self.timeout)
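A brief, hedged sketch of what the dispatch above means in practice: after this change, OnlineReaderIterDataPipe forwards the same timeout to both the Google Drive branch and the plain HTTP branch (previously only the HTTP branch honored it). The URLs below are placeholders and the example is illustrative, not taken from the commit.

from torchdata.datapipes.iter import IterableWrapper
from torchdata.datapipes.iter.load.online import OnlineReaderIterDataPipe

urls = IterableWrapper([
    "https://drive.google.com/uc?export=download&id=SOME_FILE_ID",  # routed to the GDrive helper
    "https://example.com/archive.tar.gz",                           # routed to the HTTP helper
])

# The single timeout (in seconds) now applies on both code paths.
online_dp = OnlineReaderIterDataPipe(urls, timeout=10.0)

for name_or_url, stream in online_dp:
    print(name_or_url)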

torchdata/datapipes/iter/util/hashchecker.py

Lines changed: 3 additions & 4 deletions
@@ -58,7 +58,7 @@ def __iter__(self) -> Iterator[Tuple[str, StreamWrapper]]:
                 hash_func.update(data)
             # File Stream
             else:
-                # Not all of streams have `read(bytes)` method.
+                # Not all streams have `read(bytes)` method.
                 # `__iter__` method is chosen because it is a common interface for IOBase.
                 for d in data:
                     hash_func.update(d)
@@ -72,9 +72,8 @@ def __iter__(self) -> Iterator[Tuple[str, StreamWrapper]]:

             if hash_func.hexdigest() != self.hash_dict[file_name]:
                 raise RuntimeError(
-                    "The hash {} of {} does not match. Delete the file manually and retry.".format(
-                        hash_func.hexdigest(), file_name
-                    )
+                    f"The computed hash {hash_func.hexdigest()} of {file_name} does not match the expected"
+                    f"hash {self.hash_dict[file_name]}. Delete the file manually and retry."
                 )

             if isinstance(data, (str, bytes, bytearray)):
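For reference, a small hedged sketch of the comparison behind the reworded error message, done directly with hashlib outside the datapipe; the file name, payload, and expected digest are made up for illustration.

import hashlib

# Hypothetical expected digest (sha256 of the string "test", deliberately
# mismatched with the payload below so the error path is exercised).
expected = {"example.bin": "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08"}

file_name = "example.bin"
digest = hashlib.sha256(b"hello").hexdigest()

if digest != expected[file_name]:
    raise RuntimeError(
        f"The computed hash {digest} of {file_name} does not match the expected "
        f"hash {expected[file_name]}. Delete the file manually and retry."
    )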
