Skip to content

Commit 52ffd9a

Browse files
committed
Merge pull request #3808 from jreback/iocom
CLN: refactored url accessing and filepath conversion from urls to io.common
2 parents caddebe + 0bfef8a commit 52ffd9a

File tree

5 files changed

+91
-71
lines changed

5 files changed

+91
-71
lines changed

pandas/io/common.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
""" Common api utilities """
2+
3+
import urlparse
4+
from pandas.util import py3compat
5+
6+
# Set of URL schemes this module treats as valid protocols; the empty
# scheme (a bare path) is deliberately excluded.
_VALID_URLS = (set(urlparse.uses_relative) |
               set(urlparse.uses_netloc) |
               set(urlparse.uses_params)) - set([''])
9+
10+
11+
def _is_url(url):
    """Check to see if a URL has a valid protocol.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    isurl : bool
        If `url` has a valid protocol return True otherwise False.
    """
    try:
        return urlparse.urlparse(url).scheme in _VALID_URLS
    # NOTE: narrowed from a bare ``except:`` which would also swallow
    # SystemExit/KeyboardInterrupt; non-string input raises AttributeError
    # or TypeError from urlparse and is treated as "not a URL".
    except Exception:
        return False
27+
28+
def _is_s3_url(url):
    """Check for an s3 url.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    bool
        True if `url` uses the ``s3://`` scheme, False otherwise
        (including when `url` is not a string at all).
    """
    try:
        return urlparse.urlparse(url).scheme == 's3'
    # narrowed from a bare ``except:`` -- only swallow ordinary errors
    # (e.g. AttributeError/TypeError on non-string input), not
    # SystemExit/KeyboardInterrupt
    except Exception:
        return False
34+
35+
def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
    """ if the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding

    Raises
    ------
    ImportError
        If an s3 url is given but the optional ``boto`` dependency is
        not installed.
    """

    if _is_url(filepath_or_buffer):
        from urllib2 import urlopen
        filepath_or_buffer = urlopen(filepath_or_buffer)
        if py3compat.PY3:  # pragma: no cover
            # BUGFIX: StringIO was used below but never imported in this
            # module, raising NameError; import it locally, matching the
            # py2 style used elsewhere in pandas.io
            from StringIO import StringIO
            if encoding:
                errors = 'strict'
            else:
                errors = 'replace'
                encoding = 'utf-8'
            bytes = filepath_or_buffer.read()
            filepath_or_buffer = StringIO(bytes.decode(encoding, errors))
            return filepath_or_buffer, encoding
        return filepath_or_buffer, None

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:  # narrowed from a bare ``except:``
            raise ImportError("boto is required to handle s3 files")
        # BUGFIX: StringIO was never imported in this module (NameError)
        from StringIO import StringIO
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = urlparse.urlparse(filepath_or_buffer)
        conn = boto.connect_s3()
        b = conn.get_bucket(parsed_url.netloc)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = StringIO(k.get_contents_as_string())
        return filepath_or_buffer, None

    # plain filepath or already-open buffer: pass through untouched
    return filepath_or_buffer, None

pandas/io/html.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import numpy as np
2121

2222
from pandas import DataFrame, MultiIndex, isnull
23-
from pandas.io.parsers import _is_url
23+
from pandas.io.common import _is_url
2424

2525

2626
try:

pandas/io/parsers.py

Lines changed: 2 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from StringIO import StringIO
55
import re
66
from itertools import izip
7-
import urlparse
87
import csv
98

109
import numpy as np
@@ -15,6 +14,7 @@
1514
import pandas.core.common as com
1615
from pandas.util import py3compat
1716
from pandas.io.date_converters import generic_parser
17+
from pandas.io.common import get_filepath_or_buffer
1818

1919
from pandas.util.decorators import Appender
2020

@@ -176,68 +176,14 @@ class DateConversionError(Exception):
176176
""" % (_parser_params % _fwf_widths)
177177

178178

179-
_VALID_URLS = set(urlparse.uses_relative + urlparse.uses_netloc +
180-
urlparse.uses_params)
181-
_VALID_URLS.discard('')
182-
183-
184-
def _is_url(url):
185-
"""Check to see if a URL has a valid protocol.
186-
187-
Parameters
188-
----------
189-
url : str or unicode
190-
191-
Returns
192-
-------
193-
isurl : bool
194-
If `url` has a valid protocol return True otherwise False.
195-
"""
196-
try:
197-
return urlparse.urlparse(url).scheme in _VALID_URLS
198-
except:
199-
return False
200-
201-
def _is_s3_url(url):
202-
""" Check for an s3 url """
203-
try:
204-
return urlparse.urlparse(url).scheme == 's3'
205-
except:
206-
return False
207-
208179
def _read(filepath_or_buffer, kwds):
209180
"Generic reader of line files."
210181
encoding = kwds.get('encoding', None)
211182
skipfooter = kwds.pop('skipfooter', None)
212183
if skipfooter is not None:
213184
kwds['skip_footer'] = skipfooter
214185

215-
if isinstance(filepath_or_buffer, basestring):
216-
if _is_url(filepath_or_buffer):
217-
from urllib2 import urlopen
218-
filepath_or_buffer = urlopen(filepath_or_buffer)
219-
if py3compat.PY3: # pragma: no cover
220-
if encoding:
221-
errors = 'strict'
222-
else:
223-
errors = 'replace'
224-
encoding = 'utf-8'
225-
bytes = filepath_or_buffer.read()
226-
filepath_or_buffer = StringIO(bytes.decode(encoding, errors))
227-
228-
if _is_s3_url(filepath_or_buffer):
229-
try:
230-
import boto
231-
except:
232-
raise ImportError("boto is required to handle s3 files")
233-
# Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
234-
# are environment variables
235-
parsed_url = urlparse.urlparse(filepath_or_buffer)
236-
conn = boto.connect_s3()
237-
b = conn.get_bucket(parsed_url.netloc)
238-
k = boto.s3.key.Key(b)
239-
k.key = parsed_url.path
240-
filepath_or_buffer = StringIO(k.get_contents_as_string())
186+
filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer)
241187

242188
if kwds.get('date_parser', None) is not None:
243189
if isinstance(kwds['parse_dates'], bool):

pandas/io/stata.py

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
import datetime
2222
from pandas.util import py3compat
2323
from pandas import isnull
24-
from pandas.io.parsers import _parser_params, _is_url, Appender
24+
from pandas.io.parsers import _parser_params, Appender
25+
from pandas.io.common import get_filepath_or_buffer
2526

2627

2728
_read_stata_doc = """
@@ -288,18 +289,12 @@ def __init__(self, path_or_buf, encoding=None):
288289
self._missing_values = False
289290
self._data_read = False
290291
self._value_labels_read = False
291-
if isinstance(path_or_buf, str) and _is_url(path_or_buf):
292-
from urllib.request import urlopen
293-
path_or_buf = urlopen(path_or_buf)
294-
if py3compat.PY3: # pragma: no cover
295-
if self._encoding:
296-
errors = 'strict'
297-
else:
298-
errors = 'replace'
299-
self._encoding = 'cp1252'
300-
bytes = path_or_buf.read()
301-
self.path_or_buf = StringIO(self._decode_bytes(bytes, errors))
302-
elif type(path_or_buf) is str:
292+
if isinstance(path_or_buf, str):
293+
path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding='cp1252')
294+
if encoding is not None:
295+
self._encoding = encoding
296+
297+
if type(path_or_buf) is str:
303298
self.path_or_buf = open(path_or_buf, 'rb')
304299
else:
305300
self.path_or_buf = path_or_buf

pandas/io/tests/test_stata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ def test_read_dta9(self):
185185
def test_stata_doc_examples(self):
186186
with ensure_clean(self.dta5) as path:
187187
df = DataFrame(np.random.randn(10,2),columns=list('AB'))
188-
df.to_stata('path')
188+
df.to_stata(path)
189189

190190
if __name__ == '__main__':
191191
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)