From 0bfef8a28d66eac7b62a639529c20924ace53732 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sat, 8 Jun 2013 08:37:28 -0400
Subject: [PATCH] CLN: refactored url accessing and filepath conversion from urls to io.common

---
Review notes (v2):
 - pandas/io/common.py now imports StringIO; v1 called StringIO() inside
   get_filepath_or_buffer without importing it (NameError on the PY3 url
   path and on the s3 path).
 - pandas/io/common.py: bare "except:" clauses narrowed so they no longer
   swallow KeyboardInterrupt/SystemExit.
 - pandas/io/parsers.py: _read passes its `encoding` through to
   get_filepath_or_buffer, as the inlined code it replaces did.
 - NOTE(review): pandas/io/stata.py now always requests 'cp1252'; the removed
   code only fell back to it when self._encoding was unset -- confirm this
   is intended.
 - Line structure and indentation were restored from a whitespace-collapsed
   copy; regenerate context whitespace and the index hashes against the
   tree before applying.

 pandas/io/common.py           | 80 +++++++++++++++++++++++++++++++++++
 pandas/io/html.py             |  2 +-
 pandas/io/parsers.py          | 58 +------------------------
 pandas/io/stata.py            | 21 ++++------
 pandas/io/tests/test_stata.py |  2 +-
 5 files changed, 92 insertions(+), 71 deletions(-)
 create mode 100644 pandas/io/common.py

diff --git a/pandas/io/common.py b/pandas/io/common.py
new file mode 100644
index 0000000000000..46b47c06f7f5d
--- /dev/null
+++ b/pandas/io/common.py
@@ -0,0 +1,80 @@
+""" Common api utilities """
+
+import urlparse
+from StringIO import StringIO
+from pandas.util import py3compat
+
+_VALID_URLS = set(urlparse.uses_relative + urlparse.uses_netloc +
+                  urlparse.uses_params)
+_VALID_URLS.discard('')
+
+
+def _is_url(url):
+    """Check to see if a URL has a valid protocol.
+
+    Parameters
+    ----------
+    url : str or unicode
+
+    Returns
+    -------
+    isurl : bool
+        If `url` has a valid protocol return True otherwise False.
+    """
+    try:
+        return urlparse.urlparse(url).scheme in _VALID_URLS
+    except Exception:
+        return False
+
+def _is_s3_url(url):
+    """ Check for an s3 url """
+    try:
+        return urlparse.urlparse(url).scheme == 's3'
+    except Exception:
+        return False
+
+def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
+    """If filepath_or_buffer is a url (http-style or s3), open it and return
+    the resulting buffer; any other input is passed through unchanged.
+
+    Parameters
+    ----------
+    filepath_or_buffer : a url, filepath, or buffer
+    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
+
+    Returns
+    -------
+    a filepath_or_buffer, the encoding
+
+    """
+
+    if _is_url(filepath_or_buffer):
+        from urllib2 import urlopen
+        filepath_or_buffer = urlopen(filepath_or_buffer)
+        if py3compat.PY3:  # pragma: no cover
+            if encoding:
+                errors = 'strict'
+            else:
+                errors = 'replace'
+                encoding = 'utf-8'
+            bytes = filepath_or_buffer.read()
+            filepath_or_buffer = StringIO(bytes.decode(encoding, errors))
+            return filepath_or_buffer, encoding
+        return filepath_or_buffer, None
+
+    if _is_s3_url(filepath_or_buffer):
+        try:
+            import boto
+        except ImportError:
+            raise ImportError("boto is required to handle s3 files")
+        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
+        # are environment variables
+        parsed_url = urlparse.urlparse(filepath_or_buffer)
+        conn = boto.connect_s3()
+        b = conn.get_bucket(parsed_url.netloc)
+        k = boto.s3.key.Key(b)
+        k.key = parsed_url.path
+        filepath_or_buffer = StringIO(k.get_contents_as_string())
+        return filepath_or_buffer, None
+
+    return filepath_or_buffer, None
diff --git a/pandas/io/html.py b/pandas/io/html.py
index a5798b3493732..08a9403cd18a7 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -20,7 +20,7 @@
 import numpy as np
 
 from pandas import DataFrame, MultiIndex, isnull
-from pandas.io.parsers import _is_url
+from pandas.io.common import _is_url
 
 
 try:
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 556d1ab1976b4..54ba7536afaee 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -4,7 +4,6 @@
 from StringIO import StringIO
 import re
 from itertools import izip
-import urlparse
 import csv
 
 import numpy as np
@@ -15,6 +14,7 @@
 import pandas.core.common as com
 from pandas.util import py3compat
 from pandas.io.date_converters import generic_parser
+from pandas.io.common import get_filepath_or_buffer
 
 from pandas.util.decorators import Appender
 
@@ -176,35 +176,6 @@ class DateConversionError(Exception):
 """ % (_parser_params % _fwf_widths)
 
 
-_VALID_URLS = set(urlparse.uses_relative + urlparse.uses_netloc +
-                  urlparse.uses_params)
-_VALID_URLS.discard('')
-
-
-def _is_url(url):
-    """Check to see if a URL has a valid protocol.
-
-    Parameters
-    ----------
-    url : str or unicode
-
-    Returns
-    -------
-    isurl : bool
-        If `url` has a valid protocol return True otherwise False.
-    """
-    try:
-        return urlparse.urlparse(url).scheme in _VALID_URLS
-    except:
-        return False
-
-def _is_s3_url(url):
-    """ Check for an s3 url """
-    try:
-        return urlparse.urlparse(url).scheme == 's3'
-    except:
-        return False
-
 def _read(filepath_or_buffer, kwds):
     "Generic reader of line files."
     encoding = kwds.get('encoding', None)
@@ -212,32 +183,7 @@ def _read(filepath_or_buffer, kwds):
     if skipfooter is not None:
         kwds['skip_footer'] = skipfooter
 
-    if isinstance(filepath_or_buffer, basestring):
-        if _is_url(filepath_or_buffer):
-            from urllib2 import urlopen
-            filepath_or_buffer = urlopen(filepath_or_buffer)
-            if py3compat.PY3:  # pragma: no cover
-                if encoding:
-                    errors = 'strict'
-                else:
-                    errors = 'replace'
-                    encoding = 'utf-8'
-                bytes = filepath_or_buffer.read()
-                filepath_or_buffer = StringIO(bytes.decode(encoding, errors))
-
-        if _is_s3_url(filepath_or_buffer):
-            try:
-                import boto
-            except:
-                raise ImportError("boto is required to handle s3 files")
-            # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
-            # are environment variables
-            parsed_url = urlparse.urlparse(filepath_or_buffer)
-            conn = boto.connect_s3()
-            b = conn.get_bucket(parsed_url.netloc)
-            k = boto.s3.key.Key(b)
-            k.key = parsed_url.path
-            filepath_or_buffer = StringIO(k.get_contents_as_string())
+    filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer, encoding)
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index f1257f505ca9b..ddc9db0b76539 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -21,7 +21,8 @@
 import datetime
 from pandas.util import py3compat
 from pandas import isnull
-from pandas.io.parsers import _parser_params, _is_url, Appender
+from pandas.io.parsers import _parser_params, Appender
+from pandas.io.common import get_filepath_or_buffer
 
 
 _read_stata_doc = """
@@ -288,18 +289,12 @@ def __init__(self, path_or_buf, encoding=None):
         self._missing_values = False
         self._data_read = False
         self._value_labels_read = False
-        if isinstance(path_or_buf, str) and _is_url(path_or_buf):
-            from urllib.request import urlopen
-            path_or_buf = urlopen(path_or_buf)
-            if py3compat.PY3:  # pragma: no cover
-                if self._encoding:
-                    errors = 'strict'
-                else:
-                    errors = 'replace'
-                    self._encoding = 'cp1252'
-                bytes = path_or_buf.read()
-                self.path_or_buf = StringIO(self._decode_bytes(bytes, errors))
-        elif type(path_or_buf) is str:
+        if isinstance(path_or_buf, str):
+            path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding='cp1252')
+            if encoding is not None:
+                self._encoding = encoding
+
+        if type(path_or_buf) is str:
             self.path_or_buf = open(path_or_buf, 'rb')
         else:
             self.path_or_buf = path_or_buf
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 9f5d796763fb0..d512b0267ed13 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -185,7 +185,7 @@ def test_read_dta9(self):
     def test_stata_doc_examples(self):
         with ensure_clean(self.dta5) as path:
             df = DataFrame(np.random.randn(10,2),columns=list('AB'))
-            df.to_stata('path')
+            df.to_stata(path)
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],