Skip to content

Commit 52ffd9a

Browse files
committed
Merge pull request #3808 from jreback/iocom
CLN: refactored url accessing and filepath conversion from urls to io.common
2 parents caddebe + 0bfef8a commit 52ffd9a

File tree

5 files changed

+91
-71
lines changed

5 files changed

+91
-71
lines changed

pandas/io/common.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
""" Common api utilities """
2+
3+
import urlparse
4+
from pandas.util import py3compat
5+
6+
# Set of URL schemes this module treats as valid protocols; the empty
# scheme (a bare path) is deliberately excluded.
_VALID_URLS = (set(urlparse.uses_relative) |
               set(urlparse.uses_netloc) |
               set(urlparse.uses_params)) - set([''])
9+
10+
11+
def _is_url(url):
    """Check to see if a URL has a valid protocol.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    isurl : bool
        If `url` has a valid protocol return True otherwise False.
    """
    try:
        return urlparse.urlparse(url).scheme in _VALID_URLS
    # NOTE: narrowed from a bare ``except:`` which would also swallow
    # SystemExit/KeyboardInterrupt; non-string input raises AttributeError
    # or TypeError from urlparse and is treated as "not a URL".
    except Exception:
        return False
27+
28+
def _is_s3_url(url):
    """Check for an s3 url.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    bool
        True if `url` uses the ``s3://`` scheme, False otherwise
        (including when `url` is not a string at all).
    """
    try:
        return urlparse.urlparse(url).scheme == 's3'
    # narrowed from a bare ``except:`` -- only swallow ordinary errors
    # (e.g. AttributeError/TypeError on non-string input), not
    # SystemExit/KeyboardInterrupt
    except Exception:
        return False
34+
35+
def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
    """ if the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding

    Raises
    ------
    ImportError
        If an s3 url is given but the optional ``boto`` dependency is
        not installed.
    """

    if _is_url(filepath_or_buffer):
        from urllib2 import urlopen
        filepath_or_buffer = urlopen(filepath_or_buffer)
        if py3compat.PY3:  # pragma: no cover
            # BUGFIX: StringIO was used below but never imported in this
            # module, raising NameError; import it locally, matching the
            # py2 style used elsewhere in pandas.io
            from StringIO import StringIO
            if encoding:
                errors = 'strict'
            else:
                errors = 'replace'
                encoding = 'utf-8'
            bytes = filepath_or_buffer.read()
            filepath_or_buffer = StringIO(bytes.decode(encoding, errors))
            return filepath_or_buffer, encoding
        return filepath_or_buffer, None

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:  # narrowed from a bare ``except:``
            raise ImportError("boto is required to handle s3 files")
        # BUGFIX: StringIO was never imported in this module (NameError)
        from StringIO import StringIO
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = urlparse.urlparse(filepath_or_buffer)
        conn = boto.connect_s3()
        b = conn.get_bucket(parsed_url.netloc)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = StringIO(k.get_contents_as_string())
        return filepath_or_buffer, None

    # plain filepath or already-open buffer: pass through untouched
    return filepath_or_buffer, None

pandas/io/html.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import numpy as np
2121

2222
from pandas import DataFrame, MultiIndex, isnull
23-
from pandas.io.parsers import _is_url
23+
from pandas.io.common import _is_url
2424

2525

2626
try:

pandas/io/parsers.py

Lines changed: 2 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from StringIO import StringIO
55
import re
66
from itertools import izip
7-
import urlparse
87
import csv
98

109
import numpy as np
@@ -15,6 +14,7 @@
1514
import pandas.core.common as com
1615
from pandas.util import py3compat
1716
from pandas.io.date_converters import generic_parser
17+
from pandas.io.common import get_filepath_or_buffer
1818

1919
from pandas.util.decorators import Appender
2020

@@ -176,68 +176,14 @@ class DateConversionError(Exception):
176176
""" % (_parser_params % _fwf_widths)
177177

178178

179-
_VALID_URLS = set(urlparse.uses_relative + urlparse.uses_netloc +
180-
urlparse.uses_params)
181-
_VALID_URLS.discard('')
182-
183-
184-
def _is_url(url):
185-
"""Check to see if a URL has a valid protocol.
186-
187-
Parameters
188-
----------
189-
url : str or unicode
190-
191-
Returns
192-
-------
193-
isurl : bool
194-
If `url` has a valid protocol return True otherwise False.
195-
"""
196-
try:
197-
return urlparse.urlparse(url).scheme in _VALID_URLS
198-
except:
199-
return False
200-
201-
def _is_s3_url(url):
202-
""" Check for an s3 url """
203-
try:
204-
return urlparse.urlparse(url).scheme == 's3'
205-
except:
206-
return False
207-
208179
def _read(filepath_or_buffer, kwds):
209180
"Generic reader of line files."
210181
encoding = kwds.get('encoding', None)
211182
skipfooter = kwds.pop('skipfooter', None)
212183
if skipfooter is not None:
213184
kwds['skip_footer'] = skipfooter
214185

215-
if isinstance(filepath_or_buffer, basestring):
216-
if _is_url(filepath_or_buffer):
217-
from urllib2 import urlopen
218-
filepath_or_buffer = urlopen(filepath_or_buffer)
219-
if py3compat.PY3: # pragma: no cover
220-
if encoding:
221-
errors = 'strict'
222-
else:
223-
errors = 'replace'
224-
encoding = 'utf-8'
225-
bytes = filepath_or_buffer.read()
226-
filepath_or_buffer = StringIO(bytes.decode(encoding, errors))
227-
228-
if _is_s3_url(filepath_or_buffer):
229-
try:
230-
import boto
231-
except:
232-
raise ImportError("boto is required to handle s3 files")
233-
# Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
234-
# are environment variables
235-
parsed_url = urlparse.urlparse(filepath_or_buffer)
236-
conn = boto.connect_s3()
237-
b = conn.get_bucket(parsed_url.netloc)
238-
k = boto.s3.key.Key(b)
239-
k.key = parsed_url.path
240-
filepath_or_buffer = StringIO(k.get_contents_as_string())
186+
filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer)
241187

242188
if kwds.get('date_parser', None) is not None:
243189
if isinstance(kwds['parse_dates'], bool):

pandas/io/stata.py

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
import datetime
2222
from pandas.util import py3compat
2323
from pandas import isnull
24-
from pandas.io.parsers import _parser_params, _is_url, Appender
24+
from pandas.io.parsers import _parser_params, Appender
25+
from pandas.io.common import get_filepath_or_buffer
2526

2627

2728
_read_stata_doc = """
@@ -288,18 +289,12 @@ def __init__(self, path_or_buf, encoding=None):
288289
self._missing_values = False
289290
self._data_read = False
290291
self._value_labels_read = False
291-
if isinstance(path_or_buf, str) and _is_url(path_or_buf):
292-
from urllib.request import urlopen
293-
path_or_buf = urlopen(path_or_buf)
294-
if py3compat.PY3: # pragma: no cover
295-
if self._encoding:
296-
errors = 'strict'
297-
else:
298-
errors = 'replace'
299-
self._encoding = 'cp1252'
300-
bytes = path_or_buf.read()
301-
self.path_or_buf = StringIO(self._decode_bytes(bytes, errors))
302-
elif type(path_or_buf) is str:
292+
if isinstance(path_or_buf, str):
293+
path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding='cp1252')
294+
if encoding is not None:
295+
self._encoding = encoding
296+
297+
if type(path_or_buf) is str:
303298
self.path_or_buf = open(path_or_buf, 'rb')
304299
else:
305300
self.path_or_buf = path_or_buf

pandas/io/tests/test_stata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ def test_read_dta9(self):
185185
def test_stata_doc_examples(self):
186186
with ensure_clean(self.dta5) as path:
187187
df = DataFrame(np.random.randn(10,2),columns=list('AB'))
188-
df.to_stata('path')
188+
df.to_stata(path)
189189

190190
if __name__ == '__main__':
191191
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)