Skip to content

Commit 58c2a2b

Browse files
Chang She authored and wesm committed
ENH: allow index_col to be named in read_csv etc #1294
1 parent 579c2e4 commit 58c2a2b

File tree

3 files changed

+55
-44
lines changed

3 files changed

+55
-44
lines changed

doc/source/io.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,9 @@ or store various date fields separately. the ``parse_dates`` keyword can be
152152
used to specify a combination of columns to parse the dates and/or times from.
153153

154154
You can specify a list of column lists to ``parse_dates``, the resulting date
155-
columns will be prepended to the output and the new column names will be the
156-
concatenation of the component column names:
155+
columns will be prepended to the output (so as to not affect the existing column
156+
order) and the new column names will be the concatenation of the component
157+
column names:
157158

158159
.. ipython:: python
159160
:suppress:

pandas/io/parsers.py

Lines changed: 47 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -655,16 +655,29 @@ def _get_index_name(self):
655655
self.index_col = range(implicit_first_cols)
656656
index_name = None
657657
elif np.isscalar(self.index_col):
658+
if isinstance(self.index_col, basestring):
659+
for i, c in enumerate(list(columns)):
660+
if c == self.index_col:
661+
self.index_col = i
658662
index_name = columns.pop(self.index_col)
663+
659664
if index_name is not None and 'Unnamed' in index_name:
660665
index_name = None
666+
661667
elif self.index_col is not None:
662668
cp_cols = list(columns)
663669
index_name = []
664-
for i in self.index_col:
665-
name = cp_cols[i]
666-
columns.remove(name)
667-
index_name.append(name)
670+
index_col = list(self.index_col)
671+
for i, c in enumerate(index_col):
672+
if isinstance(c, basestring):
673+
index_name = c
674+
for j, name in enumerate(cp_cols):
675+
if name == index_name:
676+
index_col[i] = j
677+
else:
678+
name = cp_cols[c]
679+
columns.remove(name)
680+
index_name.append(name)
668681

669682
return index_name
670683

@@ -698,42 +711,10 @@ def get_chunk(self, rows=None):
698711
zipped_content = list(lib.to_object_array(content).T)
699712

700713
if self.index_col is not None:
701-
if np.isscalar(self.index_col):
702-
index = zipped_content.pop(self.index_col)
703-
else: # given a list of index
704-
index = []
705-
for idx in self.index_col:
706-
index.append(zipped_content[idx])
707-
# remove index items from content and columns, don't pop in
708-
# loop
709-
for i in reversed(sorted(self.index_col)):
710-
zipped_content.pop(i)
711-
712-
if np.isscalar(self.index_col):
713-
if self._should_parse_dates(self.index_col):
714-
index = self._conv_date(index)
715-
index, na_count = _convert_types(index, self.na_values)
716-
index = Index(index, name=self.index_name)
717-
if self.verbose and na_count:
718-
print 'Found %d NA values in the index' % na_count
719-
else:
720-
arrays = []
721-
for i, arr in enumerate(index):
722-
if self._should_parse_dates(self.index_col[i]):
723-
arr = self._conv_date(arr)
724-
arr, _ = _convert_types(arr, self.na_values)
725-
arrays.append(arr)
726-
index = MultiIndex.from_arrays(arrays, names=self.index_name)
714+
index = self._extract_index(zipped_content)
727715
else:
728716
index = Index(np.arange(len(content)))
729717

730-
# if not index.is_unique:
731-
# dups = index.get_duplicates()
732-
# idx_str = 'Index' if not self._implicit_index else 'Implicit index'
733-
# err_msg = ('%s (columns %s) have duplicate values %s'
734-
# % (idx_str, self.index_col, str(dups)))
735-
# raise Exception(err_msg)
736-
737718
col_len, zip_len = len(self.columns), len(zipped_content)
738719
if col_len != zip_len:
739720
row_num = -1
@@ -769,6 +750,35 @@ def get_chunk(self, rows=None):
769750
return df[df.columns[0]]
770751
return df
771752

753+
def _extract_index(self, zipped_content):
    """Pull the index column(s) out of ``zipped_content`` and build the index.

    ``zipped_content`` is modified in place: the entries at the position(s)
    given by ``self.index_col`` are removed from it.

    When ``self.index_col`` is a scalar, an ``Index`` named
    ``self.index_name`` is returned; otherwise a ``MultiIndex`` is built
    from one array per entry of ``self.index_col``, with
    ``self.index_name`` passed as its ``names``.

    Each extracted array is first passed through ``self._conv_date`` when
    ``self._should_parse_dates`` says so for its position, and then through
    ``_convert_types`` with ``self.na_values``.
    """
    index_col = self.index_col

    if np.isscalar(index_col):
        values = zipped_content.pop(index_col)
        if self._should_parse_dates(index_col):
            values = self._conv_date(values)
        values, na_count = _convert_types(values, self.na_values)
        single = Index(values, name=self.index_name)
        if self.verbose and na_count:
            # Single parenthesised argument: prints the same text whether
            # ``print`` is a statement or a function.
            print('Found %d NA values in the index' % na_count)
        return single

    # Several index columns: read every one of them before removing any,
    # because each removal shifts the positions of the entries after it.
    raw_levels = [zipped_content[pos] for pos in index_col]
    # Remove from the highest position down so the lower positions that
    # are still pending stay valid.
    for pos in sorted(index_col, reverse=True):
        zipped_content.pop(pos)

    levels = []
    for pos, level in zip(index_col, raw_levels):
        if self._should_parse_dates(pos):
            level = self._conv_date(level)
        # The NA count is not reported for multi-column indexes.
        level, _ = _convert_types(level, self.na_values)
        levels.append(level)
    return MultiIndex.from_arrays(levels, names=self.index_name)
781+
772782
def _find_line_number(self, exp_len, chunk_len, chunk_i):
773783
if exp_len is None:
774784
prev_pos = 0

pandas/io/tests/test_parsers.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ def test_multiple_date_cols_with_header(self):
197197
df = read_csv(StringIO(data), parse_dates={'nominal': [1, 2]})
198198
self.assert_(not isinstance(df.nominal[0], basestring))
199199

200-
def test_multiple_date_cols_index(self):
200+
def test_index_col_named(self):
201201
data = """\
202202
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
203203
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
@@ -207,10 +207,10 @@ def test_multiple_date_cols_index(self):
207207
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
208208
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
209209

210-
df = read_csv(StringIO(data), index_col='ID')
211-
df = read_csv(StringIO(data), parse_dates={'nominal': [1, 2]},
212-
index_col='nominal')
213-
self.assert_(not isinstance(df.nominal[0], basestring))
210+
rs = read_csv(StringIO(data), index_col='ID')
211+
xp = read_csv(StringIO(data), header=0).set_index('ID')
212+
assert_frame_equal(rs, xp)
213+
214214

215215
def test_multiple_skts_example(self):
216216
data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11."

0 commit comments

Comments
 (0)