ENH: allow index_col to be named in read_csv etc #1294

Chang She · wesm · commit 58c2a2b3d4b1 · 2012-05-23T17:24:25.000-04:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -152,8 +152,9 @@ or store various date fields separately. the ``parse_dates`` keyword can be
 used to specify a combination of columns to parse the dates and/or times from.
 
 You can specify a list of column lists to ``parse_dates``, the resulting date
-columns will be prepended to the output and the new column names will be the
-concatenation of the component column names:
+columns will be prepended to the output (so as not to affect the existing column
+order) and the new column names will be the concatenation of the component
+column names:
 
 .. ipython:: python
    :suppress:
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -655,16 +655,29 @@ def _get_index_name(self):
                     self.index_col = range(implicit_first_cols)
             index_name = None
         elif np.isscalar(self.index_col):
+            if isinstance(self.index_col, basestring):
+                for i, c in enumerate(list(columns)):
+                    if c == self.index_col:
+                        self.index_col = i
             index_name = columns.pop(self.index_col)
+
             if index_name is not None and 'Unnamed' in index_name:
                 index_name = None
+
         elif self.index_col is not None:
             cp_cols = list(columns)
             index_name = []
-            for i in self.index_col:
-                name = cp_cols[i]
-                columns.remove(name)
-                index_name.append(name)
+            index_col = list(self.index_col)
+            for i, c in enumerate(index_col):
+                if isinstance(c, basestring):
+                    index_name = c
+                    for j, name in enumerate(cp_cols):
+                        if name == index_name:
+                            index_col[i] = j
+                else:
+                    name = cp_cols[c]
+                    columns.remove(name)
+                    index_name.append(name)
 
         return index_name
 
@@ -698,42 +711,10 @@ def get_chunk(self, rows=None):
         zipped_content = list(lib.to_object_array(content).T)
 
         if self.index_col is not None:
-            if np.isscalar(self.index_col):
-                index = zipped_content.pop(self.index_col)
-            else: # given a list of index
-                index = []
-                for idx in self.index_col:
-                    index.append(zipped_content[idx])
-                # remove index items from content and columns, don't pop in
-                # loop
-                for i in reversed(sorted(self.index_col)):
-                    zipped_content.pop(i)
-
-            if np.isscalar(self.index_col):
-                if self._should_parse_dates(self.index_col):
-                    index = self._conv_date(index)
-                index, na_count = _convert_types(index, self.na_values)
-                index = Index(index, name=self.index_name)
-                if self.verbose and na_count:
-                    print 'Found %d NA values in the index' % na_count
-            else:
-                arrays = []
-                for i, arr in enumerate(index):
-                    if self._should_parse_dates(self.index_col[i]):
-                        arr = self._conv_date(arr)
-                    arr, _ = _convert_types(arr, self.na_values)
-                    arrays.append(arr)
-                index = MultiIndex.from_arrays(arrays, names=self.index_name)
+            index = self._extract_index(zipped_content)
         else:
             index = Index(np.arange(len(content)))
 
-        # if not index.is_unique:
-        #     dups = index.get_duplicates()
-        #     idx_str = 'Index' if not self._implicit_index else 'Implicit index'
-        #     err_msg = ('%s (columns %s) have duplicate values %s'
-        #                % (idx_str, self.index_col, str(dups)))
-        #     raise Exception(err_msg)
-
         col_len, zip_len = len(self.columns), len(zipped_content)
         if col_len != zip_len:
             row_num = -1
@@ -769,6 +750,35 @@ def get_chunk(self, rows=None):
             return df[df.columns[0]]
         return df
 
+    def _extract_index(self, zipped_content):  # mutates zipped_content
+        if np.isscalar(self.index_col):  # single index column
+            index = zipped_content.pop(self.index_col)
+        else: # index_col is a sequence: one entry per index level
+            index = []
+            for idx in self.index_col:
+                index.append(zipped_content[idx])
+            # drop the index arrays from zipped_content only after collecting
+            # them all, highest position first so earlier pops shift nothing
+            for i in reversed(sorted(self.index_col)):
+                zipped_content.pop(i)
+
+        if np.isscalar(self.index_col):  # build a flat Index
+            if self._should_parse_dates(self.index_col):
+                index = self._conv_date(index)
+            index, na_count = _convert_types(index, self.na_values)
+            index = Index(index, name=self.index_name)
+            if self.verbose and na_count:
+                print 'Found %d NA values in the index' % na_count
+        else:  # build a MultiIndex, one converted array per level
+            arrays = []
+            for i, arr in enumerate(index):
+                if self._should_parse_dates(self.index_col[i]):
+                    arr = self._conv_date(arr)
+                arr, _ = _convert_types(arr, self.na_values)  # NA count unused
+                arrays.append(arr)
+            index = MultiIndex.from_arrays(arrays, names=self.index_name)
+        return index
+
     def _find_line_number(self, exp_len, chunk_len, chunk_i):
         if exp_len is None:
             prev_pos = 0
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -197,7 +197,7 @@ def test_multiple_date_cols_with_header(self):
         df = read_csv(StringIO(data), parse_dates={'nominal': [1, 2]})
         self.assert_(not isinstance(df.nominal[0], basestring))
 
-    def test_multiple_date_cols_index(self):
+    def test_index_col_named(self):
         data = """\
 ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
 KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
@@ -207,10 +207,10 @@ def test_multiple_date_cols_index(self):
 KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
 KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
 
-        df = read_csv(StringIO(data), index_col='ID')
-        df = read_csv(StringIO(data), parse_dates={'nominal': [1, 2]},
-                      index_col='nominal')
-        self.assert_(not isinstance(df.nominal[0], basestring))
+        rs = read_csv(StringIO(data), index_col='ID')
+        xp = read_csv(StringIO(data), header=0).set_index('ID')
+        assert_frame_equal(rs, xp)
+
 
     def test_multiple_skts_example(self):
         data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11."