Merge pull request #8122 from amras1/linecomments-whitespace-delim

jreback · jreback · commit a4a4fefb8306 · 2014-08-28T13:01:12.000-04:00
Made line comments work with delim_whitespace and custom line terminator
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -640,6 +640,8 @@ Bug Fixes
 - Bug in ``Float64Index`` where ``iat`` and ``at`` were not testing and were
   failing (:issue:`8092`).
 
+- Bug in ``read_csv`` where line comments were not handled correctly when a
+  custom ``lineterminator`` or ``delim_whitespace=True`` was used (:issue:`8122`).
 
 
 - Bug in accessing groups from a ``GroupBy`` when the original grouper
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2944,6 +2944,14 @@ def test_line_comment(self):
                     [5., np.nan, 10.]]
         df = self.read_csv(StringIO(data), comment='#')
         tm.assert_almost_equal(df.values, expected)
+        # check with delim_whitespace=True
+        df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#',
+                            delim_whitespace=True)
+        tm.assert_almost_equal(df.values, expected)
+        # check with custom line terminator
+        df = self.read_csv(StringIO(data.replace('\n', '*')), comment='#',
+                            lineterminator='*')
+        tm.assert_almost_equal(df.values, expected)
 
     def test_comment_skiprows(self):
         data = """# empty
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -969,6 +969,10 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
                 END_LINE();
                 break;
             }
+            else if (c == self->commentchar) {
+                self->state = EAT_LINE_COMMENT;
+                break;
+            }
             /* normal character - handle as START_FIELD */
             self->state = START_FIELD;
             /* fallthru */
@@ -1103,6 +1107,13 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
             }
             break;
 
+        case EAT_LINE_COMMENT:
+            if (c == self->lineterminator) {
+                self->file_lines++;
+                self->state = START_RECORD;
+            }
+            break;
+
         case EAT_COMMENT:
             if (c == self->lineterminator) {
                 END_LINE();
@@ -1186,6 +1197,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
             } else if (IS_WHITESPACE(c)) {
                 self->state = EAT_WHITESPACE;
                 break;
+            } else if (c == self->commentchar) {
+                self->state = EAT_LINE_COMMENT;
+                break;
             } else {
                 /* normal character - handle as START_FIELD */
                 self->state = START_FIELD;
@@ -1231,6 +1245,16 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
             }
             break;
 
+        case EAT_LINE_COMMENT:
+            if (c == '\n') {
+                self->file_lines++;
+                self->state = START_RECORD;
+            } else if (c == '\r') {
+                self->file_lines++;
+                self->state = EAT_CRNL_NOP;
+            }
+            break;
+
         case ESCAPED_CHAR:
             /* if (c == '\0') */
             /*  c = '\n'; */
@@ -1351,6 +1375,15 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
             }
             break;
 
+        case EAT_CRNL_NOP: // inside an ignored comment line
+            self->state = START_RECORD;
+            /* \r line terminator -- parse this character again */
+            if (c != '\n' && c != self->delimiter) {
+                --i;
+                --buf;
+            }
+            break;
+
         case EAT_COMMENT:
             if (c == '\n') {
                 END_LINE();