Skip to content

Commit a4a4fef

Browse files
committed
Merge pull request #8122 from amras1/linecomments-whitespace-delim
Made line comments work with delim_whitespace and custom line terminator
2 parents 855f9aa + 9a877dd commit a4a4fef

File tree

3 files changed

+43
-0
lines changed

3 files changed

+43
-0
lines changed

doc/source/v0.15.0.txt

+2
Original file line number | Diff line number | Diff line change
@@ -640,6 +640,8 @@ Bug Fixes
640640
- Bug in ``Float64Index`` where ``iat`` and ``at`` were not tested and were
641641
failing (:issue:`8092`).
642642

643+
- Bug in ``read_csv`` where line comments were not handled correctly given
644+
a custom line terminator or ``delim_whitespace=True`` (:issue:`8122`).
643645

644646

645647
- Bug in accessing groups from a ``GroupBy`` when the original grouper

pandas/io/tests/test_parsers.py

+8
Original file line number | Diff line number | Diff line change
@@ -2944,6 +2944,14 @@ def test_line_comment(self):
29442944
[5., np.nan, 10.]]
29452945
df = self.read_csv(StringIO(data), comment='#')
29462946
tm.assert_almost_equal(df.values, expected)
2947+
# check with delim_whitespace=True
2948+
df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#',
2949+
delim_whitespace=True)
2950+
tm.assert_almost_equal(df.values, expected)
2951+
# check with custom line terminator
2952+
df = self.read_csv(StringIO(data.replace('\n', '*')), comment='#',
2953+
lineterminator='*')
2954+
tm.assert_almost_equal(df.values, expected)
29472955

29482956
def test_comment_skiprows(self):
29492957
data = """# empty

pandas/src/parser/tokenizer.c

+33
Original file line number | Diff line number | Diff line change
@@ -969,6 +969,10 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
969969
END_LINE();
970970
break;
971971
}
972+
else if (c == self->commentchar) {
973+
self->state = EAT_LINE_COMMENT;
974+
break;
975+
}
972976
/* normal character - handle as START_FIELD */
973977
self->state = START_FIELD;
974978
/* fallthru */
@@ -1103,6 +1107,13 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
11031107
}
11041108
break;
11051109

1110+
case EAT_LINE_COMMENT:
1111+
if (c == self->lineterminator) {
1112+
self->file_lines++;
1113+
self->state = START_RECORD;
1114+
}
1115+
break;
1116+
11061117
case EAT_COMMENT:
11071118
if (c == self->lineterminator) {
11081119
END_LINE();
@@ -1186,6 +1197,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
11861197
} else if (IS_WHITESPACE(c)) {
11871198
self->state = EAT_WHITESPACE;
11881199
break;
1200+
} else if (c == self->commentchar) {
1201+
self->state = EAT_LINE_COMMENT;
1202+
break;
11891203
} else {
11901204
/* normal character - handle as START_FIELD */
11911205
self->state = START_FIELD;
@@ -1231,6 +1245,16 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
12311245
}
12321246
break;
12331247

1248+
case EAT_LINE_COMMENT:
1249+
if (c == '\n') {
1250+
self->file_lines++;
1251+
self->state = START_RECORD;
1252+
} else if (c == '\r') {
1253+
self->file_lines++;
1254+
self->state = EAT_CRNL_NOP;
1255+
}
1256+
break;
1257+
12341258
case ESCAPED_CHAR:
12351259
/* if (c == '\0') */
12361260
/* c = '\n'; */
@@ -1351,6 +1375,15 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
13511375
}
13521376
break;
13531377

1378+
case EAT_CRNL_NOP: // inside an ignored comment line
1379+
self->state = START_RECORD;
1380+
/* \r line terminator -- parse this character again */
1381+
if (c != '\n' && c != self->delimiter) {
1382+
--i;
1383+
--buf;
1384+
}
1385+
break;
1386+
13541387
case EAT_COMMENT:
13551388
if (c == '\n') {
13561389
END_LINE();

0 commit comments

Comments
 (0)