diff --git a/doc/source/io.rst b/doc/source/io.rst index 351a7059b2739..25925ef4a8b91 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -97,6 +97,12 @@ sep : str, defaults to ``','`` for :func:`read_csv`, ``\t`` for :func:`read_tabl Regex example: ``'\\r\\t'``. delimiter : str, default ``None`` Alternative argument name for sep. +delim_whitespace : boolean, default False + Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) + will be used as the delimiter. Equivalent to setting ``sep='\s+'``. + If this option is set to True, nothing should be passed in for the + ``delimiter`` parameter. This parameter is currently supported for + the C parser only. Column and Index Locations and Names ++++++++++++++++++++++++++++++++++++ diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 821f093083026..d386f32d35195 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -302,6 +302,7 @@ Bug Fixes - Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`) - Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`) - Bug in ``read_csv`` when specifying ``names``, ```usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`) +- Bug in ``read_csv`` when specifying ``delim_whitespace=True`` and ``lineterminator`` simultaneously with the C engine (:issue:`12912`) - Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`). 
- Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`) - Bug in ``groupby`` where complex types are coerced to float (:issue:`12902`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e08268a1944b7..4ece66122bcd0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -209,6 +209,11 @@ warn_bad_lines : boolean, default True If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. (Only valid with C parser). +delim_whitespace : boolean, default False + Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used + as the delimiter. Equivalent to setting ``sep='\s+'``. If this option is set + to True, nothing should be passed in for the ``delimiter`` parameter. This + parameter is currently supported for the C parser only. Returns ------- diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index ab6103f0f523c..1fab316d80ae6 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3878,6 +3878,15 @@ def test_buffer_rd_bytes(self): except Exception as e: pass + def test_delim_whitespace_custom_terminator(self): + # See gh-12912 + data = """a b c~1 2 3~4 5 6~7 8 9""" + df = self.read_csv(StringIO(data), lineterminator='~', + delim_whitespace=True) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=['a', 'b', 'c']) + tm.assert_frame_equal(df, expected) + class TestCParserHighMemory(CParserTests, CompressionTests, tm.TestCase): engine = 'c' diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index a75ce2bde80e6..013c47cd09a9b 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -693,627 +693,38 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { #define IS_WHITESPACE(c) ((c == ' ' || c == '\t')) -typedef int (*parser_op)(parser_t *self, size_t line_limit); +#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && \ 
+ c == '\n') || c == self->lineterminator) -#define _TOKEN_CLEANUP() \ - self->stream_len = slen; \ - self->datapos = i; \ - TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen)); - - -int skip_this_line(parser_t *self, int64_t rownum) { - if (self->skipset != NULL) { - return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != - ((kh_int64_t*)self->skipset)->n_buckets ); - } - else { - return ( rownum <= self->skip_first_N_rows ); - } -} - -int tokenize_delimited(parser_t *self, size_t line_limit) -{ - int i, slen, start_lines; - long maxstreamsize; - char c; - char *stream; - char *buf = self->data + self->datapos; - - - start_lines = self->lines; - - if (make_stream_space(self, self->datalen - self->datapos) < 0) { - self->error_msg = "out of memory"; - return -1; - } - - stream = self->stream + self->stream_len; - slen = self->stream_len; - maxstreamsize = self->stream_cap; - TRACE(("%s\n", buf)); - - for (i = self->datapos; i < self->datalen; ++i) - { - // Next character in file - c = *buf++; - - TRACE(("tokenize_delimited - Iter: %d Char: 0x%x Line %d field_count %d, state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); - - switch(self->state) { - - case SKIP_LINE: - TRACE(("tokenize_delimited SKIP_LINE 0x%x, state %d\n", c, self->state)); - if (c == '\n') { - END_LINE(); - } else if (c == '\r') { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; - - case START_RECORD: - // start of record - if (skip_this_line(self, self->file_lines)) { - self->state = SKIP_LINE; - if (c == '\n') { - END_LINE(); - } - break; - } - else if (c == '\n') { - // \n\r possible? 
- if (self->skip_empty_lines) - { - self->file_lines++; - } - else - { - END_LINE(); - } - break; - } - else if (c == '\r') { - if (self->skip_empty_lines) - { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - else - self->state = EAT_CRNL; - break; - } - else if (c == self->commentchar) { - self->state = EAT_LINE_COMMENT; - break; - } - else if (IS_WHITESPACE(c) && c != self->delimiter && self->skip_empty_lines) { - self->state = WHITESPACE_LINE; - break; - } - - /* normal character - handle as START_FIELD */ - self->state = START_FIELD; - /* fallthru */ - - case START_FIELD: - /* expecting field */ - if (c == '\n') { - END_FIELD(); - END_LINE(); - } else if (c == '\r') { - END_FIELD(); - self->state = EAT_CRNL; - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - /* start quoted field */ - self->state = IN_QUOTED_FIELD; - } - else if (c == self->escapechar) { - /* possible escaped character */ - self->state = ESCAPED_CHAR; - } - else if (c == ' ' && self->skipinitialspace) - /* ignore space at start of field */ - ; - else if (c == self->delimiter) { - /* save empty field */ - END_FIELD(); - } - else if (c == self->commentchar) { - END_FIELD(); - self->state = EAT_COMMENT; - } - else { - /* begin new unquoted field */ -// if (self->quoting == QUOTE_NONNUMERIC) -// self->numeric_field = 1; - - // TRACE(("pushing %c", c)); - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case WHITESPACE_LINE: // check if line is whitespace-only - if (c == '\n') { - self->file_lines++; - self->state = START_RECORD; // ignore empty line - } - else if (c == '\r') { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - else if (IS_WHITESPACE(c) && c != self->delimiter) - ; - else { // backtrack - /* We have to use i + 1 because buf has been incremented but not i */ - do { - --buf; - --i; - } while (i + 1 > self->datapos && *buf != '\n'); - - if (*buf == '\n') // reached a newline rather than the beginning - { - ++buf; // move pointer to first 
char after newline - ++i; - } - self->state = START_FIELD; - } - break; - - case ESCAPED_CHAR: - /* if (c == '\0') */ - /* c = '\n'; */ - - PUSH_CHAR(c); - self->state = IN_FIELD; - break; - - case EAT_LINE_COMMENT: - if (c == '\n') { - self->file_lines++; - self->state = START_RECORD; - } else if (c == '\r') { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; - - case IN_FIELD: - /* in unquoted field */ - if (c == '\n') { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } else if (c == '\r') { - END_FIELD(); - self->state = EAT_CRNL; - } - else if (c == self->escapechar) { - /* possible escaped character */ - self->state = ESCAPED_CHAR; - } - else if (c == self->delimiter) { - // End of field. End of line not reached yet - END_FIELD(); - self->state = START_FIELD; - } - else if (c == self->commentchar) { - END_FIELD(); - self->state = EAT_COMMENT; - } - else { - /* normal character - save in field */ - PUSH_CHAR(c); - } - break; - - case IN_QUOTED_FIELD: - /* in quoted field */ - if (c == self->escapechar) { - /* Possible escape character */ - self->state = ESCAPE_IN_QUOTED_FIELD; - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - if (self->doublequote) { - /* doublequote; " represented by "" */ - self->state = QUOTE_IN_QUOTED_FIELD; - } - else { - /* end of quote part of field */ - self->state = IN_FIELD; - } - } - else { - /* normal character - save in field */ - PUSH_CHAR(c); - } - break; - - case ESCAPE_IN_QUOTED_FIELD: - /* if (c == '\0') */ - /* c = '\n'; */ - - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - break; - - case QUOTE_IN_QUOTED_FIELD: - /* doublequote - seen a quote in an quoted field */ - if (self->quoting != QUOTE_NONE && c == self->quotechar) { - /* save "" as " */ - - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - } - else if (c == self->delimiter) { - // End of field. 
End of line not reached yet - - END_FIELD(); - self->state = START_FIELD; - } - else if (c == '\n') { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } - else if (c == '\r') { - END_FIELD(); - self->state = EAT_CRNL; - } - else if (!self->strict) { - PUSH_CHAR(c); - self->state = IN_FIELD; - } - else { - self->error_msg = (char*) malloc(50); - sprintf(self->error_msg, "'%c' expected after '%c'", - self->delimiter, self->quotechar); - goto parsingerror; - } - break; - - case EAT_COMMENT: - if (c == '\n') { - END_LINE(); - } else if (c == '\r') { - self->state = EAT_CRNL; - } - break; - - case EAT_CRNL: - if (c == '\n') { - END_LINE(); - /* self->state = START_RECORD; */ - } else if (c == self->delimiter){ - // Handle \r-delimited files - END_LINE_AND_FIELD_STATE(START_FIELD); - } else { - /* \r line terminator */ - - /* UGH. we don't actually want to consume the token. fix this later */ - self->stream_len = slen; - if (end_line(self) < 0) { - goto parsingerror; - } - stream = self->stream + self->stream_len; - slen = self->stream_len; - self->state = START_RECORD; - - /* HACK, let's try this one again */ - --i; buf--; - if (line_limit > 0 && self->lines == start_lines + line_limit) { - goto linelimit; - } - - } - break; - - case EAT_CRNL_NOP: /* inside an ignored comment line */ - self->state = START_RECORD; - /* \r line terminator -- parse this character again */ - if (c != '\n' && c != self->delimiter) { - --i; - --buf; - } - break; - default: - break; - - } - } - - _TOKEN_CLEANUP(); - - TRACE(("Finished tokenizing input\n")) - - return 0; - -parsingerror: - i++; - _TOKEN_CLEANUP(); - - return -1; - -linelimit: - i++; - _TOKEN_CLEANUP(); - - return 0; -} - -/* custom line terminator */ -int tokenize_delim_customterm(parser_t *self, size_t line_limit) -{ - - int i, slen, start_lines; - long maxstreamsize; - char c; - char *stream; - char *buf = self->data + self->datapos; - - - start_lines = self->lines; - - if (make_stream_space(self, 
self->datalen - self->datapos) < 0) { - self->error_msg = "out of memory"; - return -1; - } - - stream = self->stream + self->stream_len; - slen = self->stream_len; - maxstreamsize = self->stream_cap; - - TRACE(("%s\n", buf)); - - for (i = self->datapos; i < self->datalen; ++i) - { - // Next character in file - c = *buf++; - - TRACE(("tokenize_delim_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); - - switch(self->state) { - - case SKIP_LINE: -// TRACE(("tokenize_delim_customterm SKIP_LINE %c, state %d\n", c, self->state)); - if (c == self->lineterminator) { - END_LINE(); - } - break; - - case START_RECORD: - // start of record - if (skip_this_line(self, self->file_lines)) { - self->state = SKIP_LINE; - if (c == self->lineterminator) { - END_LINE(); - } - break; - } - else if (c == self->lineterminator) { - // \n\r possible? - if (self->skip_empty_lines) - { - self->file_lines++; - } - else - { - END_LINE(); - } - break; - } - else if (c == self->commentchar) { - self->state = EAT_LINE_COMMENT; - break; - } - else if (IS_WHITESPACE(c) && c != self->delimiter && self->skip_empty_lines) - { - self->state = WHITESPACE_LINE; - break; - } - /* normal character - handle as START_FIELD */ - self->state = START_FIELD; - /* fallthru */ - case START_FIELD: - /* expecting field */ - if (c == self->lineterminator) { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - /* start quoted field */ - self->state = IN_QUOTED_FIELD; - } - else if (c == self->escapechar) { - /* possible escaped character */ - self->state = ESCAPED_CHAR; - } - else if (c == ' ' && self->skipinitialspace) - /* ignore space at start of field */ - ; - else if (c == self->delimiter) { - /* save empty field */ - END_FIELD(); - } - else if (c == self->commentchar) { - END_FIELD(); - self->state = EAT_COMMENT; - } - else { - /* 
begin new unquoted field */ - if (self->quoting == QUOTE_NONNUMERIC) - self->numeric_field = 1; - - // TRACE(("pushing %c", c)); - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case WHITESPACE_LINE: // check if line is whitespace-only - if (c == self->lineterminator) { - self->file_lines++; - self->state = START_RECORD; // ignore empty line - } - else if (IS_WHITESPACE(c) && c != self->delimiter) - ; - else { // backtrack - /* We have to use i + 1 because buf has been incremented but not i */ - do { - --buf; - --i; - } while (i + 1 > self->datapos && *buf != self->lineterminator); - - if (*buf == self->lineterminator) // reached a newline rather than the beginning - { - ++buf; // move pointer to first char after newline - ++i; - } - self->state = START_FIELD; - } - break; - - case ESCAPED_CHAR: - /* if (c == '\0') */ - /* c = '\n'; */ - - PUSH_CHAR(c); - self->state = IN_FIELD; - break; - - case IN_FIELD: - /* in unquoted field */ - if (c == self->lineterminator) { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } - else if (c == self->escapechar) { - /* possible escaped character */ - self->state = ESCAPED_CHAR; - } - else if (c == self->delimiter) { - // End of field. 
End of line not reached yet - END_FIELD(); - self->state = START_FIELD; - } - else if (c == self->commentchar) { - END_FIELD(); - self->state = EAT_COMMENT; - } - else { - /* normal character - save in field */ - PUSH_CHAR(c); - } - break; - - case IN_QUOTED_FIELD: - /* in quoted field */ - if (c == self->escapechar) { - /* Possible escape character */ - self->state = ESCAPE_IN_QUOTED_FIELD; - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - if (self->doublequote) { - /* doublequote; " represented by "" */ - self->state = QUOTE_IN_QUOTED_FIELD; - } - else { - /* end of quote part of field */ - self->state = IN_FIELD; - } - } - else { - /* normal character - save in field */ - PUSH_CHAR(c); - } - break; - - case ESCAPE_IN_QUOTED_FIELD: - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - break; - - case QUOTE_IN_QUOTED_FIELD: - /* doublequote - seen a quote in an quoted field */ - if (self->quoting != QUOTE_NONE && c == self->quotechar) { - /* save "" as " */ +#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - } - else if (c == self->delimiter) { - // End of field. 
End of line not reached yet +// don't parse '\r' with a custom line terminator +#define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r')) - END_FIELD(); - self->state = START_FIELD; - } - else if (c == self->lineterminator) { - END_FIELD(); - END_LINE(); - /* self->state = START_RECORD; */ - } - else if (!self->strict) { - PUSH_CHAR(c); - self->state = IN_FIELD; - } - else { - self->error_msg = (char*) malloc(50); - sprintf(self->error_msg, "'%c' expected after '%c'", - self->delimiter, self->quotechar); - goto parsingerror; - } - break; +#define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \ + self->skipinitialspace)) - case EAT_LINE_COMMENT: - if (c == self->lineterminator) { - self->file_lines++; - self->state = START_RECORD; - } - break; +// applied when in a field +#define IS_DELIMITER(c) ((!self->delim_whitespace && c == self->delimiter) || \ + (self->delim_whitespace && IS_WHITESPACE(c))) - case EAT_COMMENT: - if (c == self->lineterminator) { - END_LINE(); - } - break; +#define _TOKEN_CLEANUP() \ + self->stream_len = slen; \ + self->datapos = i; \ + TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen)); - default: - break; - } +int skip_this_line(parser_t *self, int64_t rownum) { + if (self->skipset != NULL) { + return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != + ((kh_int64_t*)self->skipset)->n_buckets ); + } + else { + return ( rownum <= self->skip_first_N_rows ); } - - _TOKEN_CLEANUP(); - - TRACE(("Finished tokenizing input\n")) - - return 0; - -parsingerror: - i++; - _TOKEN_CLEANUP(); - - return -1; - -linelimit: - i++; - _TOKEN_CLEANUP(); - - return 0; } -int tokenize_whitespace(parser_t *self, size_t line_limit) +int tokenize_bytes(parser_t *self, size_t line_limit) { int i, slen, start_lines; long maxstreamsize; @@ -1336,50 +747,66 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) for (i = self->datapos; i < self->datalen; ++i) { - // Next character in file + 
// next character in file c = *buf++; - TRACE(("tokenize_whitespace - Iter: %d Char: %c Line %d field_count %d, state %d\n", + TRACE(("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, state %d\n", i, c, self->file_lines + 1, self->line_fields[self->lines], self->state)); switch(self->state) { case SKIP_LINE: -// TRACE(("tokenize_whitespace SKIP_LINE %c, state %d\n", c, self->state)); - if (c == '\n') { + TRACE(("tokenize_bytes SKIP_LINE 0x%x, state %d\n", c, self->state)); + if (IS_TERMINATOR(c)) { END_LINE(); - } else if (c == '\r') { + } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; } break; case WHITESPACE_LINE: - if (c == '\n') { + if (IS_TERMINATOR(c)) { self->file_lines++; self->state = START_RECORD; break; - } - else if (c == '\r') { + } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; break; + } else if (!self->delim_whitespace) { + if (IS_WHITESPACE(c) && c != self->delimiter) { + ; + } else { // backtrack + // use i + 1 because buf has been incremented but not i + do { + --buf; + --i; + } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); + + // reached a newline rather than the beginning + if (IS_TERMINATOR(*buf)) { + ++buf; // move pointer to first char after newline + ++i; + } + self->state = START_FIELD; + } + break; } // fall through case EAT_WHITESPACE: - if (c == '\n') { + if (IS_TERMINATOR(c)) { END_LINE(); self->state = START_RECORD; break; - } else if (c == '\r') { + } else if (IS_CARRIAGE(c)) { self->state = EAT_CRNL; break; } else if (!IS_WHITESPACE(c)) { - // END_FIELD(); self->state = START_FIELD; - // Fall through to subsequent state + // fall through to subsequent state } else { // if whitespace char, keep slurping break; @@ -1389,237 +816,252 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) // start of record if (skip_this_line(self, self->file_lines)) { self->state = SKIP_LINE; - if (c == '\n') { + if (IS_TERMINATOR(c)) { END_LINE(); } break; - } else if 
(c == '\n') { - if (self->skip_empty_lines) + } else if (IS_TERMINATOR(c)) { // \n\r possible? - { + if (self->skip_empty_lines) { self->file_lines++; - } - else - { + } else { END_LINE(); } break; - } else if (c == '\r') { - if (self->skip_empty_lines) - { + } else if (IS_CARRIAGE(c)) { + if (self->skip_empty_lines) { self->file_lines++; self->state = EAT_CRNL_NOP; - } - else + } else { self->state = EAT_CRNL; - break; - } else if (IS_WHITESPACE(c)) { - if (self->skip_empty_lines) - self->state = WHITESPACE_LINE; - else - self->state = EAT_WHITESPACE; + } break; } else if (c == self->commentchar) { self->state = EAT_LINE_COMMENT; break; - } else { - /* normal character - handle as START_FIELD */ - self->state = START_FIELD; + } else if (IS_WHITESPACE(c)) { + if (self->delim_whitespace) { + if (self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + } else { + self->state = EAT_WHITESPACE; + } + break; + } else if (c != self->delimiter && self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + break; + } + // fall through } - /* fallthru */ + + // normal character - fall through + // to handle as START_FIELD + self->state = START_FIELD; + case START_FIELD: - /* expecting field */ - if (c == '\n') { + // expecting field + if (IS_TERMINATOR(c)) { END_FIELD(); END_LINE(); - /* self->state = START_RECORD; */ - } else if (c == '\r') { + } else if (IS_CARRIAGE(c)) { END_FIELD(); self->state = EAT_CRNL; - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { - /* start quoted field */ + } else if (IS_QUOTE(c)) { + // start quoted field self->state = IN_QUOTED_FIELD; - } - else if (c == self->escapechar) { - /* possible escaped character */ + } else if (c == self->escapechar) { + // possible escaped character self->state = ESCAPED_CHAR; - } - /* else if (c == ' ' && self->skipinitialspace) */ - /* /\* ignore space at start of field *\/ */ - /* ; */ - else if (IS_WHITESPACE(c)) { - self->state = EAT_WHITESPACE; - } - else if (c == 
self->commentchar) { + } else if (IS_SKIPPABLE_SPACE(c)) { + // ignore space at start of field + ; + } else if (IS_DELIMITER(c)) { + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + // save empty field + END_FIELD(); + } + } else if (c == self->commentchar) { END_FIELD(); self->state = EAT_COMMENT; - } - else { - /* begin new unquoted field */ - if (self->quoting == QUOTE_NONNUMERIC) - self->numeric_field = 1; + } else { + // begin new unquoted field + // if (self->delim_whitespace && \ + // self->quoting == QUOTE_NONNUMERIC) { + // self->numeric_field = 1; + // } - // TRACE(("pushing %c", c)); PUSH_CHAR(c); self->state = IN_FIELD; } break; + case ESCAPED_CHAR: + PUSH_CHAR(c); + self->state = IN_FIELD; + break; + case EAT_LINE_COMMENT: - if (c == '\n') { + if (IS_TERMINATOR(c)) { self->file_lines++; self->state = START_RECORD; - } else if (c == '\r') { + } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; } break; - case ESCAPED_CHAR: - /* if (c == '\0') */ - /* c = '\n'; */ - - PUSH_CHAR(c); - self->state = IN_FIELD; - break; - case IN_FIELD: - /* in unquoted field */ - if (c == '\n') { + // in unquoted field + if (IS_TERMINATOR(c)) { END_FIELD(); END_LINE(); - /* self->state = START_RECORD; */ - } else if (c == '\r') { + } else if (IS_CARRIAGE(c)) { END_FIELD(); self->state = EAT_CRNL; - } - else if (c == self->escapechar) { - /* possible escaped character */ + } else if (c == self->escapechar) { + // possible escaped character self->state = ESCAPED_CHAR; - } - else if (IS_WHITESPACE(c)) { - // End of field. 
End of line not reached yet + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet END_FIELD(); - self->state = EAT_WHITESPACE; - } - else if (c == self->commentchar) { + + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } + } else if (c == self->commentchar) { END_FIELD(); self->state = EAT_COMMENT; - } - else { - /* normal character - save in field */ + } else { + // normal character - save in field PUSH_CHAR(c); } break; case IN_QUOTED_FIELD: - /* in quoted field */ + // in quoted field if (c == self->escapechar) { - /* Possible escape character */ + // possible escape character self->state = ESCAPE_IN_QUOTED_FIELD; - } - else if (c == self->quotechar && - self->quoting != QUOTE_NONE) { + } else if (IS_QUOTE(c)) { if (self->doublequote) { - /* doublequote; " represented by "" */ + // double quote - " represented by "" self->state = QUOTE_IN_QUOTED_FIELD; - } - else { - /* end of quote part of field */ + } else { + // end of quote part of field self->state = IN_FIELD; } - } - else { - /* normal character - save in field */ + } else { + // normal character - save in field PUSH_CHAR(c); } break; case ESCAPE_IN_QUOTED_FIELD: - /* if (c == '\0') */ - /* c = '\n'; */ - PUSH_CHAR(c); self->state = IN_QUOTED_FIELD; break; case QUOTE_IN_QUOTED_FIELD: - /* doublequote - seen a quote in an quoted field */ - if (self->quoting != QUOTE_NONE && c == self->quotechar) { - /* save "" as " */ + // double quote - seen a quote in an quoted field + if (IS_QUOTE(c)) { + // save "" as " PUSH_CHAR(c); self->state = IN_QUOTED_FIELD; - } - else if (IS_WHITESPACE(c)) { - // End of field. 
End of line not reached yet - + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet END_FIELD(); - self->state = EAT_WHITESPACE; - } - else if (c == '\n') { + + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } + } else if (IS_TERMINATOR(c)) { END_FIELD(); END_LINE(); - /* self->state = START_RECORD; */ - } - else if (c == '\r') { + } else if (IS_CARRIAGE(c)) { END_FIELD(); self->state = EAT_CRNL; - } - else if (!self->strict) { + } else if (!self->strict) { PUSH_CHAR(c); self->state = IN_FIELD; - } - else { + } else { self->error_msg = (char*) malloc(50); - sprintf(self->error_msg, "'%c' expected after '%c'", - self->delimiter, self->quotechar); + sprintf(self->error_msg, + "delimiter expected after " + "quote in quote"); goto parsingerror; } break; + case EAT_COMMENT: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->state = EAT_CRNL; + } + break; + + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' case EAT_CRNL: if (c == '\n') { END_LINE(); - /* self->state = START_RECORD; */ - } else if (IS_WHITESPACE(c)){ - // Handle \r-delimited files - END_LINE_STATE(EAT_WHITESPACE); + } else if (IS_DELIMITER(c)){ + + if (self->delim_whitespace) { + END_LINE_STATE(EAT_WHITESPACE); + } else { + // Handle \r-delimited files + END_LINE_AND_FIELD_STATE(START_FIELD); + } } else { - /* XXX - * first character of a new record--need to back up and reread - * to handle properly... - */ - i--; buf--; /* back up one character (HACK!) */ - END_LINE_STATE(START_RECORD); + if (self->delim_whitespace) { + /* XXX + * first character of a new record--need to back up and reread + * to handle properly... + */ + i--; buf--; // back up one character (HACK!) + END_LINE_STATE(START_RECORD); + } else { + // \r line terminator + // UGH. we don't actually want + // to consume the token. 
fix this later + self->stream_len = slen; + if (end_line(self) < 0) { + goto parsingerror; + } + + stream = self->stream + self->stream_len; + slen = self->stream_len; + self->state = START_RECORD; + + --i; buf--; // let's try this character again (HACK!) + if (line_limit > 0 && self->lines == start_lines + line_limit) { + goto linelimit; + } + } } break; + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' case EAT_CRNL_NOP: // inside an ignored comment line self->state = START_RECORD; - /* \r line terminator -- parse this character again */ - if (c != '\n' && c != self->delimiter) { + // \r line terminator -- parse this character again + if (c != '\n' && !IS_DELIMITER(c)) { --i; --buf; } break; - - case EAT_COMMENT: - if (c == '\n') { - END_LINE(); - } else if (c == '\r') { - self->state = EAT_CRNL; - } - break; - default: break; - - } - } _TOKEN_CLEANUP(); @@ -1641,7 +1083,6 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) return 0; } - static int parser_handle_eof(parser_t *self) { TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) @@ -1845,19 +1286,9 @@ void debug_print_parser(parser_t *self) { */ int _tokenize_helper(parser_t *self, size_t nrows, int all) { - parser_op tokenize_bytes; - int status = 0; int start_lines = self->lines; - if (self->delim_whitespace) { - tokenize_bytes = tokenize_whitespace; - } else if (self->lineterminator == '\0') { - tokenize_bytes = tokenize_delimited; - } else { - tokenize_bytes = tokenize_delim_customterm; - } - if (self->state == FINISHED) { return 0; } @@ -1884,12 +1315,9 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", self->datalen - self->datapos, self->datalen, self->datapos)); - /* TRACE(("sourcetype: %c, status: %d\n", self->sourcetype, status)); */ status = tokenize_bytes(self, nrows); - /* debug_print_parser(self); */ - if (status < 0) 
{ // XXX TRACE(("_tokenize_helper: Status %d returned from tokenize_bytes, breaking\n",