Skip to content

Commit c2bc5dd

Browse files
committed
Speed up comparisons for special symbols during tokenizing csv row
1 parent 75f053c commit c2bc5dd

File tree

1 file changed

+18
-8
lines changed

1 file changed

+18
-8
lines changed

pandas/_libs/src/parser/tokenizer.c

+18-8
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ static int push_char(register parser_t *self, char c) {
381381
return 0;
382382
}
383383

384-
int PANDAS_INLINE end_field(register parser_t *self) {
384+
int PANDAS_INLINE end_field(register parser_t *self) {
385385
// XXX cruft
386386
if (self->words_len >= self->words_cap) {
387387
TRACE(
@@ -677,18 +677,16 @@ static int parser_buffer_bytes(register parser_t *self, size_t nbytes) {
677677
#define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))
678678

679679
#define IS_TERMINATOR(c) \
680-
((self->lineterminator == '\0' && c == '\n') || \
681-
(self->lineterminator != '\0' && c == self->lineterminator))
680+
(c == line_terminator)
682681

683682
#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))
684683

685684
// don't parse '\r' with a custom line terminator
686-
#define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r'))
685+
#define IS_CARRIAGE(c) (c == carriage_symbol)
687686

688-
#define IS_COMMENT_CHAR(c) \
689-
((self->commentchar != '\0' && c == self->commentchar))
687+
#define IS_COMMENT_CHAR(c) (c == comment_symbol)
690688

691-
#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar))
689+
#define IS_ESCAPE_CHAR(c) (c == escape_symbol)
692690

693691
#define IS_SKIPPABLE_SPACE(c) \
694692
((!self->delim_whitespace && c == ' ' && self->skipinitialspace))
@@ -739,13 +737,25 @@ int skip_this_line(register parser_t *self, int64_t rownum) {
739737
}
740738
}
741739

742-
int tokenize_bytes(register parser_t *self, size_t line_limit, int64_t start_lines) {
740+
int tokenize_bytes(register parser_t *self,
741+
size_t line_limit, int64_t start_lines) {
743742
int64_t i, slen;
744743
int should_skip;
745744
char c;
746745
char *stream;
747746
char *buf = self->data + self->datapos;
748747

748+
const char line_terminator = (self->lineterminator == '\0') ?
749+
'\n' : self->lineterminator;
750+
751+
// 1000 is something that couldn't fit in "char"
752+
// thus comparing a char to it would always be "false"
753+
const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000;
754+
const int comment_symbol = (self->commentchar != '\0') ?
755+
self->commentchar : 1000;
756+
const int escape_symbol = (self->escapechar != '\0') ?
757+
self->escapechar : 1000;
758+
749759
if (make_stream_space(self, self->datalen - self->datapos) < 0) {
750760
int64_t bufsize = 100;
751761
self->error_msg = (char *)malloc(bufsize);

0 commit comments

Comments
 (0)