Skip to content

Speed up tokenizing of a row in csv and xstrtod parsing #25784

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ Performance Improvements
- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)

- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)

.. _whatsnew_0250.bug_fixes:

Expand Down
92 changes: 59 additions & 33 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) {
self->line_start = parser->line_start + start;
}

coliter_t *coliter_new(parser_t *self, int i) {
coliter_t *coliter_new(register parser_t *self, int i) {
// column i, starting at 0
coliter_t *iter = (coliter_t *)malloc(sizeof(coliter_t));

Expand Down Expand Up @@ -97,7 +97,7 @@ static void *grow_buffer(void *buffer, int64_t length, int64_t *capacity,
return newbuffer;
}

void parser_set_default_options(parser_t *self) {
void parser_set_default_options(register parser_t *self) {
self->decimal = '.';
self->sci = 'E';

Expand Down Expand Up @@ -131,11 +131,11 @@ void parser_set_default_options(parser_t *self) {
self->skip_footer = 0;
}

int get_parser_memory_footprint(parser_t *self) { return 0; }
int get_parser_memory_footprint(register parser_t *self) { return 0; }

parser_t *parser_new() { return (parser_t *)calloc(1, sizeof(parser_t)); }

int parser_clear_data_buffers(parser_t *self) {
int parser_clear_data_buffers(register parser_t *self) {
free_if_not_null((void *)&self->stream);
free_if_not_null((void *)&self->words);
free_if_not_null((void *)&self->word_starts);
Expand All @@ -144,7 +144,7 @@ int parser_clear_data_buffers(parser_t *self) {
return 0;
}

int parser_cleanup(parser_t *self) {
int parser_cleanup(register parser_t *self) {
int status = 0;

// XXX where to put this
Expand All @@ -170,7 +170,7 @@ int parser_cleanup(parser_t *self) {
return status;
}

int parser_init(parser_t *self) {
int parser_init(register parser_t *self) {
int64_t sz;

/*
Expand Down Expand Up @@ -240,16 +240,16 @@ int parser_init(parser_t *self) {
return 0;
}

void parser_free(parser_t *self) {
void parser_free(register parser_t *self) {
// opposite of parser_init
parser_cleanup(self);
}

void parser_del(parser_t *self) {
void parser_del(register parser_t *self) {
free(self);
}

static int make_stream_space(parser_t *self, size_t nbytes) {
static int make_stream_space(register parser_t *self, size_t nbytes) {
int64_t i, cap, length;
int status;
void *orig_ptr, *newptr;
Expand Down Expand Up @@ -363,7 +363,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
return 0;
}

static int push_char(parser_t *self, char c) {
static int push_char(register parser_t *self, char c) {
TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n",
self->stream_len + 1, c, self->stream_cap))
if (self->stream_len >= self->stream_cap) {
Expand All @@ -381,7 +381,7 @@ static int push_char(parser_t *self, char c) {
return 0;
}

int PANDAS_INLINE end_field(parser_t *self) {
int PANDAS_INLINE end_field(register parser_t *self) {
// XXX cruft
if (self->words_len >= self->words_cap) {
TRACE(
Expand Down Expand Up @@ -419,7 +419,7 @@ int PANDAS_INLINE end_field(parser_t *self) {
return 0;
}

static void append_warning(parser_t *self, const char *msg) {
static void append_warning(register parser_t *self, const char *msg) {
int64_t ex_length;
int64_t length = strlen(msg);
void *newptr;
Expand All @@ -437,7 +437,7 @@ static void append_warning(parser_t *self, const char *msg) {
}
}

static int end_line(parser_t *self) {
static int end_line(register parser_t *self) {
char *msg;
int64_t fields;
int ex_fields = self->expected_fields;
Expand Down Expand Up @@ -556,7 +556,7 @@ static int end_line(parser_t *self) {
return 0;
}

int parser_add_skiprow(parser_t *self, int64_t row) {
int parser_add_skiprow(register parser_t *self, int64_t row) {
khiter_t k;
kh_int64_t *set;
int ret = 0;
Expand All @@ -573,7 +573,7 @@ int parser_add_skiprow(parser_t *self, int64_t row) {
return 0;
}

int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
int parser_set_skipfirstnrows(register parser_t *self, int64_t nrows) {
// self->file_lines is zero based so subtract 1 from nrows
if (nrows > 0) {
self->skip_first_N_rows = nrows - 1;
Expand All @@ -582,7 +582,7 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
return 0;
}

static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
static int parser_buffer_bytes(register parser_t *self, size_t nbytes) {
int status;
size_t bytes_read;

Expand Down Expand Up @@ -677,18 +677,16 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
#define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))

#define IS_TERMINATOR(c) \
((self->lineterminator == '\0' && c == '\n') || \
(self->lineterminator != '\0' && c == self->lineterminator))
(c == line_terminator)

#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))

// don't parse '\r' with a custom line terminator
#define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r'))
#define IS_CARRIAGE(c) (c == carriage_symbol)

#define IS_COMMENT_CHAR(c) \
((self->commentchar != '\0' && c == self->commentchar))
#define IS_COMMENT_CHAR(c) (c == comment_symbol)

#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar))
#define IS_ESCAPE_CHAR(c) (c == escape_symbol)

#define IS_SKIPPABLE_SPACE(c) \
((!self->delim_whitespace && c == ' ' && self->skipinitialspace))
Expand All @@ -710,7 +708,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
self->datapos += 3; \
}

int skip_this_line(parser_t *self, int64_t rownum) {
int skip_this_line(register parser_t *self, int64_t rownum) {
int should_skip;
PyObject *result;
PyGILState_STATE state;
Expand Down Expand Up @@ -739,13 +737,25 @@ int skip_this_line(parser_t *self, int64_t rownum) {
}
}

int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) {
int tokenize_bytes(register parser_t *self,
size_t line_limit, int64_t start_lines) {
int64_t i, slen;
int should_skip;
char c;
char *stream;
char *buf = self->data + self->datapos;

const char line_terminator = (self->lineterminator == '\0') ?
'\n' : self->lineterminator;

// 1000 is a value that cannot be held in a "char",
// so comparing a char against it always evaluates to "false"
const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000;
const int comment_symbol = (self->commentchar != '\0') ?
self->commentchar : 1000;
const int escape_symbol = (self->escapechar != '\0') ?
self->escapechar : 1000;

if (make_stream_space(self, self->datalen - self->datapos) < 0) {
int64_t bufsize = 100;
self->error_msg = (char *)malloc(bufsize);
Expand Down Expand Up @@ -1149,7 +1159,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) {
return 0;
}

static int parser_handle_eof(parser_t *self) {
static int parser_handle_eof(register parser_t *self) {
int64_t bufsize = 100;

TRACE(
Expand Down Expand Up @@ -1194,7 +1204,7 @@ static int parser_handle_eof(parser_t *self) {
return 0;
}

int parser_consume_rows(parser_t *self, size_t nrows) {
int parser_consume_rows(register parser_t *self, size_t nrows) {
int64_t i, offset, word_deletions, char_count;

if (nrows > self->lines) {
Expand Down Expand Up @@ -1250,7 +1260,7 @@ static size_t _next_pow2(size_t sz) {
return result;
}

int parser_trim_buffers(parser_t *self) {
int parser_trim_buffers(register parser_t *self) {
/*
Free memory
*/
Expand Down Expand Up @@ -1353,7 +1363,7 @@ int parser_trim_buffers(parser_t *self) {
all : tokenize all the data vs. certain number of rows
*/

int _tokenize_helper(parser_t *self, size_t nrows, int all) {
int _tokenize_helper(register parser_t *self, size_t nrows, int all) {
int status = 0;
int64_t start_lines = self->lines;

Expand Down Expand Up @@ -1402,12 +1412,12 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
return status;
}

int tokenize_nrows(parser_t *self, size_t nrows) {
int tokenize_nrows(register parser_t *self, size_t nrows) {
int status = _tokenize_helper(self, nrows, 0);
return status;
}

int tokenize_all_rows(parser_t *self) {
int tokenize_all_rows(register parser_t *self) {
int status = _tokenize_helper(self, -1, 1);
return status;
}
Expand Down Expand Up @@ -1529,9 +1539,14 @@ int main(int argc, char *argv[]) {
// * Add tsep argument for thousands separator
//

// pessimistic but quick estimate: assume each decimal digit needs
// 4 bits of storage (the true requirement is ~3.32 bits per digit),
// so the resulting digit count is guaranteed to fit in an unsigned int
const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4;

double xstrtod(const char *str, char **endptr, char decimal, char sci,
char tsep, int skip_trailing) {
double number;
unsigned int i_number = 0;
int exponent;
int negative;
char *p = (char *)str;
Expand All @@ -1554,19 +1569,30 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
p++;
}

number = 0.;
exponent = 0;
num_digits = 0;
num_decimals = 0;

// Process string of digits.
while (isdigit_ascii(*p)) {
number = number * 10. + (*p - '0');
while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) {
i_number = i_number * 10 + (*p - '0');
p++;
num_digits++;

p += (tsep != '\0' && *p == tsep);
}
number = i_number;

if (num_digits > max_int_decimal_digits) {
// process what's left as double
while (isdigit_ascii(*p)) {
number = number * 10. + (*p - '0');
p++;
num_digits++;

p += (tsep != '\0' && *p == tsep);
}
}

// Process decimal part.
if (*p == decimal) {
Expand Down
30 changes: 15 additions & 15 deletions pandas/_libs/src/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -212,35 +212,35 @@ typedef struct coliter_t {
} coliter_t;

void coliter_setup(coliter_t *self, parser_t *parser, int i, int start);
coliter_t *coliter_new(parser_t *self, int i);
coliter_t *coliter_new(register parser_t *self, int i);

#define COLITER_NEXT(iter, word) \
do { \
const int64_t i = *iter.line_start++ + iter.col; \
word = i < *iter.line_start ? iter.words[i] : ""; \
#define COLITER_NEXT(iter, word) \
do { \
const int64_t i = *iter.line_start++ + iter.col; \
word = i >= *iter.line_start ? "" : iter.words[i]; \
} while (0)

parser_t *parser_new(void);

int parser_init(parser_t *self);
int parser_init(register parser_t *self);

int parser_consume_rows(parser_t *self, size_t nrows);
int parser_consume_rows(register parser_t *self, size_t nrows);

int parser_trim_buffers(parser_t *self);
int parser_trim_buffers(register parser_t *self);

int parser_add_skiprow(parser_t *self, int64_t row);
int parser_add_skiprow(register parser_t *self, int64_t row);

int parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
int parser_set_skipfirstnrows(register parser_t *self, int64_t nrows);

void parser_free(parser_t *self);
void parser_free(register parser_t *self);

void parser_del(parser_t *self);
void parser_del(register parser_t *self);

void parser_set_default_options(parser_t *self);
void parser_set_default_options(register parser_t *self);

int tokenize_nrows(parser_t *self, size_t nrows);
int tokenize_nrows(register parser_t *self, size_t nrows);

int tokenize_all_rows(parser_t *self);
int tokenize_all_rows(register parser_t *self);

// Have parsed / type-converted a chunk of data
// and want to free memory from the token stream
Expand Down