diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 88b918e9cc515..b73b70caf1597 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -119,24 +119,24 @@ cdef extern from "parser/tokenizer.h": # where to write out tokenized data char *stream - int64_t stream_len - int64_t stream_cap + uint64_t stream_len + uint64_t stream_cap # Store words in (potentially ragged) matrix for now, hmm char **words int64_t *word_starts # where we are in the stream - int64_t words_len - int64_t words_cap - int64_t max_words_cap # maximum word cap encountered + uint64_t words_len + uint64_t words_cap + uint64_t max_words_cap # maximum word cap encountered char *pword_start # pointer to stream start of current field int64_t word_start # position start of current field int64_t *line_start # position in words for start of line int64_t *line_fields # Number of fields in each line - int64_t lines # Number of lines observed - int64_t file_lines # Number of lines observed (with bad/skipped) - int64_t lines_cap # Vector capacity + uint64_t lines # Number of lines observed + uint64_t file_lines # Number of lines observed (with bad/skipped) + uint64_t lines_cap # Vector capacity # Tokenizing stuff ParserState state @@ -168,7 +168,7 @@ cdef extern from "parser/tokenizer.h": int header # Boolean: 1: has header, 0: no header int64_t header_start # header row start - int64_t header_end # header row end + uint64_t header_end # header row end void *skipset PyObject *skipfunc diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 723bf56a79512..3146e49455609 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -71,9 +71,9 @@ static void free_if_not_null(void **ptr) { */ -static void *grow_buffer(void *buffer, int64_t length, int64_t *capacity, +static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity, int64_t space, int64_t elsize, int *error) { - int64_t cap = *capacity; 
+ uint64_t cap = *capacity; void *newbuffer = buffer; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? @@ -248,7 +248,7 @@ void parser_del(parser_t *self) { } static int make_stream_space(parser_t *self, size_t nbytes) { - int64_t i, cap, length; + uint64_t i, cap, length; int status; void *orig_ptr, *newptr; @@ -263,7 +263,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { ("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", nbytes)) self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, - (int64_t*)&self->stream_cap, nbytes * 2, + &self->stream_cap, nbytes * 2, sizeof(char), &status); TRACE( ("make_stream_space: self->stream=%p, self->stream_len = %zu, " @@ -305,7 +305,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->words = (char **)grow_buffer((void *)self->words, length, - (int64_t*)&self->words_cap, nbytes, + &self->words_cap, nbytes, sizeof(char *), &status); TRACE( ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " @@ -336,7 +336,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { cap = self->lines_cap; self->line_start = (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, - (int64_t*)&self->lines_cap, nbytes, + &self->lines_cap, nbytes, sizeof(int64_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", @@ -471,7 +471,7 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= (int64_t) self->header_end + 1) && + if (!(self->lines <= self->header_end + 1) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -507,7 +507,7 @@ static int end_line(parser_t *self) { } } else { // missing trailing delimiters - if ((self->lines >= (int64_t) self->header_end + 1) && + if ((self->lines >= self->header_end + 1) && fields < ex_fields) { // might overrun the buffer when closing fields if 
(make_stream_space(self, ex_fields - fields) < 0) { @@ -651,7 +651,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + (int64_t)line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ goto linelimit; \ } @@ -666,7 +666,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { stream = self->stream + self->stream_len; \ slen = self->stream_len; \ self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + (int64_t)line_limit) { \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ goto linelimit; \ } @@ -737,7 +737,8 @@ int skip_this_line(parser_t *self, int64_t rownum) { int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) { - int64_t i, slen; + int64_t i; + uint64_t slen; int should_skip; char c; char *stream; @@ -1203,7 +1204,8 @@ static int parser_handle_eof(parser_t *self) { } int parser_consume_rows(parser_t *self, size_t nrows) { - int64_t i, offset, word_deletions, char_count; + int64_t offset, word_deletions; + uint64_t char_count, i; if (nrows > self->lines) { nrows = self->lines; @@ -1229,6 +1231,8 @@ int parser_consume_rows(parser_t *self, size_t nrows) { self->stream_len -= char_count; /* move token metadata */ + // Note: We should always have word_deletions <= words_len, so this + // subtraction will remain appropriately-typed. for (i = 0; i < self->words_len - word_deletions; ++i) { offset = i + word_deletions; @@ -1242,6 +1246,8 @@ int parser_consume_rows(parser_t *self, size_t nrows) { self->word_start -= char_count; /* move line metadata */ + // Note: We should always have self->lines - nrows + 1 >= 0, so this + // subtraction will remain appropriately-typed. 
for (i = 0; i < self->lines - nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; @@ -1265,7 +1271,7 @@ int parser_trim_buffers(parser_t *self) { size_t new_cap; void *newptr; - int64_t i; + uint64_t i; /** * Before we free up space and trim, we should diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index b6d5d6937f4db..66ef1887d6bc3 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -104,24 +104,24 @@ typedef struct parser_t { // where to write out tokenized data char *stream; - int64_t stream_len; - int64_t stream_cap; + uint64_t stream_len; + uint64_t stream_cap; // Store words in (potentially ragged) matrix for now, hmm char **words; int64_t *word_starts; // where we are in the stream - int64_t words_len; - int64_t words_cap; - int64_t max_words_cap; // maximum word cap encountered + uint64_t words_len; + uint64_t words_cap; + uint64_t max_words_cap; // maximum word cap encountered char *pword_start; // pointer to stream start of current field int64_t word_start; // position start of current field int64_t *line_start; // position in words for start of line int64_t *line_fields; // Number of fields in each line - int64_t lines; // Number of (good) lines observed - int64_t file_lines; // Number of lines (including bad or skipped) - int64_t lines_cap; // Vector capacity + uint64_t lines; // Number of (good) lines observed + uint64_t file_lines; // Number of lines (including bad or skipped) + uint64_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -153,7 +153,7 @@ typedef struct parser_t { int header; // Boolean: 1: has header, 0: no header int64_t header_start; // header row start - int64_t header_end; // header row end + uint64_t header_end; // header row end void *skipset; PyObject *skipfunc;