Commit feccd27

BUG: Don't over-optimize memory with jagged CSV

With jagged CSVs, we risk freeing memory too eagerly: earlier chunks may have required much larger row buffers than the rows in subsequent chunks would suggest, so trimming down to the current chunk discards capacity we still need. Closes gh-23509.
1 parent c6366f5 commit feccd27
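
For illustration, here is a minimal reproduction distilled from the regression test added below; the row layout and chunk size come straight from that test, the rest is ordinary pandas usage. Before this fix, the C engine could fail with a buffer-overflow parser error on this pattern:

    # Seven 1-field rows followed by one 10-field row, read in chunks of 4.
    # Pre-fix, buffers trimmed after the narrow rows could be regrown too
    # small once the wide final row arrived.
    from io import StringIO

    import pandas as pd

    data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
    reader = pd.read_csv(StringIO(data), names=range(10), chunksize=4)
    result = pd.concat(reader)
    print(result.shape)  # (8, 10)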

5 files changed: +33 −2 lines

doc/source/whatsnew/v0.24.0.txt (+1)

@@ -1276,6 +1276,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
 - :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
 - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
+- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
 - :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
 - :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
 - :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)

pandas/_libs/parsers.pyx (+1)

@@ -132,6 +132,7 @@ cdef extern from "parser/tokenizer.h":
         int64_t *word_starts       # where we are in the stream
         int64_t words_len
         int64_t words_cap
+        int64_t max_words_cap      # maximum word cap encountered
 
         char *pword_start          # pointer to stream start of current field
         int64_t word_start         # position start of current field

pandas/_libs/src/parser/tokenizer.c (+14 −2)

@@ -197,6 +197,7 @@ int parser_init(parser_t *self) {
     sz = sz ? sz : 1;
     self->words = (char **)malloc(sz * sizeof(char *));
     self->word_starts = (int64_t *)malloc(sz * sizeof(int64_t));
+    self->max_words_cap = sz;
     self->words_cap = sz;
     self->words_len = 0;
 
@@ -247,7 +248,7 @@ void parser_del(parser_t *self) {
 }
 
 static int make_stream_space(parser_t *self, size_t nbytes) {
-    int64_t i, cap;
+    int64_t i, cap, length;
     int status;
     void *orig_ptr, *newptr;
 
@@ -287,8 +288,15 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
     */
 
     cap = self->words_cap;
+
+    if (self->words_len + nbytes < self->max_words_cap) {
+        length = self->max_words_cap - nbytes;
+    } else {
+        length = self->words_len;
+    }
+
     self->words =
-        (char **)grow_buffer((void *)self->words, self->words_len,
+        (char **)grow_buffer((void *)self->words, length,
                              (int64_t*)&self->words_cap, nbytes,
                              sizeof(char *), &status);
     TRACE(
@@ -1241,6 +1249,10 @@ int parser_trim_buffers(parser_t *self) {
 
     int64_t i;
 
+    if (self->words_cap > self->max_words_cap) {
+        self->max_words_cap = self->words_cap;
+    }
+
     /* trim words, word_starts */
     new_cap = _next_pow2(self->words_len) + 1;
     if (new_cap < self->words_cap) {

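To make the effect of these hunks concrete, here is a schematic model in plain Python (a sketch, not pandas code; the names mirror the C fields). Since grow_buffer doubles words_cap until it can hold length + nbytes entries, the new branch in make_stream_space effectively targets max(max_words_cap, words_len + nbytes), so a trim between chunks can no longer leave the word buffer smaller than the widest row already seen:

    def words_target(words_len, nbytes, max_words_cap):
        # Mirror the new branch in make_stream_space: choose the `length`
        # handed to grow_buffer so capacity regrows at least to the
        # high-water mark recorded by parser_trim_buffers.
        if words_len + nbytes < max_words_cap:
            length = max_words_cap - nbytes
        else:
            length = words_len
        return length + nbytes  # effective capacity target

    # Narrow chunk (4 words, 2 incoming) after a wide row (cap 40): keep 40.
    assert words_target(4, 2, 40) == 40
    # A genuinely new maximum still grows past the old high-water mark.
    assert words_target(30, 20, 40) == 50
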
pandas/_libs/src/parser/tokenizer.h (+1)

@@ -142,6 +142,7 @@ typedef struct parser_t {
     int64_t *word_starts;  // where we are in the stream
     int64_t words_len;
     int64_t words_cap;
+    int64_t max_words_cap;  // maximum word cap encountered
 
     char *pword_start;     // pointer to stream start of current field
     int64_t word_start;    // position start of current field

pandas/tests/io/parser/common.py (+16)

@@ -459,6 +459,22 @@ def test_read_chunksize_generated_index(self):
 
         tm.assert_frame_equal(pd.concat(reader), df)
 
+    def test_read_chunksize_jagged_names(self):
+        # see gh-23509
+        data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
+        reader = self.read_csv(StringIO(data), names=range(10), chunksize=4)
+
+        expected = DataFrame()
+
+        for i in range(10):
+            if i == 0:
+                expected[i] = [0] * 8
+            else:
+                expected[i] = [np.nan] * 7 + [0]
+
+        result = pd.concat(reader)
+        tm.assert_frame_equal(result, expected)
+
     def test_read_text_list(self):
         data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
         as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar',

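For reference, the test payload expands to the jagged layout below (a walkthrough, not part of the commit). Reading it with chunksize=4 makes the parser trim and regrow its buffers across rows of very different widths, which is exactly the sequence this commit guards against:

    data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
    print(data)
    # 0
    # 0
    # 0
    # 0
    # 0
    # 0
    # 0
    # 0,0,0,0,0,0,0,0,0,0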