Skip to content

Commit cad6dc7

Browse files
shangyian authored and jreback committed
Preliminary format refactor (#20341)
1 parent cabc05f commit cad6dc7

File tree

7 files changed

+1111
-1056
lines changed

7 files changed

+1111
-1056
lines changed

pandas/core/frame.py

+16-16
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@
9292
import pandas.core.common as com
9393
import pandas.core.nanops as nanops
9494
import pandas.core.ops as ops
95-
import pandas.io.formats.format as fmt
9695
import pandas.io.formats.console as console
96+
import pandas.io.formats.format as fmt
9797
from pandas.io.formats.printing import pprint_thing
9898
import pandas.plotting._core as gfx
9999

@@ -1695,18 +1695,19 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
16951695
else:
16961696
tupleize_cols = False
16971697

1698-
formatter = fmt.CSVFormatter(self, path_or_buf,
1699-
line_terminator=line_terminator, sep=sep,
1700-
encoding=encoding,
1701-
compression=compression, quoting=quoting,
1702-
na_rep=na_rep, float_format=float_format,
1703-
cols=columns, header=header, index=index,
1704-
index_label=index_label, mode=mode,
1705-
chunksize=chunksize, quotechar=quotechar,
1706-
tupleize_cols=tupleize_cols,
1707-
date_format=date_format,
1708-
doublequote=doublequote,
1709-
escapechar=escapechar, decimal=decimal)
1698+
from pandas.io.formats.csvs import CSVFormatter
1699+
formatter = CSVFormatter(self, path_or_buf,
1700+
line_terminator=line_terminator, sep=sep,
1701+
encoding=encoding,
1702+
compression=compression, quoting=quoting,
1703+
na_rep=na_rep, float_format=float_format,
1704+
cols=columns, header=header, index=index,
1705+
index_label=index_label, mode=mode,
1706+
chunksize=chunksize, quotechar=quotechar,
1707+
tupleize_cols=tupleize_cols,
1708+
date_format=date_format,
1709+
doublequote=doublequote,
1710+
escapechar=escapechar, decimal=decimal)
17101711
formatter.save()
17111712

17121713
if path_or_buf is None:
@@ -1997,7 +1998,6 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
19971998
- If False, never show counts.
19981999
19992000
"""
2000-
from pandas.io.formats.format import _put_lines
20012001

20022002
if buf is None: # pragma: no cover
20032003
buf = sys.stdout
@@ -2009,7 +2009,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
20092009

20102010
if len(self.columns) == 0:
20112011
lines.append('Empty %s' % type(self).__name__)
2012-
_put_lines(buf, lines)
2012+
fmt.buffer_put_lines(buf, lines)
20132013
return
20142014

20152015
cols = self.columns
@@ -2096,7 +2096,7 @@ def _sizeof_fmt(num, size_qualifier):
20962096
mem_usage = self.memory_usage(index=True, deep=deep).sum()
20972097
lines.append("memory usage: %s\n" %
20982098
_sizeof_fmt(mem_usage, size_qualifier))
2099-
_put_lines(buf, lines)
2099+
fmt.buffer_put_lines(buf, lines)
21002100

21012101
def memory_usage(self, index=True, deep=False):
21022102
"""Memory usage of DataFrame columns.

pandas/io/formats/common.py

-44
This file was deleted.

pandas/io/formats/csvs.py

+280
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Module for formatting output data into CSV files.
4+
"""
5+
6+
from __future__ import print_function
7+
8+
import csv as csvlib
9+
import numpy as np
10+
11+
from pandas.core.dtypes.missing import notna
12+
from pandas.core.index import Index, MultiIndex
13+
from pandas import compat
14+
from pandas.compat import (StringIO, range, zip)
15+
16+
from pandas.io.common import (_get_handle, UnicodeWriter, _expand_user,
17+
_stringify_path)
18+
from pandas._libs import writers as libwriters
19+
from pandas.core.indexes.datetimes import DatetimeIndex
20+
from pandas.core.indexes.period import PeriodIndex
21+
22+
23+
class CSVFormatter(object):
    """
    Write a DataFrame-like object to a CSV file or buffer in row chunks.

    The constructor normalises the writing options and stores them on the
    instance; ``save`` then creates the csv writer and emits the header
    followed by the data rows, ``chunksize`` rows at a time.

    Parameters
    ----------
    obj : object exposing ``columns``, ``index``, ``loc`` and
        ``_data.blocks`` (a DataFrame in ``DataFrame.to_csv``)
    path_or_buf : path or object with a ``write`` method, optional
        Destination. A new ``StringIO`` is used when None.
    sep, quoting, quotechar, doublequote, escapechar, line_terminator
        Forwarded to the csv writer as ``delimiter``, ``quoting``,
        ``quotechar``, ``doublequote``, ``escapechar`` and
        ``lineterminator``. ``quoting`` defaults to ``csv.QUOTE_MINIMAL``;
        ``quotechar`` is forced to None under ``csv.QUOTE_NONE``.
    na_rep, float_format, decimal, date_format
        Forwarded to the ``to_native_types`` calls that render labels,
        index values and block values.
    cols : sequence or Index, optional
        Columns to write; ``obj`` is restricted to them with ``.loc``.
    header : bool or list-like
        Falsy to skip the header row; a tuple/list/ndarray/Index is used
        as column aliases and must be as long as the written columns.
    index : bool
        Whether index values (and index labels) are written.
    index_label : label, list-like or False, optional
        Header label(s) for the index column(s). None derives them from
        the index name(s); False writes no index label.
    mode, encoding, compression
        Passed to ``_get_handle`` when ``path_or_buf`` has to be opened.
    nanRep : optional
        Accepted for signature compatibility; not used by this class.
    chunksize : int, optional
        Rows written per chunk. Defaults to ``100000 // number of
        columns`` (at least 1).
    tupleize_cols : bool
        When True, MultiIndex columns are not written as one header row
        per level.

    Raises
    ------
    TypeError
        If ``cols`` is given while the columns are a MultiIndex that is
        written level by level.
    ValueError
        If ``chunksize`` is smaller than 1 after integer conversion.
    """

    def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
                 float_format=None, cols=None, header=True, index=True,
                 index_label=None, mode='w', nanRep=None, encoding=None,
                 compression=None, quoting=None, line_terminator='\n',
                 chunksize=None, tupleize_cols=False, quotechar='"',
                 date_format=None, doublequote=True, escapechar=None,
                 decimal='.'):

        self.obj = obj

        if path_or_buf is None:
            path_or_buf = StringIO()

        self.path_or_buf = _expand_user(_stringify_path(path_or_buf))
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format
        self.decimal = decimal

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        self.encoding = encoding
        self.compression = compression

        if quoting is None:
            quoting = csvlib.QUOTE_MINIMAL
        self.quoting = quoting

        if quoting == csvlib.QUOTE_NONE:
            # prevents crash in _csv
            quotechar = None
        self.quotechar = quotechar

        self.doublequote = doublequote
        self.escapechar = escapechar

        self.line_terminator = line_terminator

        self.date_format = date_format

        self.tupleize_cols = tupleize_cols
        self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
                               not self.tupleize_cols)

        # validate mi options
        if self.has_mi_columns and cols is not None:
            raise TypeError("cannot specify cols with a MultiIndex on the "
                            "columns")

        if cols is not None:
            self.obj = self.obj.loc[:, self._native_labels(cols)]

        # update columns to include possible multiplicity of dupes
        # and make sure cols is just a list of labels
        self.cols = self._native_labels(self.obj.columns)

        # preallocate data 2d list: one slot per column across all blocks
        self.blocks = self.obj._data.blocks
        ncols = sum(b.shape[0] for b in self.blocks)
        self.data = [None] * ncols

        self.chunksize = self._resolve_chunksize(chunksize, len(self.cols))

        self.data_index = obj.index
        if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and
                date_format is not None):
            # pre-render date-like index values; missing values become ''
            self.data_index = Index([x.strftime(date_format) if notna(x) else
                                     '' for x in self.data_index])

        # number of index columns written in front of every data row
        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0

    @staticmethod
    def _resolve_chunksize(chunksize, ncols):
        """
        Return the number of rows to write per chunk as a positive int.

        Parameters
        ----------
        chunksize : number or None
            Requested chunk size; None selects ``100000 // ncols`` with a
            floor of 1.
        ncols : int
            Number of columns being written.

        Raises
        ------
        ValueError
            If the integer-converted chunk size is smaller than 1. A zero
            size would divide by zero in ``_save`` and a negative one would
            silently write no data rows.
        """
        requested = chunksize
        if chunksize is None:
            chunksize = (100000 // (ncols or 1)) or 1
        chunksize = int(chunksize)
        if chunksize < 1:
            raise ValueError('chunksize must be a positive integer, got '
                             '{size!r}'.format(size=requested))
        return chunksize

    def _native_labels(self, labels):
        """
        Return column labels in the form used for writing.

        An ``Index`` is converted with ``to_native_types`` using the stored
        formatting options; any other iterable is copied into a list.
        """
        if isinstance(labels, Index):
            return labels.to_native_types(na_rep=self.na_rep,
                                          float_format=self.float_format,
                                          date_format=self.date_format,
                                          quoting=self.quoting)
        return list(labels)

    def save(self):
        """
        Create the csv writer and write the header and all data rows.

        When ``self.encoding`` is None the encoding is 'ascii' on Python 2
        and 'utf-8' otherwise. A destination with a ``write`` attribute is
        used as-is and left open; anything else is opened through
        ``_get_handle`` and closed again when writing finishes or fails.
        """
        if self.encoding is None:
            encoding = 'ascii' if compat.PY2 else 'utf-8'
        else:
            encoding = self.encoding

        handles = []
        if hasattr(self.path_or_buf, 'write'):
            f = self.path_or_buf
            close = False
        else:
            f, handles = _get_handle(self.path_or_buf, self.mode,
                                     encoding=encoding,
                                     compression=self.compression)
            close = True

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            if close:
                f.close()
                # NOTE(review): `handles` presumably lists every file object
                # _get_handle opened (possibly including `f` or objects it
                # wraps) -- confirm. Closing an already-closed file object
                # is expected to be a no-op.
                for handle in handles:
                    handle.close()

    def _save_header(self):
        """
        Write the header row(s) to ``self.writer``.

        Nothing is written when ``self.header`` is falsy and not a
        list-like of aliases. For MultiIndex columns written level by
        level, one row per column level is emitted, followed by the index
        label row unless that row would be blank.

        Raises
        ------
        ValueError
            If header aliases are given and their count differs from the
            number of columns being written.
        """
        writer = self.writer
        obj = self.obj
        index_label = self.index_label
        cols = self.cols
        header = self.header
        encoded_labels = []

        has_aliases = isinstance(header, (tuple, list, np.ndarray, Index))
        if not (has_aliases or self.header):
            return

        if has_aliases:
            if len(header) != len(cols):
                raise ValueError(('Writing {ncols} cols but got {nalias} '
                                  'aliases'.format(ncols=len(cols),
                                                   nalias=len(header))))
            write_cols = header
        else:
            write_cols = cols

        # should write something for index label
        if self.index and index_label is not False:
            if index_label is None:
                if isinstance(obj.index, MultiIndex):
                    # unnamed levels get an empty label
                    index_label = ['' if name is None else name
                                   for name in obj.index.names]
                elif obj.index.name is None:
                    index_label = ['']
                else:
                    index_label = [obj.index.name]
            elif not isinstance(index_label,
                                (list, tuple, np.ndarray, Index)):
                # given a string for a DF with Index
                index_label = [index_label]

            encoded_labels = list(index_label)

        if not self.has_mi_columns or has_aliases:
            encoded_labels += list(write_cols)
            writer.writerow(encoded_labels)
            return

        # write out the mi
        columns = obj.columns

        # write out the names for each level, then ALL of the values for
        # each level
        for level in range(columns.nlevels):

            # we need at least 1 index column to write our col names
            col_line = []
            if self.index:

                # name is the first column
                col_line.append(columns.names[level])

                # pad so the values line up past any extra index columns
                if isinstance(index_label, list) and len(index_label) > 1:
                    col_line.extend([''] * (len(index_label) - 1))

            col_line.extend(columns._get_level_values(level))

            writer.writerow(col_line)

        # Write out the index line if it's not empty.
        # Otherwise, we will print out an extraneous
        # blank line between the mi and the data rows.
        if encoded_labels and set(encoded_labels) != {''}:
            encoded_labels.extend([''] * len(columns))
            writer.writerow(encoded_labels)

    def _save(self):
        """
        Write the header, then the data rows in chunks of ``chunksize``.
        """
        self._save_header()

        nrows = len(self.data_index)

        # write in chunksize bites
        chunksize = self.chunksize
        # floor division keeps the count an exact integer; the extra chunk
        # covers a trailing partial chunk and is skipped below when empty
        chunks = nrows // chunksize + 1

        for i in range(chunks):
            start_i = i * chunksize
            end_i = min(start_i + chunksize, nrows)
            if start_i >= end_i:
                break

            self._save_chunk(start_i, end_i)

    def _save_chunk(self, start_i, end_i):
        """
        Write rows ``start_i`` (inclusive) to ``end_i`` (exclusive).

        Each block's values for the row slice are obtained through its
        ``to_native_types`` and stored in ``self.data`` at the positions
        given by the block's ``mgr_locs``; the index slice is obtained the
        same way and everything is handed to
        ``libwriters.write_csv_rows``.
        """
        # create the data for a chunk
        slicer = slice(start_i, end_i)
        for block in self.blocks:
            values = block.to_native_types(slicer=slicer,
                                           na_rep=self.na_rep,
                                           float_format=self.float_format,
                                           decimal=self.decimal,
                                           date_format=self.date_format,
                                           quoting=self.quoting)

            for col_loc, col in zip(block.mgr_locs, values):
                # self.data is a preallocated list
                self.data[col_loc] = col

        ix = self.data_index.to_native_types(slicer=slicer,
                                             na_rep=self.na_rep,
                                             float_format=self.float_format,
                                             decimal=self.decimal,
                                             date_format=self.date_format,
                                             quoting=self.quoting)

        libwriters.write_csv_rows(self.data, ix, self.nlevels,
                                  self.cols, self.writer)

0 commit comments

Comments
 (0)