Skip to content

Commit 1b3e9d9

Browse files
authored
FEAT-#2451: Read multiple csv files simultaneously via glob paths (#2662)
Signed-off-by: William Ma <[email protected]>
1 parent 9495ff7 commit 1b3e9d9

File tree

10 files changed

+732
-3
lines changed

10 files changed

+732
-3
lines changed

modin/backends/pandas/parsers.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,52 @@ def parse(fname, **kwargs):
124124
]
125125

126126

127+
class PandasCSVGlobParser(PandasCSVParser):
    """Parser that reads byte-range chunks of one or more CSV files into one frame."""

    @staticmethod
    def parse(chunks, **kwargs):
        """
        Read every chunk with ``pandas.read_csv`` and combine the results.

        Parameters
        ----------
        chunks : iterable of (fname, start, end)
            ``fname`` is the file to open; ``start`` and ``end`` are the byte
            offsets delimiting the part of the file to parse. If either
            offset is ``None`` the whole file is read instead.
        **kwargs : dict
            Keyword arguments forwarded to ``pandas.read_csv``.
            ``num_splits`` is removed and handed to
            ``_split_result_for_readers``; ``compression`` is used only to
            open the file and is not forwarded when parsing in-memory bytes.

        Returns
        -------
        list or pandas.DataFrame
            The output of ``_split_result_for_readers`` followed by the index
            (or the row count when ``index_col`` is ``None``) and the dtypes.
            When a chunk has no byte range, the plain ``pandas.read_csv``
            result for that file is returned directly.
        """
        warnings.filterwarnings("ignore")
        num_splits = kwargs.pop("num_splits", None)
        index_col = kwargs.get("index_col", None)

        # Resolve ``compression`` once for all chunks. Popping it inside the
        # loop would apply the caller's value to the first chunk only and
        # silently fall back to "infer" for every following one.
        compression = kwargs.get("compression", "infer")
        # The bytes handed to pandas below are already uncompressed, so the
        # in-memory parse must not receive ``compression``.
        chunk_kwargs = {
            key: value for key, value in kwargs.items() if key != "compression"
        }
        # NOTE(review): the file's first line is prepended only when an
        # explicit encoding is given -- confirm the intent with the callers.
        prepend_first_line = kwargs.get("encoding", None) is not None

        pandas_dfs = []
        for fname, start, end in chunks:
            if start is None or end is None:
                # This only happens when we are reading with only one worker (Default)
                return pandas.read_csv(fname, **kwargs)

            bio = FileDispatcher.file_open(fname, "rb", compression)
            try:
                header = bio.readline() if prepend_first_line else b""
                bio.seek(start)
                to_read = header + bio.read(end - start)
            finally:
                # Release the handle even if reading the chunk fails.
                bio.close()
            pandas_dfs.append(pandas.read_csv(BytesIO(to_read), **chunk_kwargs))

        # Combine read in data.
        if len(pandas_dfs) > 1:
            pandas_df = pandas.concat(pandas_dfs)
        elif pandas_dfs:
            pandas_df = pandas_dfs[0]
        else:
            pandas_df = pandas.DataFrame()

        # Set internal index.
        if index_col is not None:
            index = pandas_df.index
        else:
            # The lengths will become the RangeIndex
            index = len(pandas_df)
        return _split_result_for_readers(1, num_splits, pandas_df) + [
            index,
            pandas_df.dtypes,
        ]
171+
172+
127173
class PandasFWFParser(PandasParser):
128174
@staticmethod
129175
def parse(fname, **kwargs):

modin/data_management/factories/dispatcher.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ def read_parquet(cls, **kwargs):
103103
def read_csv(cls, **kwargs):
104104
return cls.__engine._read_csv(**kwargs)
105105

106+
@classmethod
def read_csv_glob(cls, **kwargs):
    """Forward all keyword arguments to the engine's ``_read_csv_glob`` and return its result."""
    engine = cls.__engine
    return engine._read_csv_glob(**kwargs)
109+
106110
@classmethod
def read_json(cls, **kwargs):
    """Forward all keyword arguments to the engine's ``_read_json`` and return its result."""
    engine = cls.__engine
    return engine._read_json(**kwargs)

modin/data_management/factories/factories.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,10 @@ def prepare(cls):
231231

232232
cls.io_cls = ExperimentalPandasOnRayIO
233233

234+
@classmethod
def _read_csv_glob(cls, **kwargs):
    """Forward all keyword arguments to ``io_cls.read_csv_glob`` and return its result."""
    io_backend = cls.io_cls
    return io_backend.read_csv_glob(**kwargs)
237+
234238

235239
class ExperimentalPandasOnPythonFactory(ExperimentalBaseFactory, PandasOnPythonFactory):
    # Combines ExperimentalBaseFactory with PandasOnPythonFactory; defines no
    # members of its own.
    pass

modin/engines/base/io/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from modin.engines.base.io.io import BaseIO
1515
from modin.engines.base.io.text.csv_dispatcher import CSVDispatcher
16+
from modin.engines.base.io.text.csv_glob_dispatcher import CSVGlobDispatcher
1617
from modin.engines.base.io.text.fwf_dispatcher import FWFDispatcher
1718
from modin.engines.base.io.text.json_dispatcher import JSONDispatcher
1819
from modin.engines.base.io.text.excel_dispatcher import ExcelDispatcher
@@ -26,6 +27,7 @@
2627
__all__ = [
2728
"BaseIO",
2829
"CSVDispatcher",
30+
"CSVGlobDispatcher",
2931
"FWFDispatcher",
3032
"JSONDispatcher",
3133
"FileDispatcher",

0 commit comments

Comments
 (0)