Commit 98996b9

dnlcesilva and Fabio Beranizo authored

Correcting file upload. (#64)

* Correcting file upload.
* Fix tests.
* Changing the read_into_dataframe function in datasets.py.
* Changing the create_dataset function in datasets.py.
* Replaces file.seek with pdread.seek.
* Removes unnecessary file.read(), preventing memory from filling up.
* Replaces BytesIO with a SpooledTemporaryFile: a better option to avoid filling the memory.

Co-authored-by: Fabio Beranizo <[email protected]>
1 parent 7196c01 commit 98996b9
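The heart of this commit is swapping in-memory BytesIO buffers for tempfile.SpooledTemporaryFile. As a quick orientation, here is a minimal standalone sketch (not from this repo) of the difference: a spooled file buffers in RAM only up to max_size, then transparently rolls over to a temporary file on disk, so large uploads no longer pin their full contents in memory.

# Sketch: BytesIO vs SpooledTemporaryFile (illustrative, not repo code).
from io import BytesIO
from tempfile import SpooledTemporaryFile

payload = b"x" * (2 * 1024 * 1024)  # 2MB of fake upload data

buf = BytesIO(payload)  # always holds the full payload in RAM

spooled = SpooledTemporaryFile(max_size=1024 * 1024)  # 1MB, like SPOOLED_MAX_SIZE below
spooled.write(payload)  # exceeding max_size triggers rollover to a disk file
print(spooled._rolled)  # True (private attribute, shown only for illustration)
spooled.seek(0)  # the spooled file still behaves like a normal file object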

File tree

3 files changed: +80 −51 lines changed

datasets/datasets.py
datasets/monkeypatch.py
tests/test_api.py

datasets/datasets.py

Lines changed: 40 additions & 31 deletions
@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 import json
-from io import BytesIO
+from io import TextIOWrapper
 from os import SEEK_SET
 from os.path import splitext
+from tempfile import SpooledTemporaryFile
 from unicodedata import normalize
 from uuid import uuid4

@@ -16,11 +17,14 @@
 from pandas.io.common import infer_compression
 from platiagro import load_dataset, save_dataset, stat_dataset, update_dataset_metadata
 from platiagro.featuretypes import infer_featuretypes, validate_featuretypes
+
+from datasets import monkeypatch  # noqa: F401
 from datasets.exceptions import BadRequest, NotFound

 from datasets.utils import data_pagination

 NOT_FOUND = NotFound("The specified dataset does not exist")
+SPOOLED_MAX_SIZE = 1024 * 1024  # 1MB


 def list_datasets():
def list_datasets():
@@ -83,21 +87,24 @@ def create_dataset(file_object):
     featuretypes = infer_featuretypes(df)

     metadata = {
+        "columns": columns,
         "featuretypes": featuretypes,
         "original-filename": filename,
+        "total": len(df.index),
     }

+    file.seek(0, SEEK_SET)
     # uses PlatIAgro SDK to save the dataset
-    save_dataset(name, df, metadata=metadata)
+    save_dataset(name, file, metadata=metadata)

     columns = [{"name": col, "featuretype": ftype} for col, ftype in zip(columns, featuretypes)]
-    content = load_dataset(name=name)
+
     # Replaces NaN value by a text "NaN" so JSON encode doesn't fail
-    content.replace(np.nan, "NaN", inplace=True, regex=True)
-    content.replace(np.inf, "Inf", inplace=True, regex=True)
-    content.replace(-np.inf, "-Inf", inplace=True, regex=True)
-    data = content.values.tolist()
-    return {"name": name, "columns": columns, "data": data, "total": len(content.index), "filename": filename}
+    df.replace(np.nan, "NaN", inplace=True, regex=True)
+    df.replace(np.inf, "Inf", inplace=True, regex=True)
+    df.replace(-np.inf, "-Inf", inplace=True, regex=True)
+    data = df.values.tolist()
+    return {"name": name, "columns": columns, "data": data, "total": len(df.index), "filename": filename}


 def create_google_drive_dataset(gfile):
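The ordering in create_dataset matters: pandas has already consumed the upload stream to build df, so the handler must seek back to the start before save_dataset can persist the raw bytes. A generic sketch of the read-then-rewind pattern (names are illustrative):

# Sketch: reuse an upload stream after pandas has consumed it (illustrative).
from io import BytesIO
from os import SEEK_SET

import pandas as pd

file = BytesIO(b"a,b\n1,2\n3,4\n")  # stand-in for the uploaded file object
df = pd.read_csv(file)  # parsing advances the stream to EOF

file.seek(0, SEEK_SET)  # rewind so the raw bytes can be read again
raw = file.read()  # e.g. what save_dataset persists
assert raw.startswith(b"a,b")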
@@ -148,7 +155,7 @@ def create_google_drive_dataset(gfile):
     else:
         request = service.files().get_media(fileId=file_id)

-    fh = BytesIO()
+    fh = SpooledTemporaryFile(max_size=SPOOLED_MAX_SIZE)
     downloader = MediaIoBaseDownload(fh, request)
     done = False
     try:
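For context, the surrounding googleapiclient download loop typically looks like the sketch below; service, file_id, and request are assumed to be built earlier in the function, and the only change in this hunk is the buffer type.

# Sketch of the download loop around this change (assumes `request` exists).
from tempfile import SpooledTemporaryFile
from googleapiclient.http import MediaIoBaseDownload

fh = SpooledTemporaryFile(max_size=1024 * 1024)  # spills to disk past 1MB
downloader = MediaIoBaseDownload(fh, request)  # request = service.files().get_media(fileId=file_id)
done = False
while not done:
    status, done = downloader.next_chunk()  # next_chunk() returns (progress, done)
fh.seek(0)  # rewind before parsing the downloaded bytes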
@@ -291,6 +298,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     -----
     If no filename is given, a hex uuid will be used as the file name.
     """
+
     detector = UniversalDetector()
     for line, text in enumerate(file):
         detector.feed(text)
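The loop above feeds the upload line by line into chardet's UniversalDetector. A standalone sketch of that detection pattern (the input file name is an assumption):

# Sketch: incremental encoding detection with chardet (illustrative).
from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
with open("example.csv", "rb") as file:  # hypothetical input file
    for text in file:
        detector.feed(text)
        if detector.done:  # stop early once chardet is confident
            break
detector.close()
encoding = detector.result["encoding"]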
@@ -305,23 +313,23 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     compression = infer_compression(filename, "infer")

     file.seek(0, SEEK_SET)
-    contents = file.read()
-
-    with BytesIO(contents) as file:
-        df0 = pd.read_csv(
-            file,
-            encoding=encoding,
-            compression=compression,
-            sep=None,
-            engine="python",
-            header="infer",
-            nrows=nrows,
-        )
+
+    pdread = TextIOWrapper(file, encoding=encoding)
+    df0 = pd.read_csv(
+        pdread,
+        encoding=encoding,
+        compression=compression,
+        sep=None,
+        engine="python",
+        header="infer",
+        nrows=nrows,
+    )

     df0_cols = list(df0.columns)

     # Check if all columns are strings and short strings(text values tend to be long)
     column_names_checker = all([type(item) == str for item in df0_cols])
+
     if column_names_checker:
         column_names_checker = all([len(item) < max_characters for item in df0_cols])

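The removed code read the entire upload into memory (contents = file.read()) and parsed a BytesIO copy; the new code wraps the binary stream in a TextIOWrapper once and lets pandas decode it incrementally. A minimal standalone sketch of that pattern (the sample file name is an assumption):

# Sketch: stream a CSV into pandas without an in-memory copy (illustrative).
from io import TextIOWrapper

import pandas as pd

with open("example.csv", "rb") as binary_file:  # hypothetical input file
    # TextIOWrapper decodes lazily instead of materializing file.read()
    pdread = TextIOWrapper(binary_file, encoding="utf-8")
    preview = pd.read_csv(
        pdread,
        sep=None,  # the python engine sniffs the delimiter
        engine="python",
        nrows=100,  # only the preview rows are decoded
    )
print(preview.head())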
@@ -340,16 +348,17 @@
     header = "infer" if final_checker else None
     prefix = None if header else "col"

-    with BytesIO(contents) as file:
-        df = pd.read_csv(
-            file,
-            encoding=encoding,
-            compression=compression,
-            sep=None,
-            engine="python",
-            header=header,
-            prefix=prefix,
-        )
+    pdread.seek(0, SEEK_SET)
+    df = pd.read_csv(
+        pdread,
+        encoding=encoding,
+        compression=compression,
+        sep=None,
+        engine="python",
+        header=header,
+        nrows=nrows,
+        prefix=prefix,
+    )
     return df

datasets/monkeypatch.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+"""
+Monkey-patched methods for the SpooledTemporaryFile class.
+This is needed because SpooledTemporaryFile does not inherit/implement the IOBase class.
+"""
+from tempfile import SpooledTemporaryFile
+
+
+def _readable(self):
+    return self._file.readable()
+
+
+def _writable(self):
+    return self._file.writable()
+
+
+def _seekable(self):
+    return self._file.seekable()
+
+
+SpooledTemporaryFile.readable = _readable
+SpooledTemporaryFile.writable = _writable
+SpooledTemporaryFile.seekable = _seekable
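A short sketch of what this patch enables: io.TextIOWrapper probes its buffer with readable()/writable()/seekable(), and on Python versions where SpooledTemporaryFile does not provide them (later releases implement them natively), wrapping one raises AttributeError. Illustrative only:

# Sketch: TextIOWrapper over a SpooledTemporaryFile (illustrative).
from io import TextIOWrapper
from tempfile import SpooledTemporaryFile

import datasets.monkeypatch  # noqa: F401  # installs readable/writable/seekable

spooled = SpooledTemporaryFile(max_size=1024 * 1024)
spooled.write(b"col1,col2\n1,2\n")
spooled.seek(0)

# Without the patch, affected Python versions raise:
# AttributeError: 'SpooledTemporaryFile' object has no attribute 'readable'
text = TextIOWrapper(spooled, encoding="utf-8")
print(text.read())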

tests/test_api.py

Lines changed: 17 additions & 20 deletions
@@ -173,12 +173,11 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
                      ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
                      ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 3
         }

         self.assertIn("name", result)
@@ -198,10 +197,10 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 3
         }
         del result["name"]
         self.assertDictEqual(expected, result)
@@ -218,19 +217,19 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
+                     ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 3
         }
         del result["name"]
         self.assertDictEqual(expected, result)
         self.assertEqual(rv.status_code, 200)

         rv = TEST_CLIENT.get("/datasets/iris.data?page=15&page_size=2")
         result = rv.json()
-        expected = {"message": "The specified page does not exist"}
+        expected = {'message': 'The specified page does not exist'}
         self.assertDictEqual(expected, result)
         self.assertEqual(rv.status_code, 404)

@@ -262,12 +261,11 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
                      ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
                      ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 3
         }
         # name is machine-generated
         # we assert it exists, but we don't check its value
@@ -287,10 +285,10 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 3
         }
         # name is machine-generated
         # we assert it exists, but we don't check its value
@@ -434,13 +432,12 @@ def test_patch_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
                      ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
                      ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
             "name": name,
-            "total": 4
+            "total": 3
         }
         self.assertDictEqual(expected, result)
         self.assertEqual(rv.status_code, 200)
