Skip to content

Commit d9dbf99

Browse files
authored
Allow defining the CultureInfo used for parsing values when reading a DataFrame from csv (#6782)
* Use CultureInfo for parsing values in csv file * Fix merge issues
1 parent ccf34e3 commit d9dbf99

File tree

3 files changed

+71
-13
lines changed

3 files changed

+71
-13
lines changed

src/Microsoft.Data.Analysis/DataFrame.IO.cs

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -98,18 +98,21 @@ private static Type MaxKind(Type a, Type b)
9898
/// <param name="guessRows">number of rows used to guess types</param>
9999
/// <param name="addIndexColumn">add one column with the row index</param>
100100
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
101+
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
102+
/// <param name="cultureInfo">The culture used to parse values read from the csv file. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>
101103
/// <returns>DataFrame</returns>
102104
public static DataFrame LoadCsv(string filename,
103105
char separator = ',', bool header = true,
104106
string[] columnNames = null, Type[] dataTypes = null,
105107
int numRows = -1, int guessRows = 10,
106-
bool addIndexColumn = false, Encoding encoding = null)
108+
bool addIndexColumn = false, Encoding encoding = null,
109+
bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
107110
{
108111
using (Stream fileStream = new FileStream(filename, FileMode.Open))
109112
{
110113
return LoadCsv(fileStream,
111114
separator: separator, header: header, columnNames: columnNames, dataTypes: dataTypes, numberOfRowsToRead: numRows,
112-
guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding);
115+
guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding, renameDuplicatedColumns: renameDuplicatedColumns, cultureInfo: cultureInfo);
113116
}
114117
}
115118

@@ -351,8 +354,14 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe
351354
char separator = ',', bool header = true,
352355
string[] columnNames = null, Type[] dataTypes = null,
353356
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
354-
bool renameDuplicatedColumns = false)
357+
bool renameDuplicatedColumns = false,
358+
CultureInfo cultureInfo = null)
355359
{
360+
if (cultureInfo == null)
361+
{
362+
cultureInfo = CultureInfo.CurrentCulture;
363+
}
364+
356365
if (dataTypes == null && guessRows <= 0)
357366
{
358367
throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
@@ -452,7 +461,7 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe
452461
}
453462
else
454463
{
455-
ret.Append(fields, inPlace: true);
464+
ret.Append(fields, inPlace: true, cultureInfo: cultureInfo);
456465
}
457466
++rowline;
458467
}
@@ -508,7 +517,6 @@ public TextReader GetTextReader()
508517
}
509518

510519
}
511-
512520
}
513521

514522
/// <summary>
@@ -522,14 +530,18 @@ public TextReader GetTextReader()
522530
/// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
523531
/// <param name="guessRows">number of rows used to guess types</param>
524532
/// <param name="addIndexColumn">add one column with the row index</param>
533+
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
534+
/// <param name="cultureInfo">The culture used to parse values read from the csv string. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>
525535
/// <returns><see cref="DataFrame"/></returns>
526536
public static DataFrame LoadCsvFromString(string csvString,
527537
char separator = ',', bool header = true,
528538
string[] columnNames = null, Type[] dataTypes = null,
529-
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
539+
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
540+
bool renameDuplicatedColumns = false,
541+
CultureInfo cultureInfo = null)
530542
{
531543
WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString);
532-
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn);
544+
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo);
533545
}
534546

535547
/// <summary>
@@ -545,12 +557,13 @@ public static DataFrame LoadCsvFromString(string csvString,
545557
/// <param name="addIndexColumn">add one column with the row index</param>
546558
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
547559
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
560+
/// <param name="cultureInfo">The culture used to parse values read from the csv stream. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>
548561
/// <returns><see cref="DataFrame"/></returns>
549562
public static DataFrame LoadCsv(Stream csvStream,
550563
char separator = ',', bool header = true,
551564
string[] columnNames = null, Type[] dataTypes = null,
552565
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
553-
Encoding encoding = null, bool renameDuplicatedColumns = false)
566+
Encoding encoding = null, bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
554567
{
555568
if (!csvStream.CanSeek)
556569
{
@@ -563,7 +576,7 @@ public static DataFrame LoadCsv(Stream csvStream,
563576
}
564577

565578
WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8);
566-
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns);
579+
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo);
567580
}
568581

569582
/// <summary>

src/Microsoft.Data.Analysis/DataFrame.cs

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System;
66
using System.Collections.Generic;
77
using System.Diagnostics;
8+
using System.Globalization;
89
using System.Linq;
910
using System.Text;
1011

@@ -484,12 +485,13 @@ private void ResizeByOneAndAppend(DataFrameColumn column, object value)
484485
/// <remarks>If a <seealso cref="DataFrameRow"/> in <paramref name="rows"/> is null, a null value is appended to each column</remarks>
485486
/// <param name="rows">The rows to be appended to this DataFrame </param>
486487
/// <param name="inPlace">If set, appends <paramref name="rows"/> in place. Otherwise, a new DataFrame is returned with the <paramref name="rows"/> appended</param>
487-
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false)
488+
/// <param name="cultureInfo">The culture passed along to the per-row append, where it is used for converting the appended values to the column data types</param>
489+
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false, CultureInfo cultureInfo = null)
488490
{
489491
DataFrame ret = inPlace ? this : Clone();
490492
foreach (DataFrameRow row in rows)
491493
{
492-
ret.Append(row, inPlace: true);
494+
ret.Append(row, inPlace: true, cultureInfo: cultureInfo);
493495
}
494496
return ret;
495497
}
@@ -501,8 +503,14 @@ public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false)
501503
/// <remarks>If <paramref name="row"/> is null, a null value is appended to each column</remarks>
502504
/// <param name="row">The row of values to be appended to this DataFrame</param>
503505
/// <param name="inPlace">If set, appends a <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
504-
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
506+
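/// <param name="cultureInfo">The culture used when converting each value to its column's data type. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>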
507+
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false, CultureInfo cultureInfo = null)
505508
{
509+
if (cultureInfo == null)
510+
{
511+
cultureInfo = CultureInfo.CurrentCulture;
512+
}
513+
506514
DataFrame ret = inPlace ? this : Clone();
507515
IEnumerator<DataFrameColumn> columnEnumerator = ret.Columns.GetEnumerator();
508516
bool columnMoveNext = columnEnumerator.MoveNext();
@@ -530,7 +538,7 @@ public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
530538
}
531539
if (value != null)
532540
{
533-
value = Convert.ChangeType(value, column.DataType);
541+
value = Convert.ChangeType(value, column.DataType, cultureInfo);
534542

535543
if (value is null)
536544
{

test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
using System.Data.SQLite.EF6;
1515
using Xunit;
1616
using Microsoft.ML.TestFramework.Attributes;
17+
using System.Threading;
1718

1819
namespace Microsoft.Data.Analysis.Tests
1920
{
@@ -154,6 +155,42 @@ void ReducedRowsTest(DataFrame reducedRows)
154155
ReducedRowsTest(csvDf);
155156
}
156157

158+
[Fact]
159+
public void TestReadCsvWithHeaderCultureInfoAndSeparator()
160+
{
161+
string data = @$"vendor_id;rate_code;passenger_count;trip_time_in_secs;trip_distance;payment_type;fare_amount
162+
CMT;1;1;1271;3,8;CRD;17,5
163+
CMT;1;1;474;1,5;CRD;8
164+
CMT;1;1;637;1,4;CRD;8,5
165+
CMT;1;1;181;0,6;CSH;4,5";
166+
167+
void RegularTest(DataFrame df)
168+
{
169+
Assert.Equal(4, df.Rows.Count);
170+
Assert.Equal(7, df.Columns.Count);
171+
172+
Assert.Equal(3.8f, (float)df["trip_distance"][0]);
173+
Assert.Equal(17.5f, (float)df["fare_amount"][0]);
174+
175+
Assert.Equal(1.5f, (float)df["trip_distance"][1]);
176+
Assert.Equal(8f, (float)df["fare_amount"][1]);
177+
178+
Assert.Equal(1.4f, (float)df["trip_distance"][2]);
179+
Assert.Equal(8.5f, (float)df["fare_amount"][2]);
180+
181+
VerifyColumnTypes(df);
182+
}
183+
184+
// de-DE has ',' as decimal separator
185+
var cultureInfo = new CultureInfo("de-DE");
186+
DataFrame df = DataFrame.LoadCsv(GetStream(data), separator: ';', cultureInfo: cultureInfo);
187+
188+
RegularTest(df);
189+
190+
DataFrame csvDf = DataFrame.LoadCsvFromString(data, separator: ';', cultureInfo: cultureInfo);
191+
RegularTest(csvDf);
192+
}
193+
157194
[Fact]
158195
public void TestReadCsvWithHeaderAndDuplicatedColumns_WithoutRenaming()
159196
{

0 commit comments

Comments
 (0)