Merged · Changes from 12 commits
`dlt/common/data_writers/writers.py` (3 additions, 0 deletions)
@@ -414,12 +414,14 @@ def __init__(
         delimiter: str = ",",
         include_header: bool = True,
         quoting: CsvQuoting = "quote_needed",
+        lineterminator: str = "\n",
         bytes_encoding: str = "utf-8",
     ) -> None:
         super().__init__(f, caps)
         self.include_header = include_header
         self.delimiter = delimiter
         self.quoting: CsvQuoting = quoting
+        self.lineterminator = lineterminator
         self.writer: csv.DictWriter[str] = None
         self.bytes_encoding = bytes_encoding
@@ -443,6 +445,7 @@ def write_header(self, columns_schema: TTableSchemaColumns) -> None:
             dialect=csv.unix_dialect,
             delimiter=self.delimiter,
             quoting=quoting,
+            lineterminator=self.lineterminator,
         )
         if self.include_header:
             self.writer.writeheader()
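For reference, the new option is passed straight through to the stdlib `csv.DictWriter`. A minimal standalone sketch (plain Python, not dlt code) of the behavior wired up above: an explicit `lineterminator` overrides the `unix_dialect` default of `"\n"` and is emitted after every row, including the header.

```python
import csv
import io

# Standalone sketch: unix_dialect quotes all fields and ends rows with "\n";
# an explicit lineterminator argument overrides the dialect's default.
buf = io.StringIO()
writer = csv.DictWriter(
    buf,
    fieldnames=["col1"],
    dialect=csv.unix_dialect,
    lineterminator="\r\n",
)
writer.writeheader()
writer.writerow({"col1": "value1"})
print(repr(buf.getvalue()))  # '"col1"\r\n"value1"\r\n'
```

Keeping `csv.unix_dialect` as the base means quoting and escaping stay unchanged; only the row terminator varies.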
`dlt/common/destination/configuration.py` (1 addition, 0 deletions)
@@ -10,6 +10,7 @@ class CsvFormatConfiguration(BaseConfiguration):
     delimiter: str = ","
     include_header: bool = True
     quoting: CsvQuoting = "quote_needed"
+    lineterminator: str = "\n"

     # read options
     on_error_continue: bool = False
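The same options can also be set programmatically on the configuration object. A hedged sketch, assuming the import path matches this PR's file layout (it may differ in other dlt versions):

```python
# Sketch only: the import path is assumed from this PR's file layout.
from dlt.common.destination.configuration import CsvFormatConfiguration

csv_format = CsvFormatConfiguration(
    delimiter="|",
    include_header=False,
    quoting="quote_all",
    lineterminator="\r\n",  # the field added by this PR
)
```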
`docs/website/docs/dlt-ecosystem/file-formats/csv.md` (4 additions, 1 deletion)
@@ -29,7 +29,7 @@ Internally, we use two implementations:
 * separators are commas
 * quotes are **"** and are escaped as **""**
 * `NULL` values are both empty strings and empty tokens as in the example below
-* UNIX new lines are used
+* UNIX new lines (`"\n"`) are used by default
 * dates are represented as ISO 8601
 * quoting style is "when needed"

@@ -51,6 +51,7 @@ with standard settings:

 * `delimiter`: change the delimiting character (default: ',')
 * `include_header`: include the header row (default: True)
+* `lineterminator`: the string used to terminate lines (default: `\n`, i.e. UNIX line endings; use `\r\n` for Windows line endings)
 * `quoting`: controls when quotes should be generated around field values. Available options:

   - `quote_needed` (default): quote only values that need quoting, i.e., non-numeric values
@@ -72,6 +73,7 @@ with standard settings:
 delimiter="|"
 include_header=false
 quoting="quote_all"
+lineterminator="\r\n"
 ```

 Or using environment variables:
@@ -80,6 +82,7 @@ Or using environment variables:
 NORMALIZE__DATA_WRITER__DELIMITER=|
 NORMALIZE__DATA_WRITER__INCLUDE_HEADER=False
 NORMALIZE__DATA_WRITER__QUOTING=quote_all
+NORMALIZE__DATA_WRITER__LINETERMINATOR="\r\n"
 ```

 ### Destination settings
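The `### Destination settings` section itself is collapsed in this view. As a hedged sketch of how these format options typically attach per destination (assuming the destination factory accepts a `csv_format` argument, as the dlt docs describe for destinations such as Postgres):

```python
# Hedged sketch: assumes the postgres factory accepts csv_format,
# per the csv destination settings described in dlt's docs.
import dlt
from dlt.common.destination.configuration import CsvFormatConfiguration
from dlt.destinations import postgres

dest = postgres(csv_format=CsvFormatConfiguration(lineterminator="\r\n"))
pipeline = dlt.pipeline("csv_lineterminator_demo", destination=dest)
```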
`tests/libs/test_csv_writer.py` (24 additions, 1 deletion)
@@ -25,7 +25,7 @@
     TABLE_ROW_ALL_DATA_TYPES,
     arrow_table_all_data_types,
 )
-from tests.utils import TestDataItemFormat
+from tests.utils import TestDataItemFormat, custom_environ


 def test_csv_arrow_writer_all_data_fields() -> None:
@@ -311,3 +311,26 @@ def test_csv_writer_quoting_parameters(quoting: CsvQuoting) -> None:

     mock_writer_instance.writeheader.assert_called_once()
     mock_writer_instance.writerows.assert_called_once_with(test_data)
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        {"lineterminator": "\n", "expected": b'"col1"\n"value1"\n"value2"\n'},
+        {"lineterminator": "\r\n", "expected": b'"col1"\r\n"value1"\r\n"value2"\r\n'},
+    ],
+)
+def test_csv_lineterminator(test_case: Dict[str, str]) -> None:
+    lineterminator = test_case["lineterminator"]
+    expected = test_case["expected"]
+
+    schema: TTableSchemaColumns = {"col1": {"name": "col1", "data_type": "text"}}
+    data = [{"col1": "value1"}, {"col1": "value2"}]
+
+    with custom_environ({"DATA_WRITER__LINETERMINATOR": lineterminator}):
+        with get_writer(CsvWriter, disable_compression=True) as writer:
+            writer.write_data_item(data, schema)
+
+    with open(writer.closed_files[0].file_path, "rb") as f:
+        content = f.read()
+    assert content == expected
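The new test can be run in isolation with something like `pytest tests/libs/test_csv_writer.py -k test_csv_lineterminator`. The expected byte strings make the contract explicit: the configured terminator, injected through the `DATA_WRITER__LINETERMINATOR` environment variable via `custom_environ`, follows the header row as well as every data row.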