Skip to content

Commit b8ae509

Browse files
committed
I/O: Improve CSV file import, with suggestions from CodeRabbit
1 parent beab5de commit b8ae509

File tree

5 files changed

+72
-23
lines changed

5 files changed

+72
-23
lines changed

cratedb_toolkit/io/file/csv.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class CsvFileAddress:
3131
url: URL
3232
location: str
3333
pipeline: Optional[List[str]] = dataclasses.field(default_factory=list)
34-
batch_size: Optional[int] = DEFAULT_BATCH_SIZE
34+
batch_size: int = DEFAULT_BATCH_SIZE
3535
# TODO: What about other parameters? See `polars.io.csv.functions`.
3636
separator: Optional[str] = DEFAULT_SEPARATOR
3737
quote_char: Optional[str] = DEFAULT_QUOTE_CHAR
@@ -47,11 +47,15 @@ def from_url(cls, url: str) -> "CsvFileAddress":
4747
https://guided-path.s3.us-east-1.amazonaws.com/demo_climate_data_export.csv
4848
"""
4949
url_obj, location = parse_uri(url, "csv")
50+
try:
51+
batch_size = int(url_obj.query_params.get("batch-size", DEFAULT_BATCH_SIZE))
52+
except ValueError as ex:
53+
raise ValueError("Invalid value for batch size") from ex
5054
return cls(
5155
url=url_obj,
5256
location=location,
5357
pipeline=url_obj.query_params.getlist("pipe"),
54-
batch_size=int(url_obj.query_params.get("batch-size", DEFAULT_BATCH_SIZE)),
58+
batch_size=batch_size,
5559
separator=url_obj.query_params.get("separator", DEFAULT_SEPARATOR),
5660
quote_char=url_obj.query_params.get("quote-char", DEFAULT_QUOTE_CHAR),
5761
)
@@ -81,18 +85,22 @@ def collect_properties(query_params: Dict, prefixes: List) -> Dict[str, str]:
8185
break
8286
return opts
8387

84-
def load_table(self) -> pl.LazyFrame:
88+
def load_table(self, lazy: bool = True) -> pl.LazyFrame:
8589
"""
8690
Load the CSV file as a Polars LazyFrame.
8791
"""
8892

8993
# Read from data source.
90-
lf = pl.scan_csv(
91-
self.location,
92-
separator=self.separator,
93-
quote_char=self.quote_char,
94-
storage_options=self.storage_options,
95-
)
94+
kwargs = {
95+
"separator": self.separator,
96+
"quote_char": self.quote_char,
97+
"storage_options": self.storage_options,
98+
}
99+
# Note: Type checker ignores are only for Python 3.9.
100+
if lazy:
101+
lf = pl.scan_csv(self.location, **kwargs) # ty: ignore[invalid-argument-type]
102+
else:
103+
lf = pl.read_csv(self.location, **kwargs).lazy() # ty: ignore[invalid-argument-type]
96104

97105
# Optionally apply transformations.
98106
if self.pipeline:
@@ -118,8 +126,27 @@ def from_csv(source_url, target_url, progress: bool = False) -> bool:
118126
"""
119127
source = CsvFileAddress.from_url(source_url)
120128
logger.info(f"File address: {source.location}")
121-
return polars_to_cratedb(
122-
frame=source.load_table(),
123-
target_url=target_url,
124-
chunk_size=source.batch_size,
125-
)
129+
130+
try:
131+
return polars_to_cratedb(
132+
frame=source.load_table(),
133+
target_url=target_url,
134+
chunk_size=source.batch_size or DEFAULT_BATCH_SIZE,
135+
)
136+
137+
# OSError: object-store error: Generic S3 error: Error performing PUT http://169.254.169.254/latest/api/token
138+
# in 218.979617ms, after 2 retries, max_retries: 2, retry_timeout: 10s - HTTP error:
139+
# error sending request (path: s3://guided-path/demo_climate_data_export.csv)
140+
except OSError as ex:
141+
msg = str(ex)
142+
if "Generic S3 error" in msg and "/api/token" in msg:
143+
logger.warning(
144+
"Storage backend authentication is required for streaming reads but failed. "
145+
"Falling back to non-streaming mode: This may result in inefficient reads."
146+
)
147+
return polars_to_cratedb(
148+
frame=source.load_table(lazy=False),
149+
target_url=target_url,
150+
chunk_size=source.batch_size,
151+
)
152+
raise OSError(f"Loading data from CSV failed: {source_url}: {msg}") from ex

cratedb_toolkit/io/router.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def load_table(
116116

117117
adjusted_url = str(source_url_obj)
118118
if source_url_obj.scheme.startswith("csv"):
119-
adjusted_url = str(source_url_obj.path)
119+
source_url_obj.scheme = None
120120

121121
return from_csv(adjusted_url, target_url)
122122

cratedb_toolkit/model.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,12 @@ def schema(self) -> t.Union[str, None]:
206206
def with_table_address(self, table_address: "TableAddress") -> "DatabaseAddress":
207207
cp = deepcopy(self)
208208
cp.uri.path = f"/{table_address.schema}/{table_address.table}"
209+
# Use `if-exists` from table address.
209210
if table_address.if_exists:
210211
cp.uri.query_params["if-exists"] = table_address.if_exists
212+
# When not supplied, don't let existing spots leak.
213+
else:
214+
cp.uri.query_params.pop("if-exists", None)
211215
return cp
212216

213217

tests/cluster/test_import.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def test_parquet_import_remote(cloud_environment, caplog):
5959
assert result.exit_code == 0, f"ERROR: {result.output}"
6060

6161
assert "Loading data." in caplog.text
62-
assert "target=TableAddress(schema=None, table='basic')" in caplog.text
62+
assert "target=TableAddress(schema=None, table='basic'" in caplog.text
6363
assert "Import succeeded (status: SUCCEEDED)" in caplog.text
6464

6565
with ManagedCluster.from_env() as cluster:

tests/io/file/test_csv.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@
1111
climate_json_json = (
1212
str(data_folder / "climate_json_json.csv") + "?quote-char='&pipe=json_array_to_wkt_point:geo_location"
1313
)
14-
climate_json_python = (
14+
climate_json_python_local = (
1515
str(data_folder / "climate_json_python.csv")
1616
+ '?quote-char="&pipe=json_array_to_wkt_point:geo_location&pipe=python_to_json:data'
1717
)
1818
climate_wkt_json = str(data_folder / "climate_wkt_json.csv") + "?quote-char='"
19+
climate_json_python_s3 = "https://guided-path.s3.us-east-1.amazonaws.com/demo_climate_data_export.csv?pipe=json_array_to_wkt_point:geo_location&pipe=python_to_json:data"
1920

2021
table_address = TableAddress(schema=TESTDRIVE_DATA_SCHEMA, table="climate_data", if_exists="append")
2122

@@ -25,22 +26,39 @@ def provision_ddl(cratedb_synchronized) -> None:
2526
cratedb_synchronized.database.run_sql(ddl)
2627

2728

28-
def test_load_csv_json_json(cratedb_synchronized, provision_ddl):
29+
def test_load_csv_wkt_json(cratedb_synchronized, provision_ddl):
30+
"""Load a CSV file that does not need any geo transformations."""
2931
cluster = DatabaseCluster.create(cluster_url=cratedb_synchronized.database.dburi)
30-
cluster.load_table(InputOutputResource(climate_json_json), target=table_address)
32+
cluster.load_table(InputOutputResource(climate_wkt_json), target=table_address)
3133
cluster.adapter.refresh_table(table_address.fullname)
3234
assert cluster.adapter.count_records(table_address.fullname) == 3, "Wrong number of records returned"
3335

3436

35-
def test_load_csv_json_python(cratedb_synchronized, provision_ddl):
37+
def test_load_geo_csv_json_json(cratedb_synchronized, provision_ddl):
38+
"""Load a CSV file that needs geo transformations."""
39+
pytest.importorskip("polars_st", reason="CSV import needs geo transformations")
3640
cluster = DatabaseCluster.create(cluster_url=cratedb_synchronized.database.dburi)
37-
cluster.load_table(InputOutputResource(climate_json_python), target=table_address)
41+
cluster.load_table(InputOutputResource(climate_json_json), target=table_address)
3842
cluster.adapter.refresh_table(table_address.fullname)
3943
assert cluster.adapter.count_records(table_address.fullname) == 3, "Wrong number of records returned"
4044

4145

42-
def test_load_csv_wkt_json(cratedb_synchronized, provision_ddl):
46+
def test_load_geo_csv_json_python_local(cratedb_synchronized, provision_ddl):
47+
"""Load a CSV file that needs geo transformations."""
48+
pytest.importorskip("polars_st", reason="CSV import needs geo transformations")
4349
cluster = DatabaseCluster.create(cluster_url=cratedb_synchronized.database.dburi)
44-
cluster.load_table(InputOutputResource(climate_wkt_json), target=table_address)
50+
cluster.load_table(InputOutputResource(climate_json_python_local), target=table_address)
4551
cluster.adapter.refresh_table(table_address.fullname)
4652
assert cluster.adapter.count_records(table_address.fullname) == 3, "Wrong number of records returned"
53+
54+
55+
@pytest.mark.skip(
56+
"Test takes too long to complete. When aiming to test a remote data source, please use a smaller dataset."
57+
)
58+
def test_load_geo_csv_json_python_s3(cratedb_synchronized, provision_ddl):
59+
"""Load a CSV file that needs geo transformations."""
60+
pytest.importorskip("polars_st", reason="CSV import needs geo transformations")
61+
cluster = DatabaseCluster.create(cluster_url=cratedb_synchronized.database.dburi)
62+
cluster.load_table(InputOutputResource(climate_json_python_s3), target=table_address)
63+
cluster.adapter.refresh_table(table_address.fullname)
64+
assert cluster.adapter.count_records(table_address.fullname) == 22650, "Wrong number of records returned"

0 commit comments

Comments
 (0)