Skip to content

Commit dca6332

Browse files
committed
I/O: Improve CSV file import
1 parent 01ed104 commit dca6332

File tree

3 files changed

+56
-18
lines changed

3 files changed

+56
-18
lines changed

cratedb_toolkit/io/file/csv.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class CsvFileAddress:
3131
url: URL
3232
location: str
3333
pipeline: Optional[List[str]] = dataclasses.field(default_factory=list)
34-
batch_size: Optional[int] = DEFAULT_BATCH_SIZE
34+
batch_size: int = DEFAULT_BATCH_SIZE
3535
# TODO: What about other parameters? See `polars.io.csv.functions`.
3636
separator: Optional[str] = DEFAULT_SEPARATOR
3737
quote_char: Optional[str] = DEFAULT_QUOTE_CHAR
@@ -47,11 +47,15 @@ def from_url(cls, url: str) -> "CsvFileAddress":
4747
https://guided-path.s3.us-east-1.amazonaws.com/demo_climate_data_export.csv
4848
"""
4949
url_obj, location = parse_uri(url, "csv")
50+
try:
51+
batch_size = int(url_obj.query_params.get("batch-size", DEFAULT_BATCH_SIZE))
52+
except ValueError as ex:
53+
raise ValueError("Invalid value for batch size") from ex
5054
return cls(
5155
url=url_obj,
5256
location=location,
5357
pipeline=url_obj.query_params.getlist("pipe"),
54-
batch_size=int(url_obj.query_params.get("batch-size", DEFAULT_BATCH_SIZE)),
58+
batch_size=batch_size,
5559
separator=url_obj.query_params.get("separator", DEFAULT_SEPARATOR),
5660
quote_char=url_obj.query_params.get("quote-char", DEFAULT_QUOTE_CHAR),
5761
)
@@ -81,18 +85,21 @@ def collect_properties(query_params: Dict, prefixes: List) -> Dict[str, str]:
8185
break
8286
return opts
8387

84-
def load_table(self) -> pl.LazyFrame:
88+
def load_table(self, lazy: bool = True) -> pl.LazyFrame:
8589
"""
8690
Load the CSV file as a Polars LazyFrame.
8791
"""
8892

8993
# Read from data source.
90-
lf = pl.scan_csv(
91-
self.location,
92-
separator=self.separator,
93-
quote_char=self.quote_char,
94-
storage_options=self.storage_options,
95-
)
94+
kwargs = {
95+
"separator": self.separator,
96+
"quote_char": self.quote_char,
97+
"storage_options": self.storage_options,
98+
}
99+
if lazy:
100+
lf = pl.scan_csv(self.location, **kwargs)
101+
else:
102+
lf = pl.read_csv(self.location, **kwargs).lazy()
96103

97104
# Optionally apply transformations.
98105
if self.pipeline:
@@ -118,8 +125,28 @@ def from_csv(source_url, target_url, progress: bool = False) -> bool:
118125
"""
119126
source = CsvFileAddress.from_url(source_url)
120127
logger.info(f"File address: {source.location}")
121-
return polars_to_cratedb(
122-
frame=source.load_table(),
123-
target_url=target_url,
124-
chunk_size=source.batch_size,
125-
)
128+
129+
try:
130+
return polars_to_cratedb(
131+
frame=source.load_table(),
132+
target_url=target_url,
133+
chunk_size=source.batch_size or DEFAULT_BATCH_SIZE,
134+
)
135+
136+
# OSError: object-store error: Generic S3 error: Error performing PUT http://169.254.169.254/latest/api/token
137+
# in 218.979617ms, after 2 retries, max_retries: 2, retry_timeout: 10s - HTTP error:
138+
# error sending request (path: s3://guided-path/demo_climate_data_export.csv)
139+
except OSError as e:
140+
msg = str(e)
141+
if "Generic S3 error" in msg and "/api/token" in msg:
142+
logger.warning(
143+
"Authentication with the storage backend is required for lazy reading, but failed. "
144+
"Falling back to complete reading: This may exhaust your system memory."
145+
)
146+
return polars_to_cratedb(
147+
frame=source.load_table(lazy=False),
148+
target_url=target_url,
149+
chunk_size=source.batch_size,
150+
)
151+
152+
raise RuntimeError(f"Loading data from CSV failed (unknown error): {source_url}")

cratedb_toolkit/io/router.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def load_table(
116116

117117
adjusted_url = str(source_url_obj)
118118
if source_url_obj.scheme.startswith("csv"):
119-
adjusted_url = str(source_url_obj.path)
119+
source_url_obj.scheme = None
120120

121121
return from_csv(adjusted_url, target_url)
122122

tests/io/file/test_csv.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@
1111
climate_json_json = (
1212
str(data_folder / "climate_json_json.csv") + "?quote-char='&pipe=json_array_to_wkt_point:geo_location"
1313
)
14-
climate_json_python = (
14+
climate_json_python_local = (
1515
str(data_folder / "climate_json_python.csv")
1616
+ '?quote-char="&pipe=json_array_to_wkt_point:geo_location&pipe=python_to_json:data'
1717
)
1818
climate_wkt_json = str(data_folder / "climate_wkt_json.csv") + "?quote-char='"
19+
climate_json_python_s3 = "https://guided-path.s3.us-east-1.amazonaws.com/demo_climate_data_export.csv?pipe=json_array_to_wkt_point:geo_location&pipe=python_to_json:data"
1920

2021
table_address = TableAddress(schema=TESTDRIVE_DATA_SCHEMA, table="climate_data", if_exists="append")
2122

@@ -32,13 +33,23 @@ def test_load_csv_json_json(cratedb_synchronized, provision_ddl):
3233
assert cluster.adapter.count_records(table_address.fullname) == 3, "Wrong number of records returned"
3334

3435

35-
def test_load_csv_json_python(cratedb_synchronized, provision_ddl):
36+
def test_load_csv_json_python_local(cratedb_synchronized, provision_ddl):
3637
cluster = DatabaseCluster.create(cluster_url=cratedb_synchronized.database.dburi)
37-
cluster.load_table(InputOutputResource(climate_json_python), target=table_address)
38+
cluster.load_table(InputOutputResource(climate_json_python_local), target=table_address)
3839
cluster.adapter.refresh_table(table_address.fullname)
3940
assert cluster.adapter.count_records(table_address.fullname) == 3, "Wrong number of records returned"
4041

4142

43+
@pytest.mark.skip(
44+
"Test takes too long to complete. When aiming to test a remote data source, please use a smaller dataset."
45+
)
46+
def test_load_csv_json_python_s3(cratedb_synchronized, provision_ddl):
47+
cluster = DatabaseCluster.create(cluster_url=cratedb_synchronized.database.dburi)
48+
cluster.load_table(InputOutputResource(climate_json_python_s3), target=table_address)
49+
cluster.adapter.refresh_table(table_address.fullname)
50+
assert cluster.adapter.count_records(table_address.fullname) == 22650, "Wrong number of records returned"
51+
52+
4253
def test_load_csv_wkt_json(cratedb_synchronized, provision_ddl):
4354
cluster = DatabaseCluster.create(cluster_url=cratedb_synchronized.database.dburi)
4455
cluster.load_table(InputOutputResource(climate_wkt_json), target=table_address)

0 commit comments

Comments (0)