@@ -31,7 +31,7 @@ class CsvFileAddress:
    # Parsed source URL object, including query parameters.
    url: URL
    # Resolved file/object location string derived from the URL.
    location: str
    # Names of transformation steps, collected from repeated `pipe` query parameters.
    pipeline: Optional[List[str]] = dataclasses.field(default_factory=list)
    # Number of records per chunk when transferring data to the target.
    batch_size: int = DEFAULT_BATCH_SIZE
    # TODO: What about other parameters? See `polars.io.csv.functions`.
    separator: Optional[str] = DEFAULT_SEPARATOR
    quote_char: Optional[str] = DEFAULT_QUOTE_CHAR
@@ -47,11 +47,15 @@ def from_url(cls, url: str) -> "CsvFileAddress":
4747 https://guided-path.s3.us-east-1.amazonaws.com/demo_climate_data_export.csv
4848 """
4949 url_obj , location = parse_uri (url , "csv" )
50+ try :
51+ batch_size = int (url_obj .query_params .get ("batch-size" , DEFAULT_BATCH_SIZE ))
52+ except ValueError as ex :
53+ raise ValueError ("Invalid value for batch size" ) from ex
5054 return cls (
5155 url = url_obj ,
5256 location = location ,
5357 pipeline = url_obj .query_params .getlist ("pipe" ),
54- batch_size = int ( url_obj . query_params . get ( "batch-size" , DEFAULT_BATCH_SIZE )) ,
58+ batch_size = batch_size ,
5559 separator = url_obj .query_params .get ("separator" , DEFAULT_SEPARATOR ),
5660 quote_char = url_obj .query_params .get ("quote-char" , DEFAULT_QUOTE_CHAR ),
5761 )
@@ -81,18 +85,22 @@ def collect_properties(query_params: Dict, prefixes: List) -> Dict[str, str]:
8185 break
8286 return opts
8387
84- def load_table (self ) -> pl .LazyFrame :
88+ def load_table (self , lazy : bool = True ) -> pl .LazyFrame :
8589 """
8690 Load the CSV file as a Polars LazyFrame.
8791 """
8892
8993 # Read from data source.
90- lf = pl .scan_csv (
91- self .location ,
92- separator = self .separator ,
93- quote_char = self .quote_char ,
94- storage_options = self .storage_options ,
95- )
94+ kwargs = {
95+ "separator" : self .separator ,
96+ "quote_char" : self .quote_char ,
97+ "storage_options" : self .storage_options ,
98+ }
99+ # Note: Type checker ignores are only for Python 3.9.
100+ if lazy :
101+ lf = pl .scan_csv (self .location , ** kwargs ) # ty: ignore[invalid-argument-type]
102+ else :
103+ lf = pl .read_csv (self .location , ** kwargs ).lazy () # ty: ignore[invalid-argument-type]
96104
97105 # Optionally apply transformations.
98106 if self .pipeline :
def from_csv(source_url, target_url, progress: bool = False) -> bool:
    """
    Load a CSV resource addressed by `source_url` into CrateDB at `target_url`.

    :param source_url: CSV source address; query parameters configure parsing
        and batching (see `CsvFileAddress.from_url`).
    :param target_url: CrateDB connection/target URL.
    :param progress: Whether to display progress.
        NOTE(review): appears unused in this body — confirm it is consumed elsewhere.
    :return: Outcome of `polars_to_cratedb`.
    :raises OSError: When loading fails for reasons other than the known
        streaming-authentication failure, wrapped with the source URL for context.
    """
    source = CsvFileAddress.from_url(source_url)
    logger.info(f"File address: {source.location}")

    try:
        return polars_to_cratedb(
            frame=source.load_table(),
            target_url=target_url,
            # `batch_size` is a non-optional int with a default, so no
            # `or DEFAULT_BATCH_SIZE` fallback is needed here (keeps both
            # call sites consistent).
            chunk_size=source.batch_size,
        )

    # OSError: object-store error: Generic S3 error: Error performing PUT http://169.254.169.254/latest/api/token
    # in 218.979617ms, after 2 retries, max_retries: 2, retry_timeout: 10s - HTTP error:
    # error sending request (path: s3://guided-path/demo_climate_data_export.csv)
    except OSError as ex:
        msg = str(ex)
        if "Generic S3 error" in msg and "/api/token" in msg:
            logger.warning(
                "Storage backend authentication is required for streaming reads but failed. "
                "Falling back to non-streaming mode: This may result in inefficient reads."
            )
            # Eager (non-lazy) read avoids the streaming credential lookup.
            return polars_to_cratedb(
                frame=source.load_table(lazy=False),
                target_url=target_url,
                chunk_size=source.batch_size,
            )
        raise OSError(f"Loading data from CSV failed: {source_url}: {msg}") from ex