@@ -1195,7 +1195,7 @@ def query(
1195
1195
An additional SQL WHERE clause to filter the data (used in Rust queries).
1196
1196
files : list[str], optional
1197
1197
A specific list of files to query from. If provided, these files are used
1198
- instead of discovering files through the normal process. Forces PyArrow backend.
1198
+ instead of discovering files through the normal process.
1199
1199
**kwargs : Any
1200
1200
Additional keyword arguments passed to the underlying query implementation.
1201
1201
@@ -1233,6 +1233,7 @@ def query(
1233
1233
start = start ,
1234
1234
end = end ,
1235
1235
where = where ,
1236
+ files = files ,
1236
1237
** kwargs ,
1237
1238
)
1238
1239
else :
@@ -1270,6 +1271,7 @@ def _query_rust(
1270
1271
start : TimestampLike | None = None ,
1271
1272
end : TimestampLike | None = None ,
1272
1273
where : str | None = None ,
1274
+ files : list [str ] | None = None ,
1273
1275
** kwargs : Any ,
1274
1276
) -> list [Data ]:
1275
1277
query_data_cls = OrderBookDelta if data_cls == OrderBookDeltas else data_cls
@@ -1279,6 +1281,7 @@ def _query_rust(
1279
1281
start = start ,
1280
1282
end = end ,
1281
1283
where = where ,
1284
+ files = files ,
1282
1285
** kwargs ,
1283
1286
)
1284
1287
result = session .to_query_result ()
@@ -1304,6 +1307,7 @@ def backend_session(
1304
1307
end : TimestampLike | None = None ,
1305
1308
where : str | None = None ,
1306
1309
session : DataBackendSession | None = None ,
1310
+ files : list [str ] | None = None ,
1307
1311
** kwargs : Any ,
1308
1312
) -> DataBackendSession :
1309
1313
"""
@@ -1327,6 +1331,9 @@ def backend_session(
1327
1331
An additional SQL WHERE clause to filter the data.
1328
1332
session : DataBackendSession, optional
1329
1333
An existing session to update. If None, a new session is created.
1334
+ files : list[str], optional
1335
+ A specific list of files to query from. If provided, these files are used
1336
+ instead of discovering files through the normal process.
1330
1337
**kwargs : Any
1331
1338
Additional keyword arguments.
1332
1339
@@ -1351,7 +1358,7 @@ def backend_session(
1351
1358
1352
1359
"""
1353
1360
data_type : NautilusDataType = ParquetDataCatalog ._nautilus_data_cls_to_data_type (data_cls )
1354
- files = self ._query_files (data_cls , identifiers , start , end )
1361
+ file_list = files if files is not None else self ._query_files (data_cls , identifiers , start , end )
1355
1362
file_prefix = class_to_filename (data_cls )
1356
1363
1357
1364
if session is None :
@@ -1361,7 +1368,7 @@ def backend_session(
1361
1368
if self .fs_protocol != "file" :
1362
1369
self ._register_object_store_with_session (session )
1363
1370
1364
- for idx , file in enumerate (files ):
1371
+ for idx , file in enumerate (file_list ):
1365
1372
table = f"{ file_prefix } _{ idx } "
1366
1373
query = self ._build_query (
1367
1374
table ,
@@ -1492,10 +1499,7 @@ def _query_pyarrow(
1492
1499
** kwargs : Any ,
1493
1500
) -> list [Data ]:
1494
1501
# Load dataset - use provided files or query for them
1495
- if files is not None :
1496
- file_list = files
1497
- else :
1498
- file_list = self ._query_files (data_cls , identifiers , start , end )
1502
+ file_list = files if files is not None else self ._query_files (data_cls , identifiers , start , end )
1499
1503
1500
1504
if not file_list :
1501
1505
return []
@@ -1536,32 +1540,50 @@ def _query_files(
1536
1540
file_prefix = class_to_filename (data_cls )
1537
1541
base_path = self .path .rstrip ("/" )
1538
1542
glob_path = f"{ base_path } /data/{ file_prefix } /**/*.parquet"
1539
- file_names : list [str ] = self .fs .glob (glob_path )
1543
+ file_paths : list [str ] = self .fs .glob (glob_path )
1540
1544
1541
1545
if identifiers :
1542
1546
if not isinstance (identifiers , list ):
1543
1547
identifiers = [identifiers ]
1544
1548
1545
1549
safe_identifiers = [urisafe_identifier (identifier ) for identifier in identifiers ]
1546
- file_names = [
1547
- file_name
1548
- for file_name in file_names
1549
- if any (safe_identifier in file_name for safe_identifier in safe_identifiers )
1550
+
1551
+ # Exact match by default for instrument_ids or bar_types
1552
+ exact_match_file_paths = [
1553
+ file_path
1554
+ for file_path in file_paths
1555
+ if any (
1556
+ safe_identifier == file_path .split ("/" )[- 2 ]
1557
+ for safe_identifier in safe_identifiers
1558
+ )
1550
1559
]
1551
1560
1561
+ if not exact_match_file_paths and data_cls in [Bar , * Bar .__subclasses__ ()]:
1562
+ # Partial match of instrument_ids in bar_types for bars
1563
+ file_paths = [
1564
+ file_path
1565
+ for file_path in file_paths
1566
+ if any (
1567
+ file_path .split ("/" )[- 2 ].startswith (f"{ safe_identifier } -" )
1568
+ for safe_identifier in safe_identifiers
1569
+ )
1570
+ ]
1571
+ else :
1572
+ file_paths = exact_match_file_paths
1573
+
1552
1574
used_start : pd .Timestamp | None = time_object_to_dt (start )
1553
1575
used_end : pd .Timestamp | None = time_object_to_dt (end )
1554
- file_names = [
1555
- file_name
1556
- for file_name in file_names
1557
- if _query_intersects_filename (file_name , used_start , used_end )
1576
+ file_paths = [
1577
+ file_path
1578
+ for file_path in file_paths
1579
+ if _query_intersects_filename (file_path , used_start , used_end )
1558
1580
]
1559
1581
1560
1582
if self .show_query_paths :
1561
- for file_name in file_names :
1562
- print (file_name )
1583
+ for file_path in file_paths :
1584
+ print (file_path )
1563
1585
1564
- return file_names
1586
+ return file_paths
1565
1587
1566
1588
@staticmethod
1567
1589
def _handle_table_nautilus (
0 commit comments