Skip to content

Commit bf000e4

Browse files
committed
fix: do not persist storage when disabled
Closes: #539
1 parent 8fa2b4b commit bf000e4

File tree

3 files changed

+57
-3
lines changed

3 files changed

+57
-3
lines changed

src/crawlee/service_container.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,16 @@ class _Services(TypedDict):
3939
_default_storage_client_type: StorageClientType = 'local'
4040

4141

42-
def get_storage_client(*, client_type: StorageClientType | None = None) -> BaseStorageClient:
42+
def get_storage_client(
43+
*,
44+
client_type: StorageClientType | None = None,
45+
configuration: Configuration | None = None,
46+
) -> BaseStorageClient:
4347
"""Get the storage client instance for the current environment.
4448
4549
Args:
4650
client_type: Allows retrieving a specific storage client type, regardless of where we are running.
51+
configuration: The configuration to use.
4752
4853
Returns:
4954
The current storage client instance.
@@ -57,7 +62,7 @@ def get_storage_client(*, client_type: StorageClientType | None = None) -> BaseS
5762
return _services['cloud_storage_client']
5863

5964
if 'local_storage_client' not in _services:
60-
_services['local_storage_client'] = MemoryStorageClient()
65+
_services['local_storage_client'] = MemoryStorageClient(configuration=configuration)
6166

6267
return _services['local_storage_client']
6368

src/crawlee/storages/_creation_management.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ async def open_storage(
129129
) -> TResource:
130130
"""Open either a new storage or restore an existing one and return it."""
131131
configuration = configuration or Configuration.get_global_configuration()
132-
storage_client = storage_client or service_container.get_storage_client()
132+
storage_client = storage_client or service_container.get_storage_client(configuration=configuration)
133133

134134
# Try to restore the storage from cache by name
135135
if name:

tests/unit/test_configuration.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,56 @@
1+
# TODO: Remove the `# type: ignore` markers on the `crawlee_storage_dir` arguments below once the linked issue is resolved.
2+
# https://github.com/apify/crawlee-python/issues/146
3+
14
from __future__ import annotations
25

6+
from typing import TYPE_CHECKING
7+
38
from crawlee.configuration import Configuration
9+
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
10+
11+
if TYPE_CHECKING:
12+
from pathlib import Path
413

514

615
def test_global_configuration_works() -> None:
    """Two consecutive lookups of the global configuration must yield the very same object."""
    first_lookup = Configuration.get_global_configuration()
    second_lookup = Configuration.get_global_configuration()
    assert first_lookup is second_lookup
17+
18+
19+
async def test_storage_not_persisted_when_disabled(tmp_path: Path) -> None:
    """Run a crawl with persistence switched off and expect the storage directory to stay empty."""
    # Storage and metadata persistence are both disabled; the storage directory
    # points at the per-test temporary directory so any stray write is detectable.
    config = Configuration(
        persist_storage=False,
        write_metadata=False,
        crawlee_storage_dir=str(tmp_path),  # type: ignore
    )
    crawler = HttpCrawler(configuration=config)

    @crawler.router.default_handler
    async def record_url(context: HttpCrawlingContext) -> None:
        # Push one item per handled request so the crawl produces data to store.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # The crawl must not have left anything in the storage directory.
    leftover_entries = list(tmp_path.iterdir())
    assert not leftover_entries, 'Expected the storage directory to be empty, but it is not.'
37+
38+
39+
async def test_storage_persisted_when_enabled(tmp_path: Path) -> None:
    """Run a crawl with persistence switched on and expect entries in the storage directory."""
    # Storage and metadata persistence are both enabled; the storage directory
    # points at the per-test temporary directory so the written entries can be inspected.
    config = Configuration(
        persist_storage=True,
        write_metadata=True,
        crawlee_storage_dir=str(tmp_path),  # type: ignore
    )
    crawler = HttpCrawler(configuration=config)

    @crawler.router.default_handler
    async def record_url(context: HttpCrawlingContext) -> None:
        # Push one item per handled request so the crawl produces data to store.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # The crawl must have left at least one entry in the storage directory.
    created_entries = list(tmp_path.iterdir())
    assert created_entries, 'Expected the storage directory to contain files, but it does not.'

0 commit comments

Comments
 (0)