Skip to content

fix: Do not persist storage when it is configured #559

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/crawlee/service_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,16 @@ class _Services(TypedDict):
_default_storage_client_type: StorageClientType = 'local'


def get_storage_client(*, client_type: StorageClientType | None = None) -> BaseStorageClient:
def get_storage_client(
*,
client_type: StorageClientType | None = None,
configuration: Configuration | None = None,
) -> BaseStorageClient:
"""Get the storage client instance for the current environment.

Args:
client_type: Allows retrieving a specific storage client type, regardless of where we are running.
configuration: The configuration to use.

Returns:
The current storage client instance.
Expand All @@ -57,7 +62,7 @@ def get_storage_client(*, client_type: StorageClientType | None = None) -> BaseS
return _services['cloud_storage_client']

if 'local_storage_client' not in _services:
_services['local_storage_client'] = MemoryStorageClient()
_services['local_storage_client'] = MemoryStorageClient(configuration=configuration)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if there is a local_storage_client with a different configuration though? I'm afraid we'll need some changes on an architectural level...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we even store those two separately? those should be two flavors of a single service, not two different services, at least that's how it was designed in the js version.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you mean why we have a separate local_storage_client and cloud_storage_client, we do this so that the force_cloud switch in the SDK works. In the JS SDK, the Actor class holds the cloud client. This feels more readable to me.


return _services['local_storage_client']

Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/storages/_creation_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ async def open_storage(
) -> TResource:
"""Open either a new storage or restore an existing one and return it."""
configuration = configuration or Configuration.get_global_configuration()
storage_client = storage_client or service_container.get_storage_client()
storage_client = storage_client or service_container.get_storage_client(configuration=configuration)

# Try to restore the storage from cache by name
if name:
Expand Down
49 changes: 49 additions & 0 deletions tests/unit/test_configuration.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,56 @@
# TODO: type ignores and crawlee_storage_dir
# https://github.com/apify/crawlee-python/issues/146

from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee.configuration import Configuration
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext

if TYPE_CHECKING:
from pathlib import Path


def test_global_configuration_works() -> None:
assert Configuration.get_global_configuration() is Configuration.get_global_configuration()


async def test_storage_not_persisted_when_disabled(tmp_path: Path) -> None:
# Configure the crawler to not persist storage or metadata.
configuration = Configuration(
persist_storage=False,
write_metadata=False,
crawlee_storage_dir=str(tmp_path), # type: ignore
)

crawler = HttpCrawler(configuration=configuration)

@crawler.router.default_handler
async def default_handler(context: HttpCrawlingContext) -> None:
await context.push_data({'url': context.request.url})

await crawler.run(['https://crawlee.dev'])

# Verify that no files were created in the storage directory.
assert not any(tmp_path.iterdir()), 'Expected the storage directory to be empty, but it is not.'


async def test_storage_persisted_when_enabled(tmp_path: Path) -> None:
# Configure the crawler to persist storage and metadata.
configuration = Configuration(
persist_storage=True,
write_metadata=True,
crawlee_storage_dir=str(tmp_path), # type: ignore
)

crawler = HttpCrawler(configuration=configuration)

@crawler.router.default_handler
async def default_handler(context: HttpCrawlingContext) -> None:
await context.push_data({'url': context.request.url})

await crawler.run(['https://crawlee.dev'])

# Verify that files were created in the storage directory.
assert any(tmp_path.iterdir()), 'Expected the storage directory to contain files, but it does not.'
Loading