Skip to content

feat: Upgrade to Crawlee v0.5 #355

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
360 changes: 304 additions & 56 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ keywords = [
python = "^3.9"
apify-client = ">=1.8.1"
apify-shared = ">=1.2.1"
crawlee = "~0.4.0"
crawlee = "~0.5.1"
cryptography = ">=42.0.0"
httpx = ">=0.27.0"
lazy-object-proxy = ">=1.10.0"
Expand Down
59 changes: 36 additions & 23 deletions src/apify/_actor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from apify_client import ApifyClientAsync
from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
from crawlee import service_container
from crawlee import service_locator
from crawlee.events._types import Event, EventMigratingData, EventPersistStateData

from apify._configuration import Configuration
Expand All @@ -34,6 +34,7 @@
from typing_extensions import Self

from crawlee.proxy_configuration import _NewUrlFunction
from crawlee.storage_clients import BaseStorageClient

from apify._models import Webhook

Expand Down Expand Up @@ -71,17 +72,22 @@ def __init__(
self._configure_logging = configure_logging
self._apify_client = self.new_client()

self._event_manager: EventManager
if self._configuration.is_at_home:
self._event_manager = PlatformEventManager(
# Create an instance of the cloud storage client, the local storage client is obtained
# from the service locator.
self._cloud_storage_client = ApifyStorageClient.from_config(config=self._configuration)

# Set the event manager based on whether the Actor is running on the platform or locally.
self._event_manager = (
PlatformEventManager(
config=self._configuration,
persist_state_interval=self._configuration.persist_state_interval,
)
else:
self._event_manager = LocalEventManager(
if self.is_at_home()
else LocalEventManager(
system_info_interval=self._configuration.system_info_interval,
persist_state_interval=self._configuration.persist_state_interval,
)
)

self._is_initialized = False

Expand All @@ -94,9 +100,6 @@ async def __aenter__(self) -> Self:
When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while
executing the block code, the `Actor.fail` method is called.
"""
if self._configure_logging:
_configure_logging(self._configuration)

await self.init()
return self

Expand Down Expand Up @@ -156,6 +159,11 @@ def log(self) -> logging.Logger:
"""The logging.Logger instance the Actor uses."""
return logger

@property
def _local_storage_client(self) -> BaseStorageClient:
    """The local storage client the Actor instance uses.

    Resolved on each access from the global `service_locator` rather than
    cached on the instance, so it reflects whichever storage client is
    currently registered there.
    """
    return service_locator.get_storage_client()

def _raise_if_not_initialized(self) -> None:
    """Raise a `RuntimeError` if `Actor.init()` has not been called yet."""
    if not self._is_initialized:
        raise RuntimeError('The Actor was not initialized!')
Expand Down Expand Up @@ -184,18 +192,19 @@ async def init(self) -> None:
if self._is_initialized:
raise RuntimeError('The Actor was already initialized!')

if self._configuration.token:
service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration))
self._is_exiting = False
self._was_final_persist_state_emitted = False

if self._configuration.is_at_home:
service_container.set_default_storage_client_type('cloud')
else:
service_container.set_default_storage_client_type('local')
# If the Actor is running on the Apify platform, we set the cloud storage client.
if self.is_at_home():
service_locator.set_storage_client(self._cloud_storage_client)

service_container.set_event_manager(self._event_manager)
service_locator.set_event_manager(self.event_manager)
service_locator.set_configuration(self.configuration)

self._is_exiting = False
self._was_final_persist_state_emitted = False
# The logging configuration has to be called after all service_locator set methods.
if self._configure_logging:
_configure_logging()

self.log.info('Initializing Actor...')
self.log.info('System info', extra=get_system_info())
Expand Down Expand Up @@ -245,7 +254,6 @@ async def finalize() -> None:
await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)

await self._event_manager.__aexit__(None, None, None)
cast(dict, service_container._services).clear() # noqa: SLF001

await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds())
self._is_initialized = False
Expand Down Expand Up @@ -349,11 +357,13 @@ async def open_dataset(
self._raise_if_not_initialized()
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)

storage_client = self._cloud_storage_client if force_cloud else self._local_storage_client

return await Dataset.open(
id=id,
name=name,
configuration=self._configuration,
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
storage_client=storage_client,
)

async def open_key_value_store(
Expand Down Expand Up @@ -381,12 +391,13 @@ async def open_key_value_store(
"""
self._raise_if_not_initialized()
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
storage_client = self._cloud_storage_client if force_cloud else self._local_storage_client

return await KeyValueStore.open(
id=id,
name=name,
configuration=self._configuration,
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
storage_client=storage_client,
)

async def open_request_queue(
Expand Down Expand Up @@ -417,11 +428,13 @@ async def open_request_queue(
self._raise_if_not_initialized()
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)

storage_client = self._cloud_storage_client if force_cloud else self._local_storage_client

return await RequestQueue.open(
id=id,
name=name,
configuration=self._configuration,
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
storage_client=storage_client,
)

async def push_data(self, data: dict | list[dict]) -> None:
Expand Down Expand Up @@ -963,7 +976,7 @@ async def create_proxy_configuration(
password: str | None = None,
groups: list[str] | None = None,
country_code: str | None = None,
proxy_urls: list[str] | None = None,
proxy_urls: list[str | None] | None = None,
new_url_function: _NewUrlFunction | None = None,
) -> ProxyConfiguration | None:
"""Create a ProxyConfiguration object with the passed proxy configuration.
Expand Down
12 changes: 12 additions & 0 deletions src/apify/_configuration.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from datetime import datetime, timedelta
from logging import getLogger
from typing import Annotated, Any

from pydantic import AliasChoices, BeforeValidator, Field
Expand All @@ -12,6 +13,8 @@

from apify._utils import docs_group

logger = getLogger(__name__)


def _transform_to_list(value: Any) -> list[str] | None:
if value is None:
Expand Down Expand Up @@ -353,6 +356,15 @@ class Configuration(CrawleeConfiguration):
),
] = None

@classmethod
def get_global_configuration(cls) -> Configuration:
    """Retrieve the global instance of the configuration.

    Mostly for the backwards compatibility. It is recommended to use the
    `service_locator.get_configuration()` instead.
    """
    return cls()


# Monkey-patch the base class so that it works with the extended configuration
CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore[method-assign]
6 changes: 3 additions & 3 deletions src/apify/_proxy_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,9 @@ def __init__(
password: str | None = None,
groups: list[str] | None = None,
country_code: str | None = None,
proxy_urls: list[str] | None = None,
proxy_urls: list[str | None] | None = None,
new_url_function: _NewUrlFunction | None = None,
tiered_proxy_urls: list[list[str]] | None = None,
tiered_proxy_urls: list[list[str | None]] | None = None,
_actor_config: Configuration | None = None,
_apify_client: ApifyClientAsync | None = None,
) -> None:
Expand Down Expand Up @@ -148,7 +148,7 @@ def __init__(
' "groups" or "country_code".'
)

if proxy_urls and any('apify.com' in url for url in proxy_urls):
if proxy_urls and any('apify.com' in (url or '') for url in proxy_urls):
logger.warning(
'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties '
'instead of `proxy_urls`.\n'
Expand Down
14 changes: 12 additions & 2 deletions src/apify/apify_storage_client/_apify_storage_client.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from typing_extensions import override

from apify_client import ApifyClientAsync
from crawlee._utils.crypto import crypto_random_object_id
from crawlee.base_storage_client import BaseStorageClient
from crawlee.storage_clients import BaseStorageClient

from apify._configuration import Configuration
from apify._utils import docs_group
from apify.apify_storage_client._dataset_client import DatasetClient
from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
Expand All @@ -13,6 +16,9 @@
from apify.apify_storage_client._request_queue_client import RequestQueueClient
from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient

if TYPE_CHECKING:
from apify._configuration import Configuration


@docs_group('Classes')
class ApifyStorageClient(BaseStorageClient):
Expand All @@ -29,6 +35,10 @@ def __init__(self, *, configuration: Configuration) -> None:
)
self._configuration = configuration

@classmethod
def from_config(cls, config: Configuration) -> ApifyStorageClient:
    """Create a new storage client instance from the given configuration.

    Args:
        config: The Actor configuration used to set up the underlying
            Apify API client.
    """
    return cls(configuration=config)

@override
def dataset(self, id: str) -> DatasetClient:
return DatasetClient(self._apify_client.dataset(id))
Expand Down
3 changes: 2 additions & 1 deletion src/apify/apify_storage_client/_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

from typing_extensions import override

from crawlee.base_storage_client import BaseDatasetClient, DatasetItemsListPage, DatasetMetadata
from crawlee.storage_clients._base import BaseDatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

if TYPE_CHECKING:
from collections.abc import AsyncIterator
Expand Down
3 changes: 2 additions & 1 deletion src/apify/apify_storage_client/_dataset_collection_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

from typing_extensions import override

from crawlee.base_storage_client import BaseDatasetCollectionClient, DatasetListPage, DatasetMetadata
from crawlee.storage_clients._base import BaseDatasetCollectionClient
from crawlee.storage_clients.models import DatasetListPage, DatasetMetadata

if TYPE_CHECKING:
from apify_client.clients import DatasetCollectionClientAsync
Expand Down
8 changes: 2 additions & 6 deletions src/apify/apify_storage_client/_key_value_store_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,8 @@

from typing_extensions import override

from crawlee.base_storage_client import (
BaseKeyValueStoreClient,
KeyValueStoreListKeysPage,
KeyValueStoreMetadata,
KeyValueStoreRecord,
)
from crawlee.storage_clients._base import BaseKeyValueStoreClient
from crawlee.storage_clients.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord

if TYPE_CHECKING:
from collections.abc import AsyncIterator
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

from typing_extensions import override

from crawlee.base_storage_client import BaseKeyValueStoreCollectionClient, KeyValueStoreListPage, KeyValueStoreMetadata
from crawlee.storage_clients._base import BaseKeyValueStoreCollectionClient
from crawlee.storage_clients.models import KeyValueStoreListPage, KeyValueStoreMetadata

if TYPE_CHECKING:
from apify_client.clients import KeyValueStoreCollectionClientAsync
Expand Down
24 changes: 2 additions & 22 deletions src/apify/apify_storage_client/_request_queue_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from typing_extensions import override

from crawlee import Request
from crawlee.base_storage_client import (
BaseRequestQueueClient,
from crawlee.storage_clients._base import BaseRequestQueueClient
from crawlee.storage_clients.models import (
BatchRequestsOperationResponse,
ProcessedRequest,
ProlongRequestLockResponse,
Expand Down Expand Up @@ -80,10 +80,6 @@ async def add_request(
by_alias=True,
exclude={
'id',
'json_',
'order_no',
'query_params',
'data',
},
),
forefront=forefront,
Expand All @@ -107,12 +103,6 @@ async def update_request(
| await self._client.update_request(
request=request.model_dump(
by_alias=True,
exclude={
'json_',
'order_no',
'query_params',
'data',
},
),
forefront=forefront,
)
Expand Down Expand Up @@ -164,10 +154,6 @@ async def batch_add_requests(
by_alias=True,
exclude={
'id',
'json_',
'order_no',
'query_params',
'data',
},
)
for r in requests
Expand All @@ -183,12 +169,6 @@ async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsO
requests=[
r.model_dump(
by_alias=True,
exclude={
'json_',
'order_no',
'query_params',
'data',
},
)
for r in requests
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

from typing_extensions import override

from crawlee.base_storage_client import BaseRequestQueueCollectionClient, RequestQueueListPage, RequestQueueMetadata
from crawlee.storage_clients._base import BaseRequestQueueCollectionClient
from crawlee.storage_clients.models import RequestQueueListPage, RequestQueueMetadata

if TYPE_CHECKING:
from apify_client.clients import RequestQueueCollectionClientAsync
Expand Down
Loading
Loading