diff --git a/.github/workflows/check_pr_title.yaml b/.github/workflows/check_pr_title.yaml
index 759d684e7d..6970d93cfe 100644
--- a/.github/workflows/check_pr_title.yaml
+++ b/.github/workflows/check_pr_title.yaml
@@ -2,11 +2,11 @@ name: Check PR title
 
 on:
   pull_request_target:
-    types: [ opened, edited, synchronize ]
+    types: [opened, edited, synchronize]
 
 jobs:
   check_pr_title:
-    name: 'Check PR title'
+    name: Check PR title
     runs-on: ubuntu-latest
     steps:
       - uses: amannn/action-semantic-pull-request@v5.5.3
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
new file mode 100644
index 0000000000..f33ba21dce
--- /dev/null
+++ b/.github/workflows/docs.yaml
@@ -0,0 +1,63 @@
+name: docs
+
+on:
+  push:
+    branches:
+      - master
+  workflow_dispatch:
+
+jobs:
+  build:
+    environment:
+      name: github-pages
+    permissions:
+      contents: write
+      pages: write
+      id-token: write
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Use Node.js 20
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Enable corepack
+        run: |
+          corepack enable
+          corepack prepare yarn@stable --activate
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.12
+
+      - name: Install Python dependencies
+        run: make install-dev
+
+      - name: Build generated API reference
+        run: make build-api-reference
+
+      - name: Build & deploy docs
+        run: |
+          # go to website dir
+          cd website
+          # install website deps
+          yarn
+          # build the docs
+          yarn build
+        env:
+          APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }}
+
+      - name: Set up GitHub Pages
+        uses: actions/configure-pages@v5
+
+      - name: Upload GitHub Pages artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: ./website/build
+
+      - name: Deploy artifact to GitHub Pages
+        uses: actions/deploy-pages@v4
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
deleted file mode 100644
index cb2e0f4cce..0000000000
--- a/.github/workflows/docs.yml
+++ /dev/null
@@ -1,63 +0,0 @@
-name: docs
-
-on:
-  push:
-    branches:
-      - master
-  workflow_dispatch:
-
-jobs:
-  build:
-    environment:
-      name: github-pages
-    permissions:
-      contents: write
-      pages: write
-      id-token: write
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Use Node.js 20
-        uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Enable corepack
-        run: |
-          corepack enable
-          corepack prepare yarn@stable --activate
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.12
-
-      - name: Install Python dependencies
-        run: make install-dev
-
-      - name: Build generated API reference
-        run: make build-api-reference
-
-      - name: Build & deploy docs
-        run: |
-          # go to website dir
-          cd website
-          # install website deps
-          yarn
-          # build the docs
-          yarn build
-        env:
-          APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }}
-
-      - name: Set up GitHub Pages
-        uses: actions/configure-pages@v5
-
-      - name: Upload GitHub Pages artifact
-        uses: actions/upload-pages-artifact@v3
-        with:
-          path: ./website/build
-
-      - name: Deploy artifact to GitHub Pages
-        uses: actions/deploy-pages@v4
diff --git a/.github/workflows/run_release.yaml b/.github/workflows/run_release.yaml
index 19acb76766..218974b969 100644
--- a/.github/workflows/run_release.yaml
+++ b/.github/workflows/run_release.yaml
@@ -23,7 +23,7 @@ on:
         description: The custom version to bump to (only for "custom" type)
         required: false
        type: string
-        default: ''
+        default: ""
 
 jobs:
   run_code_checks:
@@ -114,7 +114,7 @@ jobs:
         with:
           author_name: Apify Release Bot
          author_email: noreply@apify.com
-          message: 'chore(release): Update changelog and package version [skip ci]'
+          message: "chore(release): Update changelog and package version [skip ci]"
 
   create_github_release:
     name: Create github release
diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py
index aa639a4aa9..fb326f0630 100644
--- a/src/crawlee/_request.py
+++ b/src/crawlee/_request.py
@@ -37,14 +37,14 @@ class BaseRequestData(BaseModel):
     """URL of the web page to crawl"""
 
     unique_key: Annotated[str, Field(alias='uniqueKey')]
-    """A unique key identifying the request. Two requests with the same `uniqueKey` are considered as pointing to the
-    same URL.
+    """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
+    to the same URL.
 
-    If `uniqueKey` is not provided, then it is automatically generated by normalizing the URL.
-    For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `uniqueKey`
+    If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
+    For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
     of `http://www.example.com/something`.
 
-    Pass an arbitrary non-empty text value to the `uniqueKey` property
+    Pass an arbitrary non-empty text value to the `unique_key` property
     to override the default behavior and specify which URLs shall be considered equal.
     """
 
diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py
index 0a02258e05..a9d623554b 100644
--- a/src/crawlee/storages/_dataset.py
+++ b/src/crawlee/storages/_dataset.py
@@ -75,22 +75,32 @@ class ExportToKwargs(TypedDict):
 
 
 class Dataset(BaseStorage):
-    """Represents an append-only structured storage, ideal for tabular data akin to database tables.
+    """Represents an append-only structured storage, ideal for tabular data similar to database tables.
 
-    Represents a structured data store similar to a table, where each object (row) has consistent attributes (columns).
-    Datasets operate on an append-only basis, allowing for the addition of new records without the modification or
-    removal of existing ones. This class is typically used for storing crawling results.
+    The `Dataset` class is designed to store structured data, where each entry (row) maintains consistent attributes
+    (columns) across the dataset. It operates in an append-only mode, allowing new records to be added, but not
+    modified or deleted. This makes it particularly useful for storing results from web crawling operations.
 
-    Data can be stored locally or in the cloud, with local storage paths formatted as:
-    `{CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json`. Here, `{DATASET_ID}` is either "default" or
-    a specific dataset ID, and `{INDEX}` represents the zero-based index of the item in the dataset.
+    Data can be stored either locally or in the cloud, depending on the setup of the underlying storage client.
+    By default, a `MemoryStorageClient` is used, but it can be changed to a different one.
 
-    To open a dataset, use the `open` class method with an `id`, `name`, or `config`. If unspecified, the default
-    dataset for the current crawler run is used. Opening a non-existent dataset by `id` raises an error, while
-    by `name`, it is created.
+    By default, data is stored using the following path structure:
+    ```
+    {CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json
+    ```
+    - `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.
+    - `{DATASET_ID}`: Specifies the dataset, either "default" or a custom dataset ID.
+    - `{INDEX}`: Represents the zero-based index of the record within the dataset.
+
+    To open a dataset, use the `open` class method by specifying an `id`, `name`, or `configuration`. If none are
+    provided, the default dataset for the current crawler run is used. Attempting to open a dataset by `id` that does
+    not exist will raise an error; however, if accessed by `name`, the dataset will be created if it doesn't already
+    exist.
 
     Usage:
-        dataset = await Dataset.open(id='my_dataset_id')
+    ```python
+    dataset = await Dataset.open(name='my_dataset')
+    ```
     """
 
     _MAX_PAYLOAD_SIZE = ByteSize.from_mb(9)
diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py
index 27f629965d..b012ea74f7 100644
--- a/src/crawlee/storages/_key_value_store.py
+++ b/src/crawlee/storages/_key_value_store.py
@@ -15,24 +15,34 @@
 
 
 class KeyValueStore(BaseStorage):
-    """Represents a key-value based storage for reading data records or files.
-
-    Each record is identified by a unique key and associated with a MIME content type. This class is used within
-    crawler runs to store inputs and outputs, typically in JSON format, but supports other types as well.
-
-    The data can be stored on a local filesystem or in the cloud, determined by the `CRAWLEE_STORAGE_DIR`
-    environment variable.
-
-    By default, data is stored in `{CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{INDEX}.{EXT}`, where
-    `{STORE_ID}` is either "default" or specified by `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`, `{KEY}` is the record key,
-    and `{EXT}` is the MIME type.
-
-    To open a key-value store, use the class method `open`, providing either an `id` or `name` along with optional
-    `config`. If neither is provided, the default store for the crawler run is used. Opening a non-existent store by
-    `id` raises an error, while a non-existent store by `name` is created.
+    """Represents a key-value based storage for reading and writing data records or files.
+
+    Each data record is identified by a unique key and associated with a specific MIME content type. This class is
+    commonly used in crawler runs to store inputs and outputs, typically in JSON format, but it also supports other
+    content types.
+
+    Data can be stored either locally or in the cloud, depending on the setup of the underlying storage client.
+    By default, a `MemoryStorageClient` is used, but it can be changed to a different one.
+
+    By default, data is stored using the following path structure:
+    ```
+    {CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT}
+    ```
+    - `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.
+    - `{STORE_ID}`: The identifier for the key-value store, either "default" or as specified by
+      `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`.
+    - `{KEY}`: The unique key for the record.
+    - `{EXT}`: The file extension corresponding to the MIME type of the content.
+
+    To open a key-value store, use the `open` class method, providing an `id`, `name`, or optional `configuration`.
+    If none are specified, the default store for the current crawler run is used. Attempting to open a store by `id`
+    that does not exist will raise an error; however, if accessed by `name`, the store will be created if it does not
+    already exist.
     Usage:
-        kvs = await KeyValueStore.open(id='my_kvs_id')
+    ```python
+    kvs = await KeyValueStore.open(name='my_kvs')
+    ```
     """
 
     def __init__(
diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py
index 58fd1c8315..ebfc55c2d9 100644
--- a/src/crawlee/storages/_request_queue.py
+++ b/src/crawlee/storages/_request_queue.py
@@ -63,23 +63,32 @@ class CachedRequest(TypedDict):
 
 
 class RequestQueue(BaseStorage, RequestProvider):
-    """Represents a queue storage for HTTP requests to crawl.
+    """Represents a queue storage for managing HTTP requests in web crawling operations.
 
-    Manages a queue of requests with unique URLs for structured deep web crawling with support for both breadth-first
-    and depth-first orders. This queue is designed for crawling websites by starting with initial URLs and recursively
-    following links. Each URL is uniquely identified by a `unique_key` field, which can be overridden to add the same
-    URL multiple times under different keys.
+    The `RequestQueue` class handles a queue of HTTP requests, each identified by a unique URL, to facilitate
+    structured web crawling. It supports both breadth-first and depth-first crawling strategies, allowing for
+    recursive crawling starting from an initial set of URLs. Each URL in the queue is uniquely identified by
+    a `unique_key`, which can be customized to allow the same URL to be added multiple times under different keys.
 
-    Local storage path (if `CRAWLEE_STORAGE_DIR` is set):
-    `{CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json`, where `{QUEUE_ID}` is the request
-    queue's ID (default or specified) and `{REQUEST_ID}` is the request's ID.
+    Data can be stored either locally or in the cloud, depending on the setup of the underlying storage client.
+    By default, a `MemoryStorageClient` is used, but it can be changed to a different one.
 
-    Usage includes creating or opening existing queues by ID or name, with named queues retained indefinitely and
-    unnamed queues expiring after 7 days unless specified otherwise. Supports mutable operations—URLs can be added
-    and deleted.
+    By default, data is stored using the following path structure:
+    ```
+    {CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json
+    ```
+    - `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.
+    - `{QUEUE_ID}`: The identifier for the request queue, either "default" or as specified.
+    - `{REQUEST_ID}`: The unique identifier for each request in the queue.
+
+    The `RequestQueue` supports both creating new queues and opening existing ones by `id` or `name`. Named queues
+    persist indefinitely, while unnamed queues expire after 7 days unless specified otherwise. The queue supports
+    mutable operations, allowing URLs to be added and removed as needed.
 
     Usage:
-        rq = await RequestQueue.open(id='my_rq_id')
+    ```python
+    rq = await RequestQueue.open(name='my_rq')
+    ```
     """
 
     _MAX_CACHED_REQUESTS = 1_000_000
@@ -176,9 +185,9 @@ async def add_request(
     ) -> ProcessedRequest:
         """Adds a request to the `RequestQueue` while managing deduplication and positioning within the queue.
 
-        The deduplication of requests relies on the `uniqueKey` field within the request dictionary. If `uniqueKey`
+        The deduplication of requests relies on the `unique_key` field within the request dictionary. If `unique_key`
         exists, it remains unchanged; if it does not, it is generated based on the request's `url`, `method`,
-        and `payload` fields. The generation of `uniqueKey` can be influenced by the `keep_url_fragment` and
+        and `payload` fields. The generation of `unique_key` can be influenced by the `keep_url_fragment` and
         `use_extended_unique_key` flags, which dictate whether to include the URL fragment and the request's method
         and payload, respectively, in its computation.
@@ -188,17 +197,15 @@
 
         Args:
             request: The request object to be added to the queue. Must include at least the `url` key.
-                Optionaly it can include the `method`, `payload` and `uniqueKey` keys.
-
+                Optionally it can include the `method`, `payload` and `unique_key` keys.
             forefront: If True, adds the request to the forefront of the queue; otherwise, adds it to the end.
-
             keep_url_fragment: Determines whether the URL fragment (the part of the URL after '#') should be retained
-                in the unique key computation.
+                in the `unique_key` computation.
+            use_extended_unique_key: Determines whether to use an extended `unique_key`, incorporating the request's
+                method and payload into the `unique_key` computation.
-            use_extended_unique_key: Determines whether to use an extended unique key, incorporating the request's
-                method and payload into the unique key computation.
-
-        Returns: Information about the processed request.
+        Returns:
+            Information about the processed request.
         """
         request = self._transform_request(request)
         self._last_activity = datetime.now(timezone.utc)
@@ -340,9 +347,9 @@ async def fetch_next_request(self) -> Request | None:
 
         # 1)
         # Queue head index is ahead of the main table and the request is not present in the main table yet
-        # (i.e. getRequest() returned null). In this case, keep the request marked as in progress for a short while,
-        # so that isFinished() doesn't return true and _ensureHeadIsNonEmpty() doesn't not load the request into
-        # the queueHeadDict straight again. After the interval expires, fetchNextRequest() will try to fetch this
+        # (i.e. get_request() returned null). In this case, keep the request marked as in progress for a short while,
+        # so that is_finished() doesn't return true and _ensure_head_is_non_empty() doesn't load the request into
+        # the queue_head_dict straight away again. After the interval expires, fetch_next_request() will try to fetch this
         # request again, until it eventually appears in the main table.
         if request is None:
             logger.debug(
@@ -357,9 +364,9 @@
 
         # 2)
         # Queue head index is behind the main table and the underlying request was already handled (by some other
-        # client, since we keep the track of handled requests in recentlyHandled dictionary). We just add the request
-        # to the recentlyHandled dictionary so that next call to _ensureHeadIsNonEmpty() will not put the request again
-        # to queueHeadDict.
+        # client, since we keep track of handled requests in the recently_handled dictionary). We just add the request
+        # to the recently_handled dictionary so that the next call to _ensure_head_is_non_empty() will not put the
+        # request into queue_head_dict again.
         if request.handled_at is not None:
             logger.debug(
                 'Request fetched from the beginning of queue was already handled',
@@ -410,7 +417,7 @@
     ) -> ProcessedRequest | None:
         """Reclaim a failed request back to the queue.
 
-        The request will be returned for processing later again by another call to `RequestQueue.fetchNextRequest`.
+        The request will be returned for processing again later by another call to `RequestQueue.fetch_next_request`.
 
         Args:
             request: The request to return to the queue.
@@ -425,7 +432,7 @@ async def reclaim_request(
             logger.debug(f'Cannot reclaim request (ID: {request.id}), because it is not in progress!')
             return None
 
-        # TODO: If request hasn't been changed since the last getRequest(), we don't need to call updateRequest()
+        # TODO: If request hasn't been changed since the last get_request(), we don't need to call update_request()
         # and thus improve performance.
         # https://github.com/apify/apify-sdk-python/issues/143
         processed_request = await self._resource_client.update_request(request, forefront=forefront)
@@ -450,7 +457,7 @@ async def is_empty(self) -> bool:
         """Check whether the queue is empty.
 
         Returns:
-            bool: `True` if the next call to `RequestQueue.fetchNextRequest` would return `None`, otherwise `False`.
+            bool: `True` if the next call to `RequestQueue.fetch_next_request` would return `None`, otherwise `False`.
         """
         await self._ensure_head_is_non_empty()
         return len(self._queue_head_dict) == 0
@@ -458,9 +465,8 @@ async def is_empty(self) -> bool:
     async def is_finished(self) -> bool:
         """Check whether the queue is finished.
 
-        Due to the nature of distributed storage used by the queue,
-        the function might occasionally return a false negative,
-        but it will never return a false positive.
+        Due to the nature of distributed storage used by the queue, the function might occasionally return a false
+        negative, but it will never return a false positive.
 
         Returns:
             bool: `True` if all requests were already handled and there are no more left. `False` otherwise.
@@ -617,7 +623,7 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None:
            hydrated_request = await self.get_request(request_id)
 
            # Queue head index is ahead of the main table and the request is not present in the main table yet
-            # (i.e. getRequest() returned null).
+            # (i.e. get_request() returned null).
            if not hydrated_request:
                # Remove the lock from the request for now, so that it can be picked up later
                # This may/may not succeed, but that's fine
@@ -661,7 +667,7 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None:
            cached_entry['hydrated'] = hydrated_request
 
            # Queue head index is ahead of the main table and the request is not present in the main table yet
-            # (i.e. getRequest() returned null).
+            # (i.e. get_request() returned null).
            if not hydrated_request:
                # Remove the lock from the request for now, so that it can be picked up later
                # This may/may not succeed, but that's fine
diff --git a/website/transformDocs.js b/website/transformDocs.js
index 08fd738fab..5bc9b1bafc 100644
--- a/website/transformDocs.js
+++ b/website/transformDocs.js
@@ -79,7 +79,12 @@ const groupSort = (g1, g2) => {
 function getGroupName(object) {
     const groupPredicates = {
         'Errors': (x) => x.name.toLowerCase().includes('error'),
-        'Main Classes': (x) => ['Dataset', 'KeyValueStore', 'RequestQueue'].includes(x.name) || x.name.endsWith('Crawler'),
+        'Main Classes': (x) => [
+            'BasicCrawler', 'HttpCrawler', 'BeautifulSoupCrawler', 'ParselCrawler', 'PlaywrightCrawler', 'Dataset',
+            'KeyValueStore', 'RequestQueue', 'MemoryStorageClient', 'HttpxHttpClient', 'CurlImpersonateHttpClient',
+            'Configuration', 'EventManager', 'LocalEventManager', 'Request', 'Session', 'SessionPool', 'BrowserPool',
+            'PlaywrightBrowserController', 'PlaywrightBrowserPlugin', 'Statistics',
+        ].includes(x.name),
         'Helper Classes': (x) => x.kindString === 'Class',
         'Methods': (x) => x.kindString === 'Method',
         'Constructors': (x) => x.kindString === 'Constructor',
@@ -178,12 +183,12 @@ function convertObject(obj, parent, module) {
            }
 
            let typedocType = inferTypedocType(member.datatype);
-            
+
            if (member.decorations?.some(d => ['property', 'dualproperty'].includes(d.name))) {
                typedocKind = TYPEDOC_KINDS['data'];
                typedocType = inferTypedocType(member.return_type ?? member.datatype);
            }
-            
+
            if(parent.kindString === 'Enumeration') {
                typedocKind = TYPEDOC_KINDS['enumValue'];
                typedocType = {
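For orientation, the snippet below sketches how the storage classes documented in this diff fit together in a crawl loop. It is a minimal, hypothetical example: it relies only on calls that appear in the docstrings above (`RequestQueue.open`, `add_request`, `fetch_next_request`, `reclaim_request`, `is_finished`, `Dataset.open`, `KeyValueStore.open`); the `crawlee.storages` import path and the `push_data`, `set_value` and `mark_request_as_handled` calls are assumptions not shown in this diff.

```python
import asyncio

# Assumed import path; the diff only shows the module files under src/crawlee/storages/.
from crawlee.storages import Dataset, KeyValueStore, RequestQueue


async def main() -> None:
    # Opening by `name` creates the storage if it does not exist yet (per the updated docstrings).
    rq = await RequestQueue.open(name='my_rq')
    dataset = await Dataset.open(name='my_dataset')
    kvs = await KeyValueStore.open(name='my_kvs')

    # Deduplication relies on `unique_key`; when omitted, it is generated by normalizing the URL.
    await rq.add_request({'url': 'https://crawlee.dev'})

    while not await rq.is_finished():
        request = await rq.fetch_next_request()
        if request is None:
            # The queue head may briefly run ahead of the main table; try again shortly.
            await asyncio.sleep(1)
            continue
        try:
            # ... fetch and parse `request.url` here ...
            await dataset.push_data({'url': request.url})  # assumed API, not shown in this diff
            await kvs.set_value('last-url', request.url)  # assumed API, not shown in this diff
            await rq.mark_request_as_handled(request)  # assumed API, not shown in this diff
        except Exception:
            # Return the failed request so that a later fetch_next_request() call can retry it.
            await rq.reclaim_request(request)


if __name__ == '__main__':
    asyncio.run(main())
```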