docs: improve docstrings of storages #465

Merged: 3 commits, Aug 27, 2024
Changes from 2 commits
10 changes: 5 additions & 5 deletions src/crawlee/_request.py
@@ -37,14 +37,14 @@ class BaseRequestData(BaseModel):
"""URL of the web page to crawl"""

unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `uniqueKey` are considered as pointing to the
same URL.
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.

If `uniqueKey` is not provided, then it is automatically generated by normalizing the URL.
For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `uniqueKey`
If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
of `http://www.example.com/something`.

Pass an arbitrary non-empty text value to the `uniqueKey` property
Pass an arbitrary non-empty text value to the `unique_key` property
to override the default behavior and specify which URLs shall be considered equal.
"""

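For context on the normalization described in this docstring, a minimal sketch, assuming `Request.from_url` accepts a `unique_key` override as the docstring implies (adjust the import path to your version):

```python
from crawlee import Request  # import path may differ between versions

# These two spellings normalize to the same unique_key, so a RequestQueue
# would treat them as the same request and enqueue only one of them.
r1 = Request.from_url('HTTP://www.EXAMPLE.com/something/')
r2 = Request.from_url('http://www.example.com/something')
assert r1.unique_key == r2.unique_key

# An explicit unique_key bypasses normalization, so the same URL can be
# enqueued again under a different key.
r3 = Request.from_url('http://www.example.com/something', unique_key='variant-b')
assert r3.unique_key == 'variant-b'
```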
32 changes: 21 additions & 11 deletions src/crawlee/storages/_dataset.py
@@ -75,22 +75,32 @@ class ExportToKwargs(TypedDict):


class Dataset(BaseStorage):
"""Represents an append-only structured storage, ideal for tabular data akin to database tables.
"""Represents an append-only structured storage, ideal for tabular data similar to database tables.

Represents a structured data store similar to a table, where each object (row) has consistent attributes (columns).
Datasets operate on an append-only basis, allowing for the addition of new records without the modification or
removal of existing ones. This class is typically used for storing crawling results.
The `Dataset` class is designed to store structured data, where each entry (row) maintains consistent attributes
(columns) across the dataset. It operates in an append-only mode, allowing new records to be added, but not
modified or deleted. This makes it particularly useful for storing results from web crawling operations.

Data can be stored locally or in the cloud, with local storage paths formatted as:
`{CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json`. Here, `{DATASET_ID}` is either "default" or
a specific dataset ID, and `{INDEX}` represents the zero-based index of the item in the dataset.
Data can be stored either locally or in the cloud, depending on the setup of the underlying storage client.
By default, a `MemoryStorageClient` is used, but it can be swapped for a different implementation.

To open a dataset, use the `open` class method with an `id`, `name`, or `config`. If unspecified, the default
dataset for the current crawler run is used. Opening a non-existent dataset by `id` raises an error, while
by `name`, it is created.
By default, data is stored using the following path structure:
```
{CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json
```
- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.
- `{DATASET_ID}`: Specifies the dataset, either "default" or a custom dataset ID.
- `{INDEX}`: Represents the zero-based index of the record within the dataset.

To open a dataset, use the `open` class method by specifying an `id`, `name`, or `configuration`. If none are
provided, the default dataset for the current crawler run is used. Attempting to open a dataset by `id` that does
not exist will raise an error; however, if accessed by `name`, the dataset will be created if it doesn't already
exist.

Usage:
dataset = await Dataset.open(id='my_dataset_id')
```python
dataset = await Dataset.open(name='my_dataset')
```
"""

_MAX_PAYLOAD_SIZE = ByteSize.from_mb(9)
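A minimal sketch of the append-only flow described in the `Dataset` docstring above, assuming the `push_data` and `get_data` methods of the current API (the storage backend defaults to `MemoryStorageClient`):

```python
import asyncio

from crawlee.storages import Dataset  # import path may differ between versions


async def main() -> None:
    # Opening by name creates the dataset if it does not exist yet.
    dataset = await Dataset.open(name='my_dataset')

    # Records can only be appended; existing rows are never modified or removed.
    await dataset.push_data({'url': 'https://example.com', 'title': 'Example'})
    await dataset.push_data([{'url': f'https://example.com/{i}'} for i in range(3)])

    # Read the stored rows back as a paginated listing.
    page = await dataset.get_data()
    print(page.items)


asyncio.run(main())
```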
42 changes: 26 additions & 16 deletions src/crawlee/storages/_key_value_store.py
@@ -15,24 +15,34 @@


class KeyValueStore(BaseStorage):
"""Represents a key-value based storage for reading data records or files.

Each record is identified by a unique key and associated with a MIME content type. This class is used within
crawler runs to store inputs and outputs, typically in JSON format, but supports other types as well.

The data can be stored on a local filesystem or in the cloud, determined by the `CRAWLEE_STORAGE_DIR`
environment variable.

By default, data is stored in `{CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{INDEX}.{EXT}`, where
`{STORE_ID}` is either "default" or specified by `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`, `{KEY}` is the record key,
and `{EXT}` is the MIME type.

To open a key-value store, use the class method `open`, providing either an `id` or `name` along with optional
`config`. If neither is provided, the default store for the crawler run is used. Opening a non-existent store by
`id` raises an error, while a non-existent store by `name` is created.
"""Represents a key-value based storage for reading and writing data records or files.

Each data record is identified by a unique key and associated with a specific MIME content type. This class is
commonly used in crawler runs to store inputs and outputs, typically in JSON format, but it also supports other
content types.

Data can be stored either locally or in the cloud, depending on the setup of the underlying storage client.
By default, a `MemoryStorageClient` is used, but it can be swapped for a different implementation.

By default, data is stored using the following path structure:
```
{CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT}
```
- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.
- `{STORE_ID}`: The identifier for the key-value store, either "default" or as specified by
`CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`.
- `{KEY}`: The unique key for the record.
- `{EXT}`: The file extension corresponding to the MIME type of the content.

To open a key-value store, use the `open` class method, providing an `id`, `name`, or optional `configuration`.
If none are specified, the default store for the current crawler run is used. Attempting to open a store by `id`
that does not exist will raise an error; however, if accessed by `name`, the store will be created if it does not
already exist.

Usage:
kvs = await KeyValueStore.open(id='my_kvs_id')
```python
kvs = await KeyValueStore.open(name='my_kvs')
```
"""

def __init__(
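A short sketch of the record-oriented access described in the `KeyValueStore` docstring above, assuming the `set_value`/`get_value` methods of the current API:

```python
import asyncio

from crawlee.storages import KeyValueStore  # import path may differ between versions


async def main() -> None:
    # Opening by name creates the store if it does not exist yet.
    kvs = await KeyValueStore.open(name='my_kvs')

    # JSON-serializable values are stored as JSON records under the given key.
    await kvs.set_value('crawl-config', {'max_depth': 2, 'locale': 'en'})

    # Binary content can be stored with an explicit MIME type, which also
    # determines the file extension used by the local storage client.
    await kvs.set_value('screenshot', b'\x89PNG\r\n...', content_type='image/png')

    config = await kvs.get_value('crawl-config')
    print(config)


asyncio.run(main())
```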
78 changes: 42 additions & 36 deletions src/crawlee/storages/_request_queue.py
@@ -63,23 +63,32 @@ class CachedRequest(TypedDict):


class RequestQueue(BaseStorage, RequestProvider):
"""Represents a queue storage for HTTP requests to crawl.
"""Represents a queue storage for managing HTTP requests in web crawling operations.

Manages a queue of requests with unique URLs for structured deep web crawling with support for both breadth-first
and depth-first orders. This queue is designed for crawling websites by starting with initial URLs and recursively
following links. Each URL is uniquely identified by a `unique_key` field, which can be overridden to add the same
URL multiple times under different keys.
The `RequestQueue` class handles a queue of HTTP requests, each identified by a unique URL, to facilitate structured
web crawling. It supports both breadth-first and depth-first crawling strategies, allowing for recursive crawling
starting from an initial set of URLs. Each URL in the queue is uniquely identified by a `unique_key`, which can be
customized to allow the same URL to be added multiple times under different keys.

Local storage path (if `CRAWLEE_STORAGE_DIR` is set):
`{CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json`, where `{QUEUE_ID}` is the request
queue's ID (default or specified) and `{REQUEST_ID}` is the request's ID.
Data can be stored either locally or in the cloud, depending on the setup of the underlying storage client.
By default, a `MemoryStorageClient` is used, but it can be swapped for a different implementation.

Usage includes creating or opening existing queues by ID or name, with named queues retained indefinitely and
unnamed queues expiring after 7 days unless specified otherwise. Supports mutable operations—URLs can be added
and deleted.
By default, data is stored using the following path structure:
```
{CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json
```
- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.
- `{QUEUE_ID}`: The identifier for the request queue, either "default" or as specified.
- `{REQUEST_ID}`: The unique identifier for each request in the queue.

The `RequestQueue` supports both creating new queues and opening existing ones by `id` or `name`. Named queues
persist indefinitely, while unnamed queues expire after 7 days unless specified otherwise. The queue supports
mutable operations, allowing URLs to be added and removed as needed.

Usage:
rq = await RequestQueue.open(id='my_rq_id')
```python
rq = await RequestQueue.open(name='my_rq')
```
"""

_MAX_CACHED_REQUESTS = 1_000_000
@@ -176,9 +185,9 @@ async def add_request(
) -> ProcessedRequest:
"""Adds a request to the `RequestQueue` while managing deduplication and positioning within the queue.

The deduplication of requests relies on the `uniqueKey` field within the request dictionary. If `uniqueKey`
The deduplication of requests relies on the `unique_key` field within the request dictionary. If `unique_key`
exists, it remains unchanged; if it does not, it is generated based on the request's `url`, `method`,
and `payload` fields. The generation of `uniqueKey` can be influenced by the `keep_url_fragment` and
and `payload` fields. The generation of `unique_key` can be influenced by the `keep_url_fragment` and
`use_extended_unique_key` flags, which dictate whether to include the URL fragment and the request's method
and payload, respectively, in its computation.

@@ -188,17 +197,15 @@

Args:
request: The request object to be added to the queue. Must include at least the `url` key.
Optionaly it can include the `method`, `payload` and `uniqueKey` keys.

Optionally, it can include the `method`, `payload` and `unique_key` keys.
forefront: If True, adds the request to the forefront of the queue; otherwise, adds it to the end.

keep_url_fragment: Determines whether the URL fragment (the part of the URL after '#') should be retained
in the unique key computation.
in the `unique_key` computation.
use_extended_unique_key: Determines whether to use an extended `unique_key`, incorporating the request's
method and payload into the `unique_key` computation.

use_extended_unique_key: Determines whether to use an extended unique key, incorporating the request's
method and payload into the unique key computation.

Returns: Information about the processed request.
Returns:
Information about the processed request.
"""
request = self._transform_request(request)
self._last_activity = datetime.now(timezone.utc)
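To make the deduplication flags concrete, a rough sketch, assuming `add_request` accepts plain URL strings (they go through `_transform_request`) and that the `keep_url_fragment` and `use_extended_unique_key` parameters behave as documented above:

```python
import asyncio

from crawlee.storages import RequestQueue  # import path may differ between versions


async def main() -> None:
    rq = await RequestQueue.open(name='my_rq')

    # Fragments are dropped from the unique_key by default, so the second call
    # is reported as already present and nothing new is enqueued.
    await rq.add_request('https://example.com/page#intro')
    await rq.add_request('https://example.com/page#details')

    # Keeping the fragment makes the two variants distinct requests.
    await rq.add_request('https://example.com/page#details', keep_url_fragment=True)

    # An extended unique_key additionally folds the request's method and payload
    # into the computation (here just the defaults: GET with no payload).
    await rq.add_request('https://example.com/form', use_extended_unique_key=True)


asyncio.run(main())
```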
@@ -340,9 +347,9 @@ async def fetch_next_request(self) -> Request | None:

# 1)
# Queue head index is ahead of the main table and the request is not present in the main table yet
# (i.e. getRequest() returned null). In this case, keep the request marked as in progress for a short while,
# so that isFinished() doesn't return true and _ensureHeadIsNonEmpty() doesn't not load the request into
# the queueHeadDict straight again. After the interval expires, fetchNextRequest() will try to fetch this
# (i.e. get_request() returned null). In this case, keep the request marked as in progress for a short while,
# so that is_finished() doesn't return true and _ensure_head_is_non_empty() doesn't load the request into
# the queue_head_dict straight away again. After the interval expires, fetch_next_request() will try to fetch this
# request again, until it eventually appears in the main table.
if request is None:
logger.debug(
@@ -357,9 +364,9 @@

# 2)
# Queue head index is behind the main table and the underlying request was already handled (by some other
# client, since we keep the track of handled requests in recentlyHandled dictionary). We just add the request
# to the recentlyHandled dictionary so that next call to _ensureHeadIsNonEmpty() will not put the request again
# to queueHeadDict.
# client, since we keep track of handled requests in the recently_handled dictionary). We just add the request
# to the recently_handled dictionary so that the next call to _ensure_head_is_non_empty() will not put the request
# into the queue_head_dict again.
if request.handled_at is not None:
logger.debug(
'Request fetched from the beginning of queue was already handled',
@@ -410,7 +417,7 @@ async def reclaim_request(
) -> ProcessedRequest | None:
"""Reclaim a failed request back to the queue.

The request will be returned for processing later again by another call to `RequestQueue.fetchNextRequest`.
The request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`.

Args:
request: The request to return to the queue.
@@ -425,7 +432,7 @@
logger.debug(f'Cannot reclaim request (ID: {request.id}), because it is not in progress!')
return None

# TODO: If request hasn't been changed since the last getRequest(), we don't need to call updateRequest()
# TODO: If request hasn't been changed since the last get_request(), we don't need to call update_request()
# and thus improve performance.
# https://github.com/apify/apify-sdk-python/issues/143
processed_request = await self._resource_client.update_request(request, forefront=forefront)
@@ -450,17 +457,16 @@
"""Check whether the queue is empty.

Returns:
bool: `True` if the next call to `RequestQueue.fetchNextRequest` would return `None`, otherwise `False`.
bool: `True` if the next call to `RequestQueue.fetch_next_request` would return `None`, otherwise `False`.
"""
await self._ensure_head_is_non_empty()
return len(self._queue_head_dict) == 0

async def is_finished(self) -> bool:
"""Check whether the queue is finished.

Due to the nature of distributed storage used by the queue,
the function might occasionally return a false negative,
but it will never return a false positive.
Due to the nature of distributed storage used by the queue, the function might occasionally return a false
negative, but it will never return a false positive.

Returns:
bool: `True` if all requests were already handled and there are no more left. `False` otherwise.
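The methods above are typically combined into a simple consume loop. A sketch under the assumption that `fetch_next_request`, `mark_request_as_handled` and `reclaim_request` behave as described in this diff, and that `add_request` accepts a plain URL string:

```python
import asyncio

from crawlee.storages import RequestQueue  # import path may differ between versions


async def consume(rq: RequestQueue) -> None:
    # Keep pulling requests until the queue reports that everything was handled.
    while not await rq.is_finished():
        request = await rq.fetch_next_request()
        if request is None:
            # The queue head can be temporarily empty while other requests are
            # still in progress; back off briefly and check again.
            await asyncio.sleep(1)
            continue

        try:
            print('processing', request.url)  # real crawling logic goes here
        except Exception:
            # Return the failed request to the queue so a later
            # fetch_next_request() call can retry it.
            await rq.reclaim_request(request)
        else:
            await rq.mark_request_as_handled(request)


async def main() -> None:
    rq = await RequestQueue.open(name='my_rq')
    await rq.add_request('https://example.com')
    await consume(rq)


asyncio.run(main())
```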
@@ -617,7 +623,7 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None:
hydrated_request = await self.get_request(request_id)

# Queue head index is ahead of the main table and the request is not present in the main table yet
# (i.e. getRequest() returned null).
# (i.e. get_request() returned null).
if not hydrated_request:
# Remove the lock from the request for now, so that it can be picked up later
# This may/may not succeed, but that's fine
@@ -661,7 +667,7 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None:
cached_entry['hydrated'] = hydrated_request

# Queue head index is ahead of the main table and the request is not present in the main table yet
# (i.e. getRequest() returned null).
# (i.e. get_request() returned null).
if not hydrated_request:
# Remove the lock from the request for now, so that it can be picked up later
# This may/may not succeed, but that's fine
11 changes: 8 additions & 3 deletions website/transformDocs.js
@@ -79,7 +79,12 @@ const groupSort = (g1, g2) => {
function getGroupName(object) {
const groupPredicates = {
'Errors': (x) => x.name.toLowerCase().includes('error'),
'Main Classes': (x) => ['Dataset', 'KeyValueStore', 'RequestQueue'].includes(x.name) || x.name.endsWith('Crawler'),
'Main Classes': (x) => [
'BasicCrawler', 'HttpCrawler', 'BeautifulSoupCrawler', 'ParselCrawler', 'PlaywrightCrawler', 'Dataset',
'KeyValueStore', 'RequestQueue', 'MemoryStorageClient', 'HttpxHttpClient', 'CurlImpersonateHttpClient',
'Configuration', 'EventManager', 'LocalEventManager', 'Request', 'Session', 'SessionPool', 'BrowserPool',
'PlaywrightBrowserController', 'PlaywrightBrowserPlugin', 'Statistics',
].includes(x.name),
'Helper Classes': (x) => x.kindString === 'Class',
'Methods': (x) => x.kindString === 'Method',
'Constructors': (x) => x.kindString === 'Constructor',
@@ -178,12 +183,12 @@ function convertObject(obj, parent, module) {
}

let typedocType = inferTypedocType(member.datatype);

if (member.decorations?.some(d => ['property', 'dualproperty'].includes(d.name))) {
typedocKind = TYPEDOC_KINDS['data'];
typedocType = inferTypedocType(member.return_type ?? member.datatype);
}

if(parent.kindString === 'Enumeration') {
typedocKind = TYPEDOC_KINDS['enumValue'];
typedocType = {