Commit bdadd43

Dataset export methods

1 parent e6f1e1f commit bdadd43

File tree

8 files changed: +158 -158 lines

src/crawlee/_cli.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
 cli = typer.Typer(no_args_is_help=True)
 
 template_directory = importlib.resources.files('crawlee') / 'project_template'
-with open(str(template_directory / 'cookiecutter.json')) as f:
+with (template_directory / 'cookiecutter.json').open() as f:
     cookiecutter_json = json.load(f)
 
 crawler_choices = cookiecutter_json['crawler_type']
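The point of this change, as I read it, is that `importlib.resources.files()` returns a `Traversable`, which need not map to a real filesystem path, so calling `.open()` on it is safer than wrapping it in `str()` and `open()`. A minimal standard-library sketch of the same pattern; the package and file names below are illustrative, not from the diff:

import importlib.resources
import json

# Traversable.open() works whether the package data lives on disk or inside a
# zip/wheel; building a path string and calling open() only works on disk.
resource = importlib.resources.files('mypackage') / 'config.json'  # hypothetical names
with resource.open() as f:
    config = json.load(f)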

src/crawlee/_utils/file.py

Lines changed: 43 additions & 1 deletion
@@ -2,17 +2,26 @@
 
 import asyncio
 import contextlib
+import csv
 import json
 import mimetypes
 import os
 import re
 import shutil
 from enum import Enum
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
     from pathlib import Path
-    from typing import Any
+    from typing import Any, TextIO
+
+    from typing_extensions import Unpack
+
+    from crawlee.storages._types import ExportDataCsvKwargs, ExportDataJsonKwargs
+
+logger = getLogger(__name__)
 
 
 class ContentType(Enum):

@@ -92,3 +101,36 @@ async def json_dumps(obj: Any) -> str:
         A string containing the JSON representation of the input object.
     """
     return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str)
+
+
+async def export_json_to_stream(
+    iterator: AsyncIterator[dict],
+    dst: TextIO,
+    **kwargs: Unpack[ExportDataJsonKwargs],
+) -> None:
+    items = [item async for item in iterator]
+
+    if items:
+        json.dump(items, dst, **kwargs)
+    else:
+        logger.warning('Attempting to export an empty dataset - no file will be created')
+
+
+async def export_csv_to_stream(
+    iterator: AsyncIterator[dict],
+    dst: TextIO,
+    **kwargs: Unpack[ExportDataCsvKwargs],
+) -> None:
+    writer = csv.writer(dst, **kwargs)
+    write_header = True
+
+    # Iterate over the dataset and write to CSV.
+    async for item in iterator:
+        if not item:
+            continue
+
+        if write_header:
+            writer.writerow(item.keys())
+            write_header = False
+
+        writer.writerow(item.values())
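To make the intent of these helpers concrete, here is a minimal usage sketch. The async generator below is a hypothetical stand-in for `Dataset.iterate()`; only the two helpers and their signatures come from this diff:

import asyncio
from pathlib import Path

from crawlee._utils.file import export_csv_to_stream, export_json_to_stream


async def items():
    # Hypothetical stand-in for Dataset.iterate().
    yield {'url': 'https://example.com', 'title': 'Example Domain'}
    yield {'url': 'https://example.org', 'title': 'Another Page'}


async def main() -> None:
    # CSV: the header row comes from the keys of the first non-empty item.
    with Path('items.csv').open('w', newline='') as dst:
        await export_csv_to_stream(items(), dst)

    # JSON: all items are collected into a list and dumped in one call.
    with Path('items.json').open('w') as dst:
        await export_json_to_stream(items(), dst)


asyncio.run(main())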

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 16 additions & 59 deletions
@@ -32,6 +32,7 @@
     SendRequestFunction,
 )
 from crawlee._utils.docs import docs_group
+from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
 from crawlee._utils.wait import wait_for
 from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error

@@ -57,7 +58,7 @@
     import re
     from contextlib import AbstractAsyncContextManager
 
-    from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
+    from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable, PushDataKwargs
     from crawlee.configuration import Configuration
     from crawlee.events import EventManager
     from crawlee.http_clients import HttpClient, HttpResponse

@@ -67,7 +68,7 @@
     from crawlee.statistics import FinalStatistics
     from crawlee.storage_clients import StorageClient
     from crawlee.storage_clients.models import DatasetItemsListPage
-    from crawlee.storages._dataset import ExportDataCsvKwargs, ExportDataJsonKwargs, GetDataKwargs, PushDataKwargs
+    from crawlee.storages._types import GetDataKwargs
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)

@@ -655,13 +656,18 @@ async def add_requests(
             wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout,
         )
 
-    async def _use_state(self, default_value: dict[str, JsonSerializable] | None = None) -> dict[str, JsonSerializable]:
-        store = await self.get_key_value_store()
-        return await store.get_auto_saved_value(self._CRAWLEE_STATE_KEY, default_value)
+    async def _use_state(
+        self,
+        default_value: dict[str, JsonSerializable] | None = None,
+    ) -> dict[str, JsonSerializable]:
+        kvs = await self.get_key_value_store()
+        # TODO:
+        # return some kvs value
 
     async def _save_crawler_state(self) -> None:
-        store = await self.get_key_value_store()
-        await store.persist_autosaved_values()
+        kvs = await self.get_key_value_store()
+        # TODO:
+        # some kvs call
 
     async def get_data(
         self,

@@ -705,64 +711,15 @@ async def export_data(
         dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
 
         path = path if isinstance(path, Path) else Path(path)
-        destination = path.open('w', newline='')
+        dst = path.open('w', newline='')
 
         if path.suffix == '.csv':
-            await dataset.write_to_csv(destination)
+            await export_csv_to_stream(dataset.iterate(), dst)
         elif path.suffix == '.json':
-            await dataset.write_to_json(destination)
+            await export_json_to_stream(dataset.iterate(), dst)
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
 
-    async def export_data_csv(
-        self,
-        path: str | Path,
-        *,
-        dataset_id: str | None = None,
-        dataset_name: str | None = None,
-        **kwargs: Unpack[ExportDataCsvKwargs],
-    ) -> None:
-        """Export data from a `Dataset` to a CSV file.
-
-        This helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens
-        the specified one and then exports the data based on the provided parameters.
-
-        Args:
-            path: The destination path.
-            content_type: The output format.
-            dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset`.
-            kwargs: Extra configurations for dumping/writing in csv format.
-        """
-        dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
-        path = path if isinstance(path, Path) else Path(path)
-
-        return await dataset.write_to_csv(path.open('w', newline=''), **kwargs)
-
-    async def export_data_json(
-        self,
-        path: str | Path,
-        *,
-        dataset_id: str | None = None,
-        dataset_name: str | None = None,
-        **kwargs: Unpack[ExportDataJsonKwargs],
-    ) -> None:
-        """Export data from a `Dataset` to a JSON file.
-
-        This helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the
-        specified one and then exports the data based on the provided parameters.
-
-        Args:
-            path: The destination path
-            dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset`.
-            kwargs: Extra configurations for dumping/writing in json format.
-        """
-        dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
-        path = path if isinstance(path, Path) else Path(path)
-
-        return await dataset.write_to_json(path.open('w', newline=''), **kwargs)
-
     async def _push_data(
         self,
         data: JsonSerializable,
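From the caller's side the behavior of `export_data` is unchanged: the file extension picks the format. A small usage sketch, assuming the public import paths at this point in the refactor (they may still be in flux on this branch):

import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Items pushed here end up in the default dataset.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://example.com'])

    # The suffix selects the exporter; any other extension raises ValueError.
    await crawler.export_data('results.csv')
    await crawler.export_data('results.json')


asyncio.run(main())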

src/crawlee/storage_clients/_base/_dataset_client.py

Lines changed: 16 additions & 0 deletions
@@ -14,6 +14,22 @@
     from crawlee.storage_clients.models import DatasetItemsListPage
 
 
+# Properties:
+# - id
+# - name
+# - created_at
+# - accessed_at
+# - modified_at
+# - item_count
+
+# Methods:
+# - open
+# - drop
+# - push_data
+# - get_data
+# - iterate
+
+
 @docs_group('Abstract classes')
 class DatasetClient(ABC):
     """An abstract class for dataset resource clients.

src/crawlee/storage_clients/_file_system/_dataset_client.py

Lines changed: 1 addition & 1 deletion
@@ -219,7 +219,7 @@ async def get_data(
         invalid = [arg for arg in unsupported_args if arg not in (False, None)]
         if invalid:
             logger.warning(
-                f'The arguments {invalid} of iterate_items are not supported by the {self.__class__.__name__} client.'
+                f'The arguments {invalid} of get_data are not supported by the {self.__class__.__name__} client.'
             )
 
         # If the dataset directory does not exist, log a warning and return an empty page.

src/crawlee/storage_clients/_memory/_dataset_client.py

Lines changed: 2 additions & 2 deletions
@@ -135,7 +135,7 @@ async def get_data(
         invalid = [arg for arg in unsupported_args if arg not in (False, None)]
         if invalid:
             logger.warning(
-                f'The arguments {invalid} of iterate_items are not supported by the {self.__class__.__name__} client.'
+                f'The arguments {invalid} of get_data are not supported by the {self.__class__.__name__} client.'
             )
 
         total = len(self._records)

@@ -172,7 +172,7 @@ async def iterate(
         invalid = [arg for arg in unsupported_args if arg not in (False, None)]
         if invalid:
             logger.warning(
-                f'The arguments {invalid} of iterate_items are not supported by the {self.__class__.__name__} client.'
+                f'The arguments {invalid} of iterate are not supported by the {self.__class__.__name__} client.'
             )
 
         items = self._records.copy()
