|
32 | 32 | SendRequestFunction,
|
33 | 33 | )
|
34 | 34 | from crawlee._utils.docs import docs_group
|
| 35 | +from crawlee._utils.file import export_csv_to_stream, export_json_to_stream |
35 | 36 | from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
|
36 | 37 | from crawlee._utils.wait import wait_for
|
37 | 38 | from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
|
|
57 | 58 | import re
|
58 | 59 | from contextlib import AbstractAsyncContextManager
|
59 | 60 |
|
60 |
| - from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable |
| 61 | + from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable, PushDataKwargs |
61 | 62 | from crawlee.configuration import Configuration
|
62 | 63 | from crawlee.events import EventManager
|
63 | 64 | from crawlee.http_clients import HttpClient, HttpResponse
|
|
67 | 68 | from crawlee.statistics import FinalStatistics
|
68 | 69 | from crawlee.storage_clients import StorageClient
|
69 | 70 | from crawlee.storage_clients.models import DatasetItemsListPage
|
70 |
| - from crawlee.storages._dataset import ExportDataCsvKwargs, ExportDataJsonKwargs, GetDataKwargs, PushDataKwargs |
| 71 | + from crawlee.storages._types import GetDataKwargs |
71 | 72 |
|
72 | 73 | TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
|
73 | 74 | TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
|
@@ -655,13 +656,18 @@ async def add_requests(
|
655 | 656 | wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout,
|
656 | 657 | )
|
657 | 658 |
|
658 |
| - async def _use_state(self, default_value: dict[str, JsonSerializable] | None = None) -> dict[str, JsonSerializable]: |
659 |
| - store = await self.get_key_value_store() |
660 |
| - return await store.get_auto_saved_value(self._CRAWLEE_STATE_KEY, default_value) |
| 659 | + async def _use_state( |
| 660 | + self, |
| 661 | + default_value: dict[str, JsonSerializable] | None = None, |
| 662 | + ) -> dict[str, JsonSerializable]: |
| 663 | + kvs = await self.get_key_value_store() |
| 664 | + # TODO: |
| 665 | + # return some kvs value |
661 | 666 |
|
662 | 667 | async def _save_crawler_state(self) -> None:
|
663 |
| - store = await self.get_key_value_store() |
664 |
| - await store.persist_autosaved_values() |
| 668 | + kvs = await self.get_key_value_store() |
| 669 | + # TODO: |
| 670 | + # some kvs call |
665 | 671 |
|
666 | 672 | async def get_data(
|
667 | 673 | self,
|
@@ -705,64 +711,15 @@ async def export_data(
|
705 | 711 | dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
|
706 | 712 |
|
707 | 713 | path = path if isinstance(path, Path) else Path(path)
|
708 |
| - destination = path.open('w', newline='') |
| 714 | + dst = path.open('w', newline='') |
709 | 715 |
|
710 | 716 | if path.suffix == '.csv':
|
711 |
| - await dataset.write_to_csv(destination) |
| 717 | + await export_csv_to_stream(dataset.iterate(), dst) |
712 | 718 | elif path.suffix == '.json':
|
713 |
| - await dataset.write_to_json(destination) |
| 719 | + await export_json_to_stream(dataset.iterate(), dst) |
714 | 720 | else:
|
715 | 721 | raise ValueError(f'Unsupported file extension: {path.suffix}')
|
716 | 722 |
|
717 |
| - async def export_data_csv( |
718 |
| - self, |
719 |
| - path: str | Path, |
720 |
| - *, |
721 |
| - dataset_id: str | None = None, |
722 |
| - dataset_name: str | None = None, |
723 |
| - **kwargs: Unpack[ExportDataCsvKwargs], |
724 |
| - ) -> None: |
725 |
| - """Export data from a `Dataset` to a CSV file. |
726 |
| -
|
727 |
| - This helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens |
728 |
| - the specified one and then exports the data based on the provided parameters. |
729 |
| -
|
730 |
| - Args: |
731 |
| - path: The destination path. |
732 |
| - content_type: The output format. |
733 |
| - dataset_id: The ID of the `Dataset`. |
734 |
| - dataset_name: The name of the `Dataset`. |
735 |
| - kwargs: Extra configurations for dumping/writing in csv format. |
736 |
| - """ |
737 |
| - dataset = await self.get_dataset(id=dataset_id, name=dataset_name) |
738 |
| - path = path if isinstance(path, Path) else Path(path) |
739 |
| - |
740 |
| - return await dataset.write_to_csv(path.open('w', newline=''), **kwargs) |
741 |
| - |
742 |
| - async def export_data_json( |
743 |
| - self, |
744 |
| - path: str | Path, |
745 |
| - *, |
746 |
| - dataset_id: str | None = None, |
747 |
| - dataset_name: str | None = None, |
748 |
| - **kwargs: Unpack[ExportDataJsonKwargs], |
749 |
| - ) -> None: |
750 |
| - """Export data from a `Dataset` to a JSON file. |
751 |
| -
|
752 |
| - This helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the |
753 |
| - specified one and then exports the data based on the provided parameters. |
754 |
| -
|
755 |
| - Args: |
756 |
| - path: The destination path |
757 |
| - dataset_id: The ID of the `Dataset`. |
758 |
| - dataset_name: The name of the `Dataset`. |
759 |
| - kwargs: Extra configurations for dumping/writing in json format. |
760 |
| - """ |
761 |
| - dataset = await self.get_dataset(id=dataset_id, name=dataset_name) |
762 |
| - path = path if isinstance(path, Path) else Path(path) |
763 |
| - |
764 |
| - return await dataset.write_to_json(path.open('w', newline=''), **kwargs) |
765 |
| - |
766 | 723 | async def _push_data(
|
767 | 724 | self,
|
768 | 725 | data: JsonSerializable,
|
|
0 commit comments