@@ -1,3 +1,5 @@
+from typing import List, TypedDict, Union
+
 from asgiref.sync import async_to_sync
 
 from swp.celery import app
@@ -9,6 +11,21 @@
 PUBLICATION_PREVIEW_PAGES = 2
 
 
+class SuccessResult(TypedDict):
+    success: bool
+    publications: List[dict]
+    max_per_page: int
+    is_multipage: bool
+
+
+class ErrorResult(TypedDict):
+    success: bool
+    error: str
+
+
+PreviewResult = Union[SuccessResult, ErrorResult]
+
+
 def configure_preview_pagination(config: dict) -> int:
     """ Setup scraper config for limited pagination during preview. """
     paginator = config.pop('paginator', None) or {}
@@ -21,7 +38,7 @@ def configure_preview_pagination(config: dict) -> int:
     return max_pages * max_per_page
 
 
-async def scrape(scraper: Scraper, config: dict) -> dict:
+async def scrape(scraper: Scraper, config: dict) -> PreviewResult:
     publications = []
 
     max_len = configure_preview_pagination(config)
@@ -57,15 +74,15 @@ async def scrape(scraper: Scraper, config: dict) -> dict:
     }
 
 
-def clean_publications(publications):
+def clean_publications(publications: List[dict]):
     for publication in publications:
         if fields := publication.get('fields'):
             for embedding_field in ['pdf_path', 'text_content']:
                 fields.pop(embedding_field, None)
 
 
 @app.task(name='preview.scraper')
-def preview_scraper(start_url, config):
+def preview_scraper(start_url, config) -> PreviewResult:
     scraper = Scraper(start_url)
 
     result = async_to_sync(scrape)(scraper, config)
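
A minimal sketch of how calling code might consume the new PreviewResult union. This snippet is not part of the commit: the handle_preview helper, its timeout, and the invocation via Celery's delay()/get() are illustrative assumptions; only the TypedDict fields come from the change above, assuming the names defined in the module are importable.

def handle_preview(start_url: str, config: dict) -> str:
    # Queue the preview task and block for its PreviewResult (assumed caller, not in this commit).
    async_result = preview_scraper.delay(start_url, config)
    result: PreviewResult = async_result.get(timeout=60)

    if result['success']:
        # SuccessResult branch: the worker returned publications plus pagination hints.
        count = len(result['publications'])
        pages = 'several pages' if result['is_multipage'] else 'a single page'
        return f'Previewed {count} publications across {pages} (max {result["max_per_page"]} per page)'

    # ErrorResult branch: only an error message is available.
    return f'Preview failed: {result["error"]}'

Because success is declared as a plain bool in both TypedDicts rather than typing.Literal[True]/Literal[False], static type checkers cannot narrow the union on the if result['success'] check; the branching above is a runtime convention only.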