Skip to content

feat: add infinite scroll #37

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions scrapegraph-py/examples/async/smartscraper_infinite_scroll.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
Example of using SmartScraper with infinite scrolling in asynchronous mode.
This example demonstrates how to scrape content from multiple webpages concurrently using infinite scrolling.
"""

import asyncio
from scrapegraph_py import AsyncClient
from scrapegraph_py.logger import sgai_logger

# Set up logging
sgai_logger.set_logging(level="INFO")

async def scrape_with_infinite_scroll(client: AsyncClient, url: str, prompt: str, max_pages: int = 10):
    """Run one SmartScraper request against *url* with infinite scrolling enabled.

    Args:
        client: An open AsyncClient used to issue the request.
        url: Address of the page to scrape.
        prompt: Natural-language instruction describing what to extract.
        max_pages: Upper bound on pages loaded while scrolling (default 10).

    Returns:
        The response mapping returned by ``client.smartscraper``.
    """
    request_kwargs = {
        "website_url": url,
        "user_prompt": prompt,
        "infinite_scrolling": True,
        "max_pages": max_pages,
    }
    return await client.smartscraper(**request_kwargs)

async def main():
    """Demonstrate SmartScraper infinite scrolling with the async client.

    Runs three infinite-scroll scrapes concurrently via ``asyncio.gather``,
    then one static-page scrape without scrolling for comparison.
    Requires a valid API key in place of the placeholder below.
    """
    # The async client is an async context manager; it is closed automatically
    # when the block exits, even on error.
    async with AsyncClient(api_key="your-api-key-here") as sgai_client:
        # Example 1: Scrape multiple pages concurrently
        tasks = [
            scrape_with_infinite_scroll(
                sgai_client,
                "https://example.com/products",
                "Extract all product names and prices",
                max_pages=20
            ),
            scrape_with_infinite_scroll(
                sgai_client,
                "https://example.com/articles",
                "Extract all article titles and authors",
                max_pages=15
            ),
            scrape_with_infinite_scroll(
                sgai_client,
                "https://example.com/news",
                "Extract all news headlines and dates",
                max_pages=10
            )
        ]

        # Wait for all scraping tasks to complete
        results = await asyncio.gather(*tasks)

        # Process and print results
        for i, result in enumerate(results, 1):
            print(f"\nExample {i} Results:")
            print(f"Request ID: {result['request_id']}")
            print(f"Result: {result['result']}")

        # Example 2: Single page without infinite scrolling
        response = await sgai_client.smartscraper(
            website_url="https://example.com/static-page",
            user_prompt="Extract the main heading and first paragraph",
            infinite_scrolling=False
        )
        # Fix: this label previously read "Example 4", but it is the second
        # example in this script (see the comment above).
        print("\nExample 2 - Without infinite scrolling:")
        print(f"Request ID: {response['request_id']}")
        print(f"Result: {response['result']}")

if __name__ == "__main__":
    asyncio.run(main())
54 changes: 54 additions & 0 deletions scrapegraph-py/examples/sync/smartscraper_infinite_scroll.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""
Example of using SmartScraper with infinite scrolling in synchronous mode.
This example demonstrates how to scrape content from a webpage that requires scrolling to load more content.
"""

from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger
import time

# Set up logging
sgai_logger.set_logging(level="INFO")

def main():
    """Demonstrate SmartScraper infinite scrolling with the synchronous client.

    Issues three requests in order — default scrolling, custom ``max_pages``,
    and no scrolling — printing each response the same way. Requires a valid
    API key in place of the placeholder below.
    """
    # Initialize the client with your API key
    sgai_client = Client(api_key="your-api-key-here")

    try:
        # Each entry pairs a section header with the smartscraper() keyword
        # arguments for that example; they run sequentially below.
        examples = [
            (
                "\nExample 1 - Basic infinite scrolling:",
                {
                    "website_url": "https://example.com/infinite-scroll",
                    "user_prompt": "Extract all product names and prices from the page",
                    "infinite_scrolling": True,  # uses the default max_pages=10
                },
            ),
            (
                "\nExample 2 - Custom max pages:",
                {
                    "website_url": "https://example.com/long-list",
                    "user_prompt": "Extract all article titles and their publication dates",
                    "infinite_scrolling": True,
                    "max_pages": 50,  # custom maximum number of pages to scroll
                },
            ),
            (
                "\nExample 3 - Without infinite scrolling:",
                {
                    "website_url": "https://example.com/static-page",
                    "user_prompt": "Extract the main heading and first paragraph",
                    "infinite_scrolling": False,
                },
            ),
        ]
        for header, request_kwargs in examples:
            response = sgai_client.smartscraper(**request_kwargs)
            print(header)
            print(f"Request ID: {response['request_id']}")
            print(f"Result: {response['result']}")
    finally:
        # Always close the client when done
        sgai_client.close()

if __name__ == "__main__":
    main()
10 changes: 9 additions & 1 deletion scrapegraph-py/scrapegraph_py/models/smartscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from uuid import UUID

from bs4 import BeautifulSoup
from pydantic import BaseModel, Field, model_validator
from pydantic import BaseModel, Field, model_validator, conint


class SmartScraperRequest(BaseModel):
Expand All @@ -28,6 +28,14 @@ class SmartScraperRequest(BaseModel):
},
description="Optional headers to send with the request, including cookies and user agent",
)
infinite_scrolling: bool = Field(
default=False,
description="Enable infinite scrolling to load more content dynamically",
)
max_pages: conint(ge=1, le=1000) = Field(
default=10,
description="Maximum number of pages to scroll when infinite_scrolling is enabled",
)
output_schema: Optional[Type[BaseModel]] = None

@model_validator(mode="after")
Expand Down