Skip to content

Commit 3309d7e

Browse files
committed
Update crawl_example.py
1 parent 4cf4ea6 commit 3309d7e

File tree

1 file changed

+29
-48
lines changed

1 file changed

+29
-48
lines changed

scrapegraph-py/examples/sync/crawl_example.py

Lines changed: 29 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,42 @@
1313
import json
1414
import os
1515
import time
16-
from typing import Dict, Any
16+
from typing import Dict, Any, List, Optional
1717

1818
from dotenv import load_dotenv
1919

20+
from pydantic import BaseModel, EmailStr, HttpUrl
2021
from scrapegraph_py import Client
2122

2223
# Load environment variables from .env file
2324
load_dotenv()
2425

26+
# Pydantic models for schema
27+
# Social-profile URLs for a company.
#
# Every field is optional and explicitly defaults to None, matching the
# `Optional[...] = None` convention used by the other models in this file.
# Without the explicit default, Pydantic v2 treats an `Optional[X]` field as
# required-but-nullable, which would mark all three links as required in the
# generated schema; the hand-written JSON schema this model replaces listed
# none of them as required.
class SocialLinks(BaseModel):
    github: Optional[HttpUrl] = None
    linkedin: Optional[HttpUrl] = None
    twitter: Optional[HttpUrl] = None
31+
32+
# Company-level information extracted from the site.
#
# `name` and `description` are required strings; the remaining fields are
# optional and default to None when absent.
class Company(BaseModel):
    # Required fields.
    name: str
    description: str

    # Optional fields.
    features: Optional[List[str]] = None
    contact_email: Optional[EmailStr] = None
    social_links: Optional[SocialLinks] = None
38+
39+
# A single service entry.
#
# `service_name` and `description` are required strings; `features` is an
# optional list of strings that defaults to None.
class Service(BaseModel):
    # Required fields.
    service_name: str
    description: str

    # Optional field.
    features: Optional[List[str]] = None
43+
44+
# Legal section of the extracted content. Both fields are required strings.
# NOTE(review): whether these hold the document text or a link to it is not
# visible here -- confirm against the prompt/API response.
class Legal(BaseModel):
    privacy_policy: str
    terms_of_service: str
47+
48+
# Top-level model for the crawl result: one company record, a list of
# services, and the legal section. All three fields are required.
# `main()` derives the request schema from this class.
#
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- pydantic may copy a class docstring into the generated schema as
# its `description`, which would change the schema sent to the API.
class WebsiteContent(BaseModel):
    company: Company
    services: List[Service]
    legal: Legal
2552

2653
def main():
2754
if not os.getenv("SGAI_API_KEY"):
@@ -31,53 +58,7 @@ def main():
3158
return
3259

3360
# Example schema (from your curl command)
34-
schema: Dict[str, Any] = {
35-
"$schema": "http://json-schema.org/draft-07/schema#",
36-
"title": "ScrapeGraphAI Website Content",
37-
"type": "object",
38-
"properties": {
39-
"company": {
40-
"type": "object",
41-
"properties": {
42-
"name": {"type": "string"},
43-
"description": {"type": "string"},
44-
"features": {"type": "array", "items": {"type": "string"}},
45-
"contact_email": {"type": "string", "format": "email"},
46-
"social_links": {
47-
"type": "object",
48-
"properties": {
49-
"github": {"type": "string", "format": "uri"},
50-
"linkedin": {"type": "string", "format": "uri"},
51-
"twitter": {"type": "string", "format": "uri"},
52-
},
53-
"additionalProperties": False,
54-
},
55-
},
56-
"required": ["name", "description"],
57-
},
58-
"services": {
59-
"type": "array",
60-
"items": {
61-
"type": "object",
62-
"properties": {
63-
"service_name": {"type": "string"},
64-
"description": {"type": "string"},
65-
"features": {"type": "array", "items": {"type": "string"}},
66-
},
67-
"required": ["service_name", "description"],
68-
},
69-
},
70-
"legal": {
71-
"type": "object",
72-
"properties": {
73-
"privacy_policy": {"type": "string"},
74-
"terms_of_service": {"type": "string"},
75-
},
76-
"required": ["privacy_policy", "terms_of_service"],
77-
},
78-
},
79-
"required": ["company", "services", "legal"],
80-
}
61+
schema = WebsiteContent.schema()
8162

8263
url = "https://scrapegraphai.com/"
8364
prompt = (

0 commit comments

Comments
 (0)