Crawl4AI
class Article(BaseModel):
    """Schema for one extracted news article (drives the LLM's JSON output)."""

    # Headline text of the article.
    title: str
    # Absolute http(s) URL of the article page.
    link: str
# LLM-based extraction: ask the model for JSON matching the Article schema.
# NOTE(review): provider is OpenRouter but the token is read from
# OPENAI_API_KEY — confirm that is the intended env var for OpenRouter.
llm_strategy = LLMExtractionStrategy(
    provider="openrouter/openai/gpt-4o-mini",
    api_token=os.getenv('OPENAI_API_KEY'),
    schema=Article.model_json_schema(),
    extraction_type="schema",
    # FIX: example URL previously read "https:/news.futunn.com/..."
    # (missing slash), contradicting the "valid http 'link'" requirement
    # and risking the model copying the malformed form.
    instruction="Extract all articles of today with 'title' and valid http 'link' of article e.g: {'title': '美股早市', 'link': 'https://news.futunn.com/hk/post/53844918/us-stock-market-early-session-s-p-and-nasdaq-slightly'}",
    chunk_token_threshold=8192,
    overlap_rate=0.0,
    apply_chunking=True,
    input_format="markdown",  # or "html", "fit_markdown"
)
# JavaScript executed in the page before extraction:
#   1. click the first share/primary button if present (optional-chained,
#      so a missing element is a no-op);
#   2. scroll to the bottom to trigger lazy-loaded content.
js_commands = [
    "document.getElementsByClassName('btn-type_primary btn-share_rect')?.[0]?.click();",
    "window.scrollTo(0, document.body.scrollHeight);",
]
# Per-run crawler configuration: always fetch fresh (cache disabled), run
# the page-preparation JS, then hand the page to the LLM extraction strategy.
crawl_config = CrawlerRunConfig(
    cache_mode=CacheMode.DISABLED,
    js_code=js_commands,
    extraction_strategy=llm_strategy,
)
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun("https://news.futunn.com/hk/main",
config=crawl_config
)
if result.success:
# 5. The extracted content is presumably JSON
data:list[dict] = json.loads(result.extracted_content)
print("Extracted items:", data)
# 6. Show usage stats
llm_strategy.show_usage()
if(len(data) > 0 and data[0].get("error", False) == True):
raise HTTPException("Failed to get futu news")
return {"articles": data}
else:
print("Error:", result.error_message)
raise HTTPException("Failed to get futu news")
Last updated
Was this helpful?