Crawl4AI

class Article(BaseModel):
    """Schema for one extracted news article, fed to LLMExtractionStrategy.

    The LLM is instructed to return a list of these objects as JSON.
    """

    # Article headline (may be non-ASCII, e.g. Chinese titles from futunn).
    title: str
    # Absolute http(s) URL of the article page.
    link: str
    # LLM-based extraction: ask the model to pull structured Article records
    # out of the crawled page, validated against Article's JSON schema.
    llm_strategy = LLMExtractionStrategy(
        provider="openrouter/openai/gpt-4o-mini",
        # NOTE(review): provider is OpenRouter but the env var is OPENAI_API_KEY —
        # confirm this key is actually an OpenRouter token.
        api_token=os.getenv('OPENAI_API_KEY'),
        schema=Article.model_json_schema(),
        extraction_type="schema",
        # Fixed the example URL in the prompt: it read "https:/news..." (missing
        # slash), contradicting the instruction's own "valid http 'link'" demand
        # and risking the model imitating the malformed form.
        instruction="Extract all articles of today with 'title' and valid http 'link' of article e.g: {'title': '美股早市', 'link': 'https://news.futunn.com/hk/post/53844918/us-stock-market-early-session-s-p-and-nasdaq-slightly'}",
        chunk_token_threshold=8192,   # max tokens per chunk sent to the LLM
        overlap_rate=0.0,             # no overlap between consecutive chunks
        apply_chunking=True,
        input_format="markdown",      # or "html", "fit_markdown"
    )
    # JS executed on the page before extraction: click the first
    # 'btn-type_primary btn-share_rect' button if present (presumably a
    # banner/overlay dismissal — confirm against the live page), then scroll
    # to the bottom to trigger lazy-loaded articles.
    js_commands = [
        "document.getElementsByClassName('btn-type_primary btn-share_rect')?.[0]?.click();",
        "window.scrollTo(0, document.body.scrollHeight);",
    ]
    # Run configuration: always fetch fresh (cache disabled), execute the JS
    # snippets, then hand the page to the LLM extraction strategy.
    crawl_config = CrawlerRunConfig(
        js_code=js_commands,
        cache_mode=CacheMode.DISABLED,
        extraction_strategy=llm_strategy,
    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun("https://news.futunn.com/hk/main", 
                config=crawl_config    
            )
        if result.success:
            # 5. The extracted content is presumably JSON
            data:list[dict] = json.loads(result.extracted_content)
            print("Extracted items:", data)
            # 6. Show usage stats
            llm_strategy.show_usage()
            if(len(data) > 0 and data[0].get("error", False) == True):
                raise HTTPException("Failed to get futu news") 
            return {"articles": data}
            
        else:
            print("Error:", result.error_message)
            raise HTTPException("Failed to get futu news") 

Last updated

Was this helpful?