from crawl4ai.extraction_strategy import LLMExtractionStrategy
INSTRUCTION_TO_LLM = "Extract all rows from the main table as objects with 'CASNo', 'purity', 'MF', 'MW', 'SMILES', 'size', 'price', 'stock' from the content."
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://www.chemshuttle.com/building-blocks/amino-acids/fmoc-r-3-amino-4-4-nitrophenyl-butyric-acid.html",
        extraction_strategy=extraction_strategy,
    )
    print(result.extracted_content)
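The `extraction_strategy` passed to `arun()` above is an LLMExtractionStrategy instance. A minimal sketch of how it could be built; the provider string, environment variable, and use of the Product schema are assumptions, and newer crawl4ai releases wrap the model settings in an `LLMConfig` object instead of `provider`/`api_token` arguments:

import os
from crawl4ai.extraction_strategy import LLMExtractionStrategy

# Assumed construction of `extraction_strategy`; adjust provider/api_token
# (or switch to llm_config=LLMConfig(...)) to match your crawl4ai version.
extraction_strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",           # placeholder LiteLLM-style model id
    api_token=os.getenv("OPENAI_API_KEY"),
    schema=Product.model_json_schema(),      # Product is the pydantic model in the complete code below
    extraction_type="schema",                # ask the LLM to return objects matching the schema
    instruction=INSTRUCTION_TO_LLM,
)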
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="document.querySelector('.content-loaded')",
    )
    print(result.markdown)
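The same dynamic-page options can also be carried in a `CrawlerRunConfig`, which the complete script below uses for its crawl settings. A minimal, self-contained sketch; the `.content-loaded` selector and example URL are placeholders, and `wait_for` accepts a `css:`-prefixed selector in recent crawl4ai versions:

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

async def main():
    run_cfg = CrawlerRunConfig(
        js_code="window.scrollTo(0, document.body.scrollHeight);",  # trigger lazy-loaded content
        wait_for="css:.content-loaded",   # wait until this element exists before capturing the page
        cache_mode=CacheMode.BYPASS,      # fetch fresh HTML instead of reusing the cache
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=run_cfg)
        print(result.markdown)

asyncio.run(main())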
browser_cfg = BrowserConfig(headless=True, verbose=True)

async with AsyncWebCrawler(config=browser_cfg) as crawler:
    try:
        result = await crawler.arun(url=URL_TO_SCRAPE, config=crawl_config)

        if result.success:
            data = json.loads(result.extracted_content)
            print("Extracted items:", data)
            llm_strategy.show_usage()  # print LLM token usage for this extraction
        else:
            print("Error:", result.error_message)
    except Exception:
        traceback.print_exc()
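For orientation, the names this snippet relies on (`URL_TO_SCRAPE`, `llm_strategy`, `crawl_config`) could be set up roughly as follows; `llm_strategy` plays the same role as the `extraction_strategy` sketched earlier, and the complete code below contains the actual definitions:

# Assumed setup for the names used above; see the complete code for the real definitions.
URL_TO_SCRAPE = "https://www.chemshuttle.com/building-blocks/amino-acids/fmoc-r-3-amino-4-4-nitrophenyl-butyric-acid.html"
llm_strategy = extraction_strategy            # the LLMExtractionStrategy sketched earlier
crawl_config = CrawlerRunConfig(
    extraction_strategy=llm_strategy,         # run LLM extraction on each crawled page
    cache_mode=CacheMode.BYPASS,              # always re-fetch rather than serve cached HTML
)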
Results

Extracted data:
Extracted items: [{'CASNo':'269398-78-9','size':'1g','price':'$150.00','stock':'Typically in stock','purity':'95%','MF':'C25H22N2O6','MW':'446.459','SMILES':'OC(=O)C[C@@H](CC1=CC=C(C=C1)[N+]([O-])=O)NC(=O)OCC1C2=CC=CC=C2C2=C1C=CC=C2','error': False}, {'CASNo':'269398-78-9','size':'5g','price':'$450.00','stock':'Typically in stock','purity':'95%','MF':'C25H22N2O6','MW':'446.459','SMILES':'OC(=O)C[C@@H](CC1=CC=C(C=C1)[N+]([O-])=O)NC(=O)OCC1C2=CC=CC=C2C2=C1C=CC=C2','error': False}, {'CASNo':'269398-78-9','size':'10g','price':'Inquire','stock':'Inquire','purity':'95%','MF':'C25H22N2O6','MW':'446.459','SMILES':'OC(=O)C[C@@H](CC1=CC=C(C=C1)[N+]([O-])=O)NC(=O)OCC1C2=CC=CC=C2C2=C1C=CC=C2','error': False}, {'CASNo':'269398-78-9','size':'100g','price':'$6980.00','stock':'Inquire','purity':'95%','MF':'C25H22N2O6','MW':'446.459','SMILES':'OC(=O)C[C@@H](CC1=CC=C(C=C1)[N+]([O-])=O)NC(=O)OCC1C2=CC=CC=C2C2=C1C=CC=C2','error': False}]
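The printed list can be validated back against the Product model shown in the complete code below. A small illustrative sketch, assuming `data` holds the list from `json.loads(result.extracted_content)`; the per-item `error` flag is added by crawl4ai and is not part of the model:

# Illustrative only: re-validate the extracted dicts with the Product model.
products = [
    Product(**{k: v for k, v in item.items() if k != "error"})  # drop crawl4ai's 'error' flag
    for item in data
]
for p in products:
    print(p.size, p.price, p.stock)  # e.g. "1g $150.00 Typically in stock"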
The complete code is as follows:
import asyncio
import json
import os
import traceback
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field
INSTRUCTION_TO_LLM = "Extract all rows from the main table as objects with 'CASNo', 'purity', 'MF', 'MW', 'SMILES', 'size', 'price', 'stock' from the content."

class Product(BaseModel):
    CASNo: str
    size: str
    price: str
    stock: str
    purity: str
    MF: str
    MW: str
    SMILES: str