Coverage for integrations / service_tools / crawl4ai_tool.py: 42.9%
21 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Crawl4AI tool wrapper — web scraping to markdown conversion.
4Now uses native in-process crawl4ai (no Docker/HTTP required).
5Falls back to requests+BeautifulSoup if crawl4ai not installed.
7The agent sees intermediate progress and full extracted content.
8"""
10import json
11import os
12from .registry import ServiceToolInfo, service_tool_registry
def _native_crawl(params_json: str) -> str:
    """Execute a crawl in-process and return agent-visible progress + content.

    Args:
        params_json: Either a JSON string or an already-decoded object.
            Recognizes both 'url' (single string) and 'urls' (string or list)
            keys; anything unparseable is treated as a raw URL string.

    Returns:
        The progress log plus extracted markdown from the crawler helpers.
    """
    # Imported lazily so the module can be registered even when the crawler
    # stack isn't importable at module-load time.
    from integrations.web_crawler import crawl_url_for_agent, crawl_urls_for_agent

    try:
        params = json.loads(params_json) if isinstance(params_json, str) else params_json
    except (json.JSONDecodeError, TypeError):
        params = {'url': str(params_json)}

    # json.loads can legitimately yield a non-dict (bare JSON string, number,
    # or list). Normalize those so the .get() calls below can't raise
    # AttributeError on, e.g., '"http://example.com"' or '["a", "b"]'.
    if isinstance(params, list):
        params = {'urls': params}
    elif not isinstance(params, dict):
        params = {'url': str(params)}

    # Handle both 'url' and 'urls' param names
    url = params.get('url') or params.get('urls', '')
    if isinstance(url, list):
        return crawl_urls_for_agent(url)
    return crawl_url_for_agent(str(url))
class Crawl4AITool:
    """Expose in-process web crawling as a native tool (no Docker needed)."""

    @classmethod
    def create_tool_info(cls, base_url: str = None) -> ServiceToolInfo:
        """Build the ServiceToolInfo describing the single /crawl endpoint.

        Note: base_url is accepted for interface parity with other tool
        wrappers but is not used — the tool always runs in-process.
        """
        # The one endpoint this tool exposes; dispatched via native_handler
        # rather than an HTTP call.
        crawl_endpoint = {
            "path": "/crawl",
            "method": "POST",
            "description": (
                "Crawl a URL and extract content as clean markdown. "
                "Input: JSON with 'url' (string URL to crawl). "
                "Returns progress log + markdown text of the page content."
            ),
            "params_schema": {
                "url": {"type": "string", "description": "URL to crawl"},
            },
            "native_handler": _native_crawl,
        }
        return ServiceToolInfo(
            name="crawl4ai",
            description=(
                "Web scraping and content extraction. Crawls URLs and converts "
                "web pages to clean markdown optimized for LLM consumption. "
                "Supports JavaScript rendering via crawl4ai or BeautifulSoup fallback. "
                "Runs in-process — no external service needed."
            ),
            base_url="native://in-process",
            endpoints={"crawl": crawl_endpoint},
            health_endpoint=None,  # in-process: nothing to health-check
            tags=["web", "scraping", "markdown", "crawling"],
            timeout=60,
        )

    @classmethod
    def register(cls, base_url: str = None) -> bool:
        """Build the tool info and add it to the global service_tool_registry."""
        info = cls.create_tool_info(base_url)
        return service_tool_registry.register_tool(info)