Coverage for integrations / service_tools / crawl4ai_tool.py: 42.9%

21 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Crawl4AI tool wrapper — web scraping to markdown conversion. 

3 

4Now uses native in-process crawl4ai (no Docker/HTTP required). 

5Falls back to requests+BeautifulSoup if crawl4ai not installed. 

6 

7The agent sees intermediate progress and full extracted content. 

8""" 

9 

10import json 

11import os 

12from .registry import ServiceToolInfo, service_tool_registry 

13 

14 

def _native_crawl(params_json: str) -> str:
    """Execute crawl in-process. Returns agent-visible progress + content.

    Accepts either a JSON string or an already-parsed dict. Input that is
    not valid JSON — or that parses to something other than a JSON object
    (e.g. a bare quoted string) — is treated as a raw URL.
    """
    from integrations.web_crawler import crawl_url_for_agent, crawl_urls_for_agent

    try:
        params = json.loads(params_json) if isinstance(params_json, str) else params_json
    except (json.JSONDecodeError, TypeError):
        params = {'url': str(params_json)}

    # json.loads can succeed yet yield a non-dict (e.g. '"http://x"' parses
    # to a plain str, '42' to an int). Coerce to a dict so the .get() calls
    # below cannot raise AttributeError.
    if not isinstance(params, dict):
        params = {'url': str(params)}

    # Handle both 'url' and 'urls' param names
    url = params.get('url') or params.get('urls', '')
    if isinstance(url, list):
        return crawl_urls_for_agent(url)
    return crawl_url_for_agent(str(url))

29 

30 

class Crawl4AITool:
    """Register web crawling as a native tool (in-process, no Docker)."""

    @classmethod
    def create_tool_info(cls, base_url: str = None) -> ServiceToolInfo:
        """Build the ServiceToolInfo describing the in-process crawler.

        ``base_url`` is accepted for interface compatibility but unused:
        the handler runs in-process, so there is no service URL to point at.
        """
        # Single endpoint: hand the agent's params straight to _native_crawl.
        crawl_endpoint = {
            "path": "/crawl",
            "method": "POST",
            "description": (
                "Crawl a URL and extract content as clean markdown. "
                "Input: JSON with 'url' (string URL to crawl). "
                "Returns progress log + markdown text of the page content."
            ),
            "params_schema": {
                "url": {"type": "string", "description": "URL to crawl"},
            },
            "native_handler": _native_crawl,
        }
        return ServiceToolInfo(
            name="crawl4ai",
            description=(
                "Web scraping and content extraction. Crawls URLs and converts "
                "web pages to clean markdown optimized for LLM consumption. "
                "Supports JavaScript rendering via crawl4ai or BeautifulSoup fallback. "
                "Runs in-process — no external service needed."
            ),
            base_url="native://in-process",
            endpoints={"crawl": crawl_endpoint},
            health_endpoint=None,  # No external service to check
            tags=["web", "scraping", "markdown", "crawling"],
            timeout=60,
        )

    @classmethod
    def register(cls, base_url: str = None) -> bool:
        """Register Crawl4AI with the global service_tool_registry."""
        return service_tool_registry.register_tool(cls.create_tool_info(base_url))