Coverage for integrations / service_tools / crawl4ai_tool.py: 42.9%

21 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Crawl4AI tool wrapper — web scraping to markdown conversion. 

3 

4Now uses native in-process crawl4ai (no Docker/HTTP required). 

5Falls back to requests+BeautifulSoup if crawl4ai not installed. 

6 

7The agent sees intermediate progress and full extracted content. 

8""" 

9 

10import json 

11import os 

12from .registry import ServiceToolInfo, service_tool_registry 

13 

14 

def _native_crawl(params_json: str) -> str:
    """Execute crawl in-process. Returns agent-visible progress + content.

    Accepts either a JSON string or an already-parsed dict. Input that is
    not valid JSON — or that parses to something other than a JSON object
    (e.g. a bare quoted string) — is treated as a raw URL.
    """
    from integrations.web_crawler import crawl_url_for_agent, crawl_urls_for_agent

    try:
        params = json.loads(params_json) if isinstance(params_json, str) else params_json
    except (json.JSONDecodeError, TypeError):
        params = {'url': str(params_json)}

    # json.loads can succeed yet yield a non-dict (e.g. '"http://x"' parses
    # to a plain str, '42' to an int). Coerce to a dict so the .get() calls
    # below cannot raise AttributeError.
    if not isinstance(params, dict):
        params = {'url': str(params)}

    # Handle both 'url' and 'urls' param names
    url = params.get('url') or params.get('urls', '')
    if isinstance(url, list):
        return crawl_urls_for_agent(url)
    return crawl_url_for_agent(str(url))

29 

30 

class Crawl4AITool:
    """Register web crawling as a native tool (in-process, no Docker)."""

    @classmethod
    def create_tool_info(cls, base_url: str = None) -> ServiceToolInfo:
        """Build the ServiceToolInfo describing the in-process crawler.

        ``base_url`` is accepted for interface compatibility but unused:
        the handler runs in-process, so there is no service URL to point at.
        """
        # Single endpoint: hand the agent's params straight to _native_crawl.
        crawl_endpoint = {
            "path": "/crawl",
            "method": "POST",
            "description": (
                "Crawl a URL and extract content as clean markdown. "
                "Input: JSON with 'url' (string URL to crawl). "
                "Returns progress log + markdown text of the page content."
            ),
            "params_schema": {
                "url": {"type": "string", "description": "URL to crawl"},
            },
            "native_handler": _native_crawl,
        }
        return ServiceToolInfo(
            name="crawl4ai",
            description=(
                "Web scraping and content extraction. Crawls URLs and converts "
                "web pages to clean markdown optimized for LLM consumption. "
                "Supports JavaScript rendering via crawl4ai or BeautifulSoup fallback. "
                "Runs in-process — no external service needed."
            ),
            base_url="native://in-process",
            endpoints={"crawl": crawl_endpoint},
            health_endpoint=None,  # No external service to check
            tags=["web", "scraping", "markdown", "crawling"],
            timeout=60,
        )

    @classmethod
    def register(cls, base_url: str = None) -> bool:
        """Register Crawl4AI with the global service_tool_registry."""
        return service_tool_registry.register_tool(cls.create_tool_info(base_url))