Coverage for integrations / service_tools / omniparser_tool.py: 84.6%

13 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2OmniParser tool wrapper — screen parsing and computer use. 

3 

4Wraps the existing VLMAgentContext (vlm_agent_integration.py) as a 

5ServiceTool for unified registry. OmniParser is an external service 

6the user starts separately. 

7 

8Service: OmniParser (C:\\Users\\sathi\\PycharmProjects\\OmniParser) 

9Port 8080: Screen parsing (FastAPI, /parse/, /probe/) 

10Port 5001: VLM agent RPC (Flask, /execute_action) 

11""" 

12 

13from .registry import ServiceToolInfo, service_tool_registry 

14 

15 

class OmniParserTool:
    """Thin wrapper to register OmniParser with the ServiceToolRegistry.

    The class keeps no instance state; both entry points are classmethods.
    """

    # Fallback addresses used when the caller supplies no (or an empty) URL.
    DEFAULT_PARSER_URL = "http://localhost:8080"
    DEFAULT_VLM_URL = "http://localhost:5001"

    @classmethod
    def create_tool_info(cls, parser_url: str = None,
                         vlm_url: str = None) -> ServiceToolInfo:
        """Build the ``ServiceToolInfo`` that describes the OmniParser service.

        Args:
            parser_url: Base URL of the screen-parsing service. ``None`` or an
                empty string falls back to ``DEFAULT_PARSER_URL``.
            vlm_url: Base URL of the VLM agent. ``None`` or an empty string
                falls back to ``DEFAULT_VLM_URL``. The resolved value is
                reported in the ``execute_action`` endpoint description.

        Returns:
            A ``ServiceToolInfo`` named ``"omniparser"`` whose ``base_url`` is
            the resolved parser URL and which declares the ``parse_screen``
            and ``execute_action`` endpoints.
        """
        parser = parser_url or cls.DEFAULT_PARSER_URL
        vlm = vlm_url or cls.DEFAULT_VLM_URL

        parse_screen = {
            "path": "/parse/",
            "method": "POST",
            "description": (
                "Parse the current screen to identify UI elements. "
                "Returns list of detected elements with bounding boxes, "
                "labels, and a screenshot."
            ),
            "params_schema": {
                "include_som": {
                    "type": "boolean",
                    "description": "Include Set-of-Mark overlay",
                    "default": True,
                },
            },
        }

        # NOTE(review): this endpoint is still registered under the parser
        # base_url below. Whether the registry can route a single endpoint to
        # a different host is not visible from this file -- confirm against
        # ServiceToolInfo before relying on vlm_url for actual routing.
        execute_action = {
            "path": "/execute_action",
            "method": "POST",
            "description": (
                "Execute a computer action via VLM agent. "
                "Input: 'action' (type/click/scroll/hotkey/etc), "
                "'parameters' (dict with action-specific params like "
                "'text', 'x', 'y', 'key'). "
                # Previously hard-coded to "port 5001", which ignored vlm_url.
                f"Sent to VLM agent at {vlm}."
            ),
            "params_schema": {
                "action": {
                    "type": "string",
                    "description": (
                        "Action type: type, left_click, right_click, "
                        "scroll_up, scroll_down, hotkey, wait"
                    ),
                },
                "parameters": {
                    "type": "object",
                    "description": "Action parameters (text, x, y, key, etc.)",
                },
            },
        }

        return ServiceToolInfo(
            name="omniparser",
            description=(
                "Screen parsing and computer use. Parses the user's screen "
                "to identify UI elements, then executes actions (click, type, "
                "scroll, hotkey) to control the computer on the user's behalf."
            ),
            base_url=parser,
            endpoints={
                "parse_screen": parse_screen,
                "execute_action": execute_action,
            },
            # NOTE(review): the module docstring lists the probe route as
            # "/probe/" (trailing slash); verify which form the service
            # exposes and whether the health checker follows redirects.
            health_endpoint="/probe",
            tags=["computer-use", "screen", "ui", "automation", "omniparser"],
            timeout=30,
        )

    @classmethod
    def register(cls, parser_url: str = None, vlm_url: str = None) -> bool:
        """Register OmniParser with the global service_tool_registry.

        Args:
            parser_url: Forwarded to ``create_tool_info``.
            vlm_url: Forwarded to ``create_tool_info``.

        Returns:
            Whatever ``service_tool_registry.register_tool`` returns for the
            constructed tool info.
        """
        tool_info = cls.create_tool_info(parser_url, vlm_url)
        return service_tool_registry.register_tool(tool_info)

71 """Register OmniParser with the global service_tool_registry.""" 

72 tool_info = cls.create_tool_info(parser_url, vlm_url) 

73 return service_tool_registry.register_tool(tool_info)