Coverage for integrations / service_tools / omniparser_tool.py: 84.6%
13 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2OmniParser tool wrapper — screen parsing and computer use.
4Wraps the existing VLMAgentContext (vlm_agent_integration.py) as a
5ServiceTool for the unified registry. OmniParser is an external service
6that the user starts separately.
8Service: OmniParser (C:\\Users\\sathi\\PycharmProjects\\OmniParser)
9Port 8080: Screen parsing (FastAPI, /parse/, /probe/)
10Port 5001: VLM agent RPC (Flask, /execute_action)
11"""
from typing import Optional

from .registry import ServiceToolInfo, service_tool_registry
class OmniParserTool:
    """Thin wrapper to register OmniParser with the ServiceToolRegistry.

    The class holds no instance state; both entry points are classmethods.
    """

    # Defaults match the ports listed in the module docstring.
    DEFAULT_PARSER_URL = "http://localhost:8080"
    DEFAULT_VLM_URL = "http://localhost:5001"

    @classmethod
    def create_tool_info(
        cls,
        parser_url: Optional[str] = None,
        vlm_url: Optional[str] = None,
    ) -> ServiceToolInfo:
        """Build the ServiceToolInfo describing the OmniParser endpoints.

        Args:
            parser_url: Base URL of the screen-parsing service. Any falsy
                value (``None`` or ``""``) falls back to
                ``DEFAULT_PARSER_URL``.
            vlm_url: Accepted for interface compatibility but not used when
                building the result (see the review note in the body).

        Returns:
            A ``ServiceToolInfo`` named ``"omniparser"`` with two endpoints,
            ``parse_screen`` and ``execute_action``.
        """
        # NOTE(review): the previous code resolved `vlm_url` into a local
        # that was never read, so the VLM URL never reached the registry.
        # Only one base_url (the parser) is passed below, while the
        # "execute_action" description says requests go to port 5001.
        # Confirm how the registry routes that endpoint before wiring
        # `vlm_url` / DEFAULT_VLM_URL in here.
        parser = parser_url or cls.DEFAULT_PARSER_URL

        return ServiceToolInfo(
            name="omniparser",
            description=(
                "Screen parsing and computer use. Parses the user's screen "
                "to identify UI elements, then executes actions (click, type, "
                "scroll, hotkey) to control the computer on the user's behalf."
            ),
            base_url=parser,
            endpoints={
                "parse_screen": {
                    "path": "/parse/",
                    "method": "POST",
                    "description": (
                        "Parse the current screen to identify UI elements. "
                        "Returns list of detected elements with bounding boxes, "
                        "labels, and a screenshot."
                    ),
                    "params_schema": {
                        "include_som": {
                            "type": "boolean",
                            "description": "Include Set-of-Mark overlay",
                            "default": True,
                        },
                    },
                },
                "execute_action": {
                    "path": "/execute_action",
                    "method": "POST",
                    "description": (
                        "Execute a computer action via VLM agent. "
                        "Input: 'action' (type/click/scroll/hotkey/etc), "
                        "'parameters' (dict with action-specific params like "
                        "'text', 'x', 'y', 'key'). "
                        "Sent to VLM agent on port 5001."
                    ),
                    "params_schema": {
                        "action": {
                            "type": "string",
                            "description": "Action type: type, left_click, right_click, scroll_up, scroll_down, hotkey, wait",
                        },
                        "parameters": {
                            "type": "object",
                            "description": "Action parameters (text, x, y, key, etc.)",
                        },
                    },
                },
            },
            # NOTE(review): the module docstring lists this route as
            # "/probe/" (trailing slash); confirm the server accepts the
            # slash-less form used here.
            health_endpoint="/probe",
            tags=["computer-use", "screen", "ui", "automation", "omniparser"],
            timeout=30,
        )

    @classmethod
    def register(
        cls,
        parser_url: Optional[str] = None,
        vlm_url: Optional[str] = None,
    ) -> bool:
        """Register OmniParser with the global service_tool_registry.

        Args:
            parser_url: Forwarded unchanged to ``create_tool_info``.
            vlm_url: Forwarded unchanged to ``create_tool_info``.

        Returns:
            Whatever ``service_tool_registry.register_tool`` returns for the
            built tool info (annotated as ``bool``).
        """
        tool_info = cls.create_tool_info(parser_url, vlm_url)
        return service_tool_registry.register_tool(tool_info)