Coverage for integrations / vlm / local_omniparser.py: 50.8%
61 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2local_omniparser.py - Lazy-loaded OmniParser for screen parsing.
4Tier 'inprocess': imports OmniParser's Omniparser class directly (singleton).
5Tier 'http': HTTP POST to localhost:8080/parse/ (existing FastAPI endpoint).
7OmniParser detects UI elements (buttons, text fields, icons) from a screenshot
8using YOLO detection + Florence captioning, returning labeled bounding boxes.
9"""
11import os
12import sys
13import logging
15logger = logging.getLogger('hevolve.vlm.omniparser')
17# Singleton for in-process OmniParser (models are GPU-heavy, load once)
18_parser_instance = None
19_parser_lock = None
def _get_lock():
    """Return the process-wide lock that guards OmniParser singleton creation.

    The lock is created on first use rather than at import time, so importing
    this module has no threading side effects.

    Returns:
        The single ``threading.Lock`` shared by every caller.
    """
    global _parser_lock
    if _parser_lock is None:
        import threading
        # A plain check-then-assign would let two first-time callers each
        # build their own Lock and then both enter the "protected" section.
        # Publishing through one dict.setdefault() call means only the first
        # candidate is stored and every racing caller gets that same object
        # back (a single C-level dict operation in CPython).
        _parser_lock = _get_lock.__dict__.setdefault('_lock', threading.Lock())
    return _parser_lock
def parse_screen(screenshot_b64: str, tier: str) -> dict:
    """
    Turn a screenshot into structured UI-element data.

    Args:
        screenshot_b64: Base64-encoded PNG screenshot.
        tier: 'inprocess' selects the directly imported OmniParser; every
            other value is routed to the HTTP backend (localhost:8080 by
            default).

    Returns:
        dict with keys:
        - 'screen_info': str - formatted ID→label text for LLM consumption
        - 'parsed_content_list': list - [{type, content, bbox, idx}, ...]
        - 'som_image_base64': str - labeled screenshot with bounding boxes
        - 'original_screenshot_base64': str - original screenshot
        - 'width': int, 'height': int - screen dimensions (supplied by the
          parser backend; this module does not add them itself)
        - 'latency': float - parse time in seconds
    """
    # Only the exact string 'inprocess' picks the in-process backend.
    backend = _parse_inprocess if tier == 'inprocess' else _parse_http
    return backend(screenshot_b64)
def _parse_inprocess(screenshot_b64: str) -> dict:
    """Run the screenshot through the in-process OmniParser singleton.

    The parser is loaded on the first call (under the module lock) and reused
    afterwards. The parser's result is returned with 'latency' and
    'original_screenshot_base64' filled in when the parser did not supply
    them.
    """
    global _parser_instance
    import time

    # Hold the lock only for the create-if-missing step.
    with _get_lock():
        if _parser_instance is None:
            _parser_instance = _load_omniparser()

    began = time.time()
    parsed = _parser_instance.parse(screenshot_b64)
    elapsed = time.time() - began

    # Back-fill the keys callers rely on without overwriting parser output.
    parsed.setdefault('latency', elapsed)
    parsed.setdefault('original_screenshot_base64', screenshot_b64)

    return parsed
def _parse_http(screenshot_b64: str) -> dict:
    """Parse via HTTP POST to OmniParser FastAPI server.

    The base URL is read from the ``OMNIPARSER_URL`` environment variable
    (default ``http://localhost:8080``); the image is sent to ``/parse/``
    under the ``base64_image`` JSON key with a 30 second timeout.

    Falls back gracefully when OmniParser is unavailable — returns empty
    parsed_content_list so the LLM reasons from the raw screenshot alone
    (slower but functional). The fallback covers request/HTTP errors,
    bodies that are not valid JSON, and JSON bodies that are not an object.

    Args:
        screenshot_b64: Base64-encoded screenshot to parse.

    Returns:
        The server's JSON object, or the fallback dict, with 'latency' and
        'original_screenshot_base64' added when absent.
    """
    import time
    import requests
    from core.http_pool import pooled_post

    omni_url = os.environ.get('OMNIPARSER_URL', 'http://localhost:8080')
    start = time.time()

    try:
        resp = pooled_post(
            f'{omni_url.rstrip("/")}/parse/',
            json={'base64_image': screenshot_b64},
            timeout=30
        )
        resp.raise_for_status()
        result = resp.json()
        # A 200 response can still carry null / a list / a bare string.
        # Those would break the key back-fill below, so treat them like any
        # other unusable response and take the fallback path.
        if not isinstance(result, dict):
            raise ValueError(
                f"expected a JSON object, got {type(result).__name__}"
            )
    except (requests.RequestException, ValueError) as e:
        logger.warning(
            f"OmniParser unavailable ({e}), falling back to raw screenshot. "
            f"LLM will reason without UI element labels."
        )
        result = {
            'screen_info': '',
            'parsed_content_list': [],
            'som_image_base64': screenshot_b64,
        }

    # Measured after the fallback so failed attempts report their cost too.
    latency = time.time() - start

    if 'latency' not in result:
        result['latency'] = latency
    if 'original_screenshot_base64' not in result:
        result['original_screenshot_base64'] = screenshot_b64

    return result
def _load_omniparser():
    """
    Locate an OmniParser checkout and build the parser object from it.

    Candidate directories, tried in order:
    1. OMNIPARSER_PATH env var (skipped when unset or empty)
    2. ~/.hevolve/models/omniparser/
    3. Sibling directory ../OmniParser (dev layout), three levels above
       this file's directory

    A candidate is accepted when it contains util/omniparser.py. The
    accepted directory is put at the front of sys.path (if not already
    listed) so that ``util.omniparser`` can be imported from it.

    Returns:
        ``Omniparser(path)`` for the first accepted directory.

    Raises:
        ImportError: when no candidate contains util/omniparser.py.
    """
    module_dir = os.path.dirname(__file__)
    candidates = (
        os.environ.get('OMNIPARSER_PATH', ''),
        os.path.join(os.path.expanduser('~'), '.hevolve', 'models', 'omniparser'),
        os.path.join(module_dir, '..', '..', '..', 'OmniParser'),
    )

    for path in (os.path.abspath(entry) for entry in candidates if entry):
        marker = os.path.join(path, 'util', 'omniparser.py')
        if not os.path.exists(marker):
            continue

        logger.info(f"Loading OmniParser from {path}")
        if path not in sys.path:
            sys.path.insert(0, path)
        # Imported only now: the package is resolvable only once its
        # directory is on sys.path.
        from util.omniparser import Omniparser
        return Omniparser(path)

    raise ImportError(
        "OmniParser not found. Set OMNIPARSER_PATH or install to "
        "~/.hevolve/models/omniparser/"
    )