Coverage for integrations / vlm / local_omniparser.py: 50.8%

61 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2local_omniparser.py - Lazy-loaded OmniParser for screen parsing. 

3 

4Tier 'inprocess': imports OmniParser's Omniparser class directly (singleton). 

5Tier 'http': HTTP POST to localhost:8080/parse/ (existing FastAPI endpoint). 

6 

7OmniParser detects UI elements (buttons, text fields, icons) from a screenshot 

8using YOLO detection + Florence captioning, returning labeled bounding boxes. 

9""" 

10 

11import os 

12import sys 

13import logging 

14 

# Module logger; the name is a fixed string rather than derived from __name__.
logger = logging.getLogger('hevolve.vlm.omniparser')

# Singleton for in-process OmniParser (models are GPU-heavy, load once).
# Stays None until _parse_inprocess() loads it via _load_omniparser().
_parser_instance = None
# Lock used around creation of _parser_instance. Stays None until the first
# call to _get_lock(), which creates it lazily.
_parser_lock = None

20 

21 

def _get_lock():
    """Return the module-wide parser lock, creating it on first use.

    The lock is built lazily so that merely importing this module does not
    import ``threading`` or allocate a lock.

    Returns:
        The single ``threading.Lock`` stored in ``_parser_lock``.
    """
    global _parser_lock
    if _parser_lock is not None:
        return _parser_lock

    # First call: import threading only now, then remember the lock.
    from threading import Lock
    _parser_lock = Lock()
    return _parser_lock

29 

30 

def parse_screen(screenshot_b64: str, tier: str) -> dict:
    """
    Turn a screenshot into structured UI elements via the selected backend.

    Args:
        screenshot_b64: Base64-encoded PNG screenshot.
        tier: 'inprocess' uses the directly imported OmniParser; any other
            value (normally 'http') goes through the HTTP endpoint.

    Returns:
        The backend's result dict. Expected keys:
            - 'screen_info': str - formatted ID -> label text for LLM consumption
            - 'parsed_content_list': list - [{type, content, bbox, idx}, ...]
            - 'som_image_base64': str - labeled screenshot with bounding boxes
            - 'original_screenshot_base64': str - original screenshot
            - 'width': int, 'height': int - screen dimensions
            - 'latency': float - parse time in seconds
    """
    # Only the exact string 'inprocess' selects the in-process backend.
    backend = _parse_inprocess if tier == 'inprocess' else _parse_http
    return backend(screenshot_b64)

51 

52 

def _parse_inprocess(screenshot_b64: str) -> dict:
    """Parse a screenshot with the in-process OmniParser singleton.

    The parser is loaded on first call via ``_load_omniparser()`` and reused
    afterwards. Any exception from loading or parsing propagates to the
    caller.

    Args:
        screenshot_b64: Base64-encoded screenshot passed to the parser.

    Returns:
        The dict returned by the parser's ``parse()``. 'latency' and
        'original_screenshot_base64' are filled in when the parser did not
        supply them.
    """
    global _parser_instance
    import time

    # Only singleton creation is serialised; parse() runs outside the lock.
    with _get_lock():
        if _parser_instance is None:
            _parser_instance = _load_omniparser()

    # perf_counter() is monotonic, so the measured latency cannot jump or go
    # negative when the wall clock is adjusted (unlike time.time()).
    started = time.perf_counter()
    result = _parser_instance.parse(screenshot_b64)
    elapsed = time.perf_counter() - started

    # Ensure consistent keys without overriding parser-provided values.
    if 'latency' not in result:
        result['latency'] = elapsed
    if 'original_screenshot_base64' not in result:
        result['original_screenshot_base64'] = screenshot_b64

    return result

73 

74 

def _parse_http(screenshot_b64: str) -> dict:
    """Parse via HTTP POST to OmniParser FastAPI server.

    The base URL comes from the ``OMNIPARSER_URL`` environment variable
    (default ``http://localhost:8080``); the request goes to ``<url>/parse/``
    with a 30 second timeout.

    Falls back gracefully when OmniParser is unavailable — returns empty
    parsed_content_list so the LLM reasons from the raw screenshot alone
    (slower but functional). The fallback covers request failures, HTTP error
    statuses, invalid JSON, and JSON bodies that are not an object.

    Args:
        screenshot_b64: Base64-encoded screenshot, sent as 'base64_image'.

    Returns:
        The server's JSON object, or the fallback dict. 'latency' and
        'original_screenshot_base64' are filled in when absent.
    """
    import time
    import requests
    from core.http_pool import pooled_post

    omni_url = os.environ.get('OMNIPARSER_URL', 'http://localhost:8080')
    # perf_counter() is monotonic, so latency is immune to wall-clock changes.
    start = time.perf_counter()

    try:
        resp = pooled_post(
            f'{omni_url.rstrip("/")}/parse/',
            json={'base64_image': screenshot_b64},
            timeout=30
        )
        resp.raise_for_status()
        result = resp.json()
        # A valid-but-non-object payload (list, string, null) would crash the
        # key handling below; route it into the same fallback path instead.
        if not isinstance(result, dict):
            raise ValueError(
                f"unexpected JSON payload of type {type(result).__name__}"
            )
    except (requests.RequestException, ValueError) as e:
        logger.warning(
            f"OmniParser unavailable ({e}), falling back to raw screenshot. "
            f"LLM will reason without UI element labels."
        )
        result = {
            'screen_info': '',
            'parsed_content_list': [],
            'som_image_base64': screenshot_b64,
        }

    latency = time.perf_counter() - start

    # Ensure consistent keys without overriding server-provided values.
    if 'latency' not in result:
        result['latency'] = latency
    if 'original_screenshot_base64' not in result:
        result['original_screenshot_base64'] = screenshot_b64

    return result

116 

117 

def _load_omniparser():
    """
    Locate an OmniParser checkout and build the parser object from it.

    Candidate roots are tried in this order:
      1. The OMNIPARSER_PATH environment variable (skipped when empty/unset)
      2. ~/.hevolve/models/omniparser/
      3. An ``OmniParser`` directory three levels above this file's directory
         (dev layout)

    A root qualifies when it contains ``util/omniparser.py``. The first
    qualifying root is put at the front of ``sys.path`` (if not already on
    it) and ``Omniparser`` is imported from ``util.omniparser``.

    Returns:
        ``Omniparser(root)`` for the first qualifying root.

    Raises:
        ImportError: if no candidate root qualifies.
    """
    env_root = os.environ.get('OMNIPARSER_PATH', '')
    home_root = os.path.join(
        os.path.expanduser('~'), '.hevolve', 'models', 'omniparser'
    )
    dev_root = os.path.join(
        os.path.dirname(__file__), '..', '..', '..', 'OmniParser'
    )

    for candidate in (env_root, home_root, dev_root):
        if not candidate:
            continue

        root = os.path.abspath(candidate)
        marker = os.path.join(root, 'util', 'omniparser.py')
        if not os.path.exists(marker):
            continue

        logger.info(f"Loading OmniParser from {root}")
        # The import below resolves 'util' relative to the checkout root.
        if root not in sys.path:
            sys.path.insert(0, root)
        from util.omniparser import Omniparser
        return Omniparser(root)

    raise ImportError(
        "OmniParser not found. Set OMNIPARSER_PATH or install to "
        "~/.hevolve/models/omniparser/"
    )
148 )