Coverage for integrations / vlm / local_loop.py: 72.5%

258 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2local_loop.py - Synchronous agentic loop for VLM execution. 

3 

4Equivalent to OmniParser's sampling_loop_sync() (loop.py) but without Twisted. 

5Orchestrates: screenshot → parse → LLM reason → execute action → repeat. 

6 

7Uses the same LLM config as create_recipe.py:285-300 (HEVOLVE_NODE_TIER aware). 

8Produces the same response format as Crossbar: {status, extracted_responses, ...}. 

9""" 

10 

import os
import json
import platform
import time
import logging
import re

# Module-scoped logger under the project's 'hevolve.vlm' hierarchy so handlers
# configured for the parent logger pick these records up.
logger = logging.getLogger('hevolve.vlm.local_loop')

# NOTE(review): `json` and `re` are not referenced in this chunk of the file —
# confirm they are used elsewhere before assuming they are dead imports.

# Max iterations to prevent infinite loops (same safeguard as OmniParser)
MAX_ITERATIONS = 30

22 

23# Action list — single source of truth for both the legacy SYSTEM_PROMPT 

24# and the unified-mode combined_prompt. Keeping one string means the 

25# legacy OmniParser path and the unified Qwen3-VL path can never drift 

26# on which actions the model is allowed to emit. 

# Action list — single source of truth for both the legacy SYSTEM_PROMPT
# and the unified-mode combined_prompt. Keeping one string means the
# legacy OmniParser path and the unified Qwen3-VL path can never drift
# on which actions the model is allowed to emit.
# Runtime text: embedded verbatim in every prompt sent to the model, so
# any wording change here changes model behavior on both paths.
_VLM_ACTION_LIST: str = (
    "Available actions:\n"
    "- GUI: left_click, right_click, double_click, type, key, hotkey, hover, "
    "mouse_move, wait, scroll_up, scroll_down\n"
    "- Deterministic (PREFER these when the task is expressible as a "
    "command — they're 100x faster than GUI grounding):\n"
    " * shell: run any shell/PowerShell/bash command. Use for launching "
    "apps (command='notepad'), opening files in specific apps "
    "(command='notepad hello.txt'), running git/npm/python, file ops, etc. "
    "Put the full command in the 'command' field.\n"
    " * open_file_gui: open a file or app in the OS default handler. "
    "Put the target in the 'path' field (e.g. path='notepad' or "
    "path='C:\\\\Users\\\\foo\\\\doc.pdf').\n"
    "- File: list_folders_and_files, Open_file_and_copy_paste, write_file, "
    "read_file_and_understand\n"
)

43 

# System prompt matching OmniParser vlm_agent.py _get_system_prompt()
# Captured once at import time — the loop always reports the host OS
# this process runs on, not the OS named in message['os_to_control'].
_os_name = platform.system()  # 'Windows', 'Linux', 'Darwin', etc.
# Used only by the legacy (non-unified) path; the unified Qwen3-VL path
# builds its own combined_prompt inline but shares _VLM_ACTION_LIST above.
SYSTEM_PROMPT: str = (
    "You are using a " + _os_name + " device.\n"
    "You are able to use a mouse and keyboard to interact with the computer "
    "based on the given task and screenshot.\n"
    "You have access to every app running in the device via the mouse and "
    "keyboard interfaces mentioned above for GUI actions.\n"
    "\n"
    + _VLM_ACTION_LIST +
    "\n"
    "IMPORTANT: Prefer deterministic actions (shell, open_file_gui) over "
    "clicking when the task is expressible as a command. Only fall back to "
    "clicks for things that MUST be done visually (e.g. clicking a specific "
    "button inside an already-running app's UI that has no keyboard "
    "shortcut). After the first action, verify the expected outcome on screen "
    "before taking any new action.\n"
    "\n"
    "Output your response in JSON format:\n"
    '{\n'
    ' "Reasoning": "Brief explanation of what you see and why this action is needed",\n'
    ' "Next Action": "action_name or None if task is complete",\n'
    ' "Box ID": <element_id if clicking an element>,\n'
    ' "coordinate": [x, y],\n'
    ' "value": "text for type/hotkey actions",\n'
    ' "command": "shell command string when Next Action is shell",\n'
    ' "path": "file or app name when Next Action is open_file_gui",\n'
    ' "Status": "IN_PROGRESS or DONE"\n'
    '}\n'
    "\n"
    'When the task is complete, set "Next Action": "None" and "Status": "DONE".\n'
)

76 

77 

# ─── Stop registry — port of OmniParser agentic_rpc.app_state["active_sessions"] ───
# When the VLM is mid-loop on the user's screen and the user clicks
# the indicator window's Stop button, Nunba POSTs to /api/vlm/stop on
# HARTOS. That handler calls request_stop() below, which sets the
# user's threading.Event. The next iteration of run_local_agentic_loop
# checks the event via _is_stop_requested() and exits cleanly with
# exit_reason='stopped' instead of running another action on the user's
# screen.
#
# Why threading.Event: pyautogui actions inside an iteration are
# already synchronous on the loop's thread, so we can't preempt mid-
# action. But every action has natural seams (between iterations and
# after each pyautogui call), and Event.is_set() is a cheap atomic
# check we can sprinkle there without locking.
#
# Why per-(user_id, prompt_id) key: same instance can have multiple
# concurrent VLM sessions if more than one user is connected. Stop
# fires on a specific session, not globally, mirroring OmniParser's
# active_sessions dict shape.
import threading as _threading

# Registry of live sessions. Every access goes through _vlm_stop_lock;
# see _register_session/_unregister_session/_is_stop_requested/request_stop.
_vlm_stop_flags: dict = {}  # f"{user_id}:{prompt_id}" -> Event
_vlm_stop_lock = _threading.Lock()

101 

102 

103def _stop_key(user_id: str, prompt_id: str) -> str: 

104 return f"{user_id}:{prompt_id}" 

105 

106 

def _register_session(user_id: str, prompt_id: str) -> _threading.Event:
    """Create or reset the stop Event for this session.

    Called by run_local_agentic_loop on entry so that a later
    /api/vlm/stop POST can flip the flag. Reuses an Event left over
    from a prior run (clearing it first) so no stale stop leaks in.
    """
    key = _stop_key(user_id, prompt_id)
    with _vlm_stop_lock:
        existing = _vlm_stop_flags.get(key)
        if existing is not None:
            # Prior session's flag — reset so this run starts un-stopped
            # while preserving the one-Event-per-key invariant.
            existing.clear()
            return existing
        fresh = _threading.Event()
        _vlm_stop_flags[key] = fresh
        return fresh

122 

123 

def _unregister_session(user_id: str, prompt_id: str) -> None:
    """Drop this session's stop Event on loop exit (success or stop).

    Keeps the registry from growing without bound across runs. Safe to
    call when the session was never registered.
    """
    with _vlm_stop_lock:
        _vlm_stop_flags.pop(_stop_key(user_id, prompt_id), None)

130 

131 

def _is_stop_requested(user_id: str, prompt_id: str) -> bool:
    """Return True when /api/vlm/stop has flagged this session.

    Cheap — called at iteration boundaries inside the loop.
    """
    with _vlm_stop_lock:
        flag = _vlm_stop_flags.get(_stop_key(user_id, prompt_id))
        return flag is not None and flag.is_set()

138 

139 

def request_stop(user_id: str, prompt_id: str) -> bool:
    """Public API — called by /api/vlm/stop in hart_intelligence_entry.py.

    Sets the stop flag on a registered session. Returns True when a
    matching session was found, False when the user has no active VLM
    loop (caller logs accordingly so the UI can distinguish "stopped"
    from "nothing to stop").

    Pairs with the loop's iteration-boundary check at the top of every
    iteration. Stop becomes visible to the loop on its NEXT iteration
    — typically within 1-3 seconds depending on which step is in
    flight (screenshot, LLM call, action execution).
    """
    with _vlm_stop_lock:
        flag = _vlm_stop_flags.get(_stop_key(user_id, prompt_id))
        if flag is None:
            # No live loop under this key — nothing to signal.
            return False
        flag.set()
        return True

160 

161 

def list_active_sessions() -> list:
    """Return [(user_id, prompt_id), ...] for currently-running VLM loops.

    Used by /api/vlm/stop with no payload to bulk-stop, and by
    diagnostics. Splits each registry key on its first ':' only, so a
    user_id containing ':' still round-trips through _stop_key.
    """
    with _vlm_stop_lock:
        keys = list(_vlm_stop_flags)
    return [tuple(key.split(':', 1)) for key in keys]

168 

169 

def run_local_agentic_loop(
    message: dict,
    tier: str,
    max_iterations: int = MAX_ITERATIONS
) -> dict:
    """
    Local agentic loop: screenshot → parse → LLM reason → execute → repeat.

    Supports two modes:
    - Legacy (default): OmniParser screen parsing + separate LLM reasoning call
    - Unified (HEVOLVE_VLM_UNIFIED=true): Single Qwen3-VL call for parsing + reasoning

    Args:
        message: dict with keys from execute_windows_or_android_command:
            - instruction_to_vlm_agent: str
            - enhanced_instruction: str (optional, from recipe matching)
            - user_id: str
            - prompt_id: str
            - os_to_control: str
            - max_ETA_in_seconds: int
        tier: 'inprocess' or 'http'
    Returns:
        dict matching Crossbar response format:
        {status, exit_reason, extracted_responses, execution_time_seconds}
        where status is 'success' only when exit_reason == 'done';
        exit_reason is one of 'done', 'stopped', 'timeout',
        'action_error', 'max_iterations'.
    """
    from integrations.vlm.local_computer_tool import take_screenshot, execute_action

    instruction = message.get('instruction_to_vlm_agent', '')
    # enhanced defaults to the raw instruction when no recipe enhancement exists.
    enhanced = message.get('enhanced_instruction', instruction)
    user_id = message.get('user_id', '')
    prompt_id = message.get('prompt_id', '')
    max_eta = message.get('max_ETA_in_seconds', 1800)

    # exit_reason is overwritten as the loop progresses. Defaults to max_iterations
    # so a loop that runs to the iteration cap without a DONE signal is honest
    # about it to the caller (instead of pretending status='success').
    exit_reason = 'max_iterations'
    consecutive_action_errors = 0

    # Detect unified Qwen3-VL mode
    use_unified = os.environ.get('HEVOLVE_VLM_UNIFIED', '').lower() in ('1', 'true')

    if use_unified:
        from integrations.vlm.qwen3vl_backend import get_qwen3vl_backend
        qwen3vl = get_qwen3vl_backend()
        logger.info(
            f"Starting unified VLM loop (Qwen3-VL, tier={tier}, user={user_id}, "
            f"prompt={prompt_id}): {instruction[:100]}"
        )
    else:
        from integrations.vlm.local_omniparser import parse_screen
        qwen3vl = None
        logger.info(
            f"Starting local VLM loop (tier={tier}, user={user_id}, "
            f"prompt={prompt_id}): {instruction[:100]}"
        )

    # Phase 3.5 wire-up: classify the task with the complementary path
    # router and use it to size the iteration budget. Single-shot
    # tasks ("click X") shouldn't burn the full 30-iter budget when
    # one click satisfies the goal — the multi-iter loop's overhead
    # is real (per-iter screenshot + VLM call ~3-5s). Multi-step
    # tasks get the full caller-supplied max_iterations.
    _route = 'multi_step'  # safe default — never over-cap a real loop
    try:
        if qwen3vl is not None:
            _route = qwen3vl.route_task(instruction or enhanced)
            logger.info(f"VLM loop route_task: '{instruction[:60]}' → {_route}")
            if _route == 'single_shot' and max_iterations > 3:
                # Cap at 3 — gives one nudge-retry + one followup
                # if the click misses without burning the full budget.
                max_iterations = 3
            elif _route == 'enumerate' and max_iterations > 1:
                # Enumerate = parse_and_reason snapshot, no follow-up
                # iter needed.
                max_iterations = 1
    except Exception as e:
        logger.debug(f'route_task wire-up skipped: {e}')

    # Build conversation messages for LLM (legacy path only appends to this;
    # the unified path builds its own single-turn prompt each iteration).
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": enhanced},
    ]

    extracted_responses = []
    start_time = time.time()

    # Register this session in the stop registry so /api/vlm/stop can
    # signal it. Cleanup happens just before the final return below
    # (no try/finally — the existing iteration body wraps every error
    # in its own try/continue so exceptions never escape this scope).
    _register_session(user_id, prompt_id)

    for iteration in range(max_iterations):
        # User-requested stop wins over every other exit condition.
        # Check FIRST so a stop fired during the previous iteration's
        # action lands at this seam without one more click happening.
        if _is_stop_requested(user_id, prompt_id):
            logger.info(
                f"VLM loop stopped by /api/vlm/stop at iteration "
                f"{iteration + 1} (user={user_id}, prompt={prompt_id})"
            )
            exit_reason = 'stopped'
            break

        # Wall-clock budget check — max_eta comes from the caller's message.
        elapsed = time.time() - start_time
        if elapsed > max_eta:
            logger.warning(f"VLM loop hit ETA limit ({max_eta}s) at iteration {iteration}")
            exit_reason = 'timeout'
            break

        logger.info(f"VLM loop iteration {iteration + 1}/{max_iterations}")

        try:
            # 1. Take screenshot
            screenshot_b64 = take_screenshot(tier)

            if use_unified and qwen3vl is not None:
                # ── Single VLM call: plan step + ground coordinates in one prompt ──
                # One image encoding (~500 visual tokens) instead of two.
                # Halves latency: ~10s per step instead of ~20s.
                from integrations.vlm.local_computer_tool import VLM_IMG_W, VLM_IMG_H

                # Taskbar pre-check (additive — restores point_and_act's
                # smart strategy that 8fa6e97 dropped when this loop
                # adopted its own inline prompt). When the task targets
                # a taskbar item ("open Chrome", "click Start", etc.),
                # _taskbar_list_lookup short-circuits the VLM call
                # entirely and returns a click coord direct from the
                # taskbar enumeration — typically <1s vs the 5-10s a
                # full VLM grounding takes. On miss, returns None and
                # the existing inline prompt path runs unchanged.
                _step_started = time.time()
                try:
                    import pyautogui as _pag_pre
                    _sw_pre, _sh_pre = _pag_pre.size()
                except Exception:
                    # Headless / no display — skip the pre-check entirely.
                    _sw_pre = _sh_pre = None
                _taskbar_action = None
                if _sw_pre and _sh_pre:
                    try:
                        _taskbar_action = qwen3vl.try_taskbar_pre_check(
                            screenshot_b64, enhanced,
                            _sw_pre, _sh_pre, _step_started,
                        )
                    except Exception as _tb_err:
                        logger.debug(
                            f"taskbar_pre_check failed (non-fatal): {_tb_err}")
                if _taskbar_action is not None:
                    # Single source of truth for "point_and_act result
                    # -> action_json shape" conversion. Was inline
                    # 14 lines duplicating the dict construction.
                    action_json = _point_action_to_action_json(_taskbar_action)
                    raw = _taskbar_action.get('raw', '')
                    logger.info(
                        f"Loop: taskbar_list shortcut → "
                        f"({_taskbar_action.get('screen_x')},"
                        f"{_taskbar_action.get('screen_y')})"
                    )
                    # Fall through to the existing post-action handling
                    # below (which executes action_json + records it).
                    # Skip the combined_prompt + _call_api block.
                    _skip_combined_prompt = True
                else:
                    _skip_combined_prompt = False

                # Skip the heavy combined-prompt VLM call entirely when
                # taskbar_pre_check above already produced a click —
                # the taskbar lookup is the authoritative grounding for
                # taskbar tasks (point_and_act has used the same
                # short-circuit since cb92a2e). Without this guard the
                # _call_api below would overwrite action_json with a
                # less-grounded result.
                if not _skip_combined_prompt:
                    combined_prompt = (
                        f"You are a computer use agent on {_os_name}.\n"
                        f"Task: {enhanced}\n\n"
                    )
                    if extracted_responses:
                        # Feed the previous action back so the model can
                        # verify its outcome on the new screenshot.
                        last = extracted_responses[-1].get('content', '')
                        if isinstance(last, dict):
                            combined_prompt += (
                                f"Previous action: {last.get('action', '?')} — "
                                f"{last.get('reasoning', '')[:80]}.\n"
                                f"Check the screenshot: did it succeed?\n\n"
                            )
                    combined_prompt += (
                        _VLM_ACTION_LIST +
                        "\n"
                        "What is the SINGLE next action? Respond in JSON ONLY:\n"
                        "{\n"
                        ' "Reasoning": "What you see and why this action",\n'
                        ' "Next Action": "left_click|right_click|double_click|'
                        'type|key|hotkey|scroll_up|scroll_down|wait|shell|'
                        'open_file_gui|None",\n'
                        ' "coordinate": [x, y],\n'
                        ' "value": "text to type or key name",\n'
                        ' "command": "shell command when Next Action is shell",\n'
                        ' "path": "file or app name when Next Action is open_file_gui",\n'
                        ' "Status": "IN_PROGRESS|DONE"\n'
                        "}\n\n"
                        "For click actions: provide <point>x,y</point> normalized "
                        "0-1000 coordinates.\n"
                        "For type/key/hotkey: set coordinate to null, put text in value.\n"
                        "Only fall back to clicks when the task requires interacting "
                        "with something already visible on screen that cannot be "
                        "done via a command.\n"
                        'When task is complete: "Next Action": "None", "Status": "DONE".'
                    )

                    raw = qwen3vl._call_api([{
                        "role": "user",
                        "content": [
                            {"type": "text", "text": combined_prompt},
                            {"type": "image_url", "image_url": {
                                "url": f"data:image/jpeg;base64,{screenshot_b64}"}},
                        ]
                    }])
                    # Guard against None (e.g. thinking-only response with no content)
                    if raw is None:
                        raw = ''
                    action_json = _parse_vlm_response(raw)

                # Extract coordinates from <point>x,y</point> if present in raw
                next_action = action_json.get('Next Action', 'None')
                _CLICK_ACTIONS = {'left_click', 'right_click', 'double_click',
                                  'middle_click', 'hover', 'mouse_move'}

                if next_action in _CLICK_ACTIONS:
                    # Phase 5 follow-through: was a 4th inline <point>
                    # regex parser duplicating parser._parse_point_shape.
                    # Now delegates to the canonical parser so the
                    # action_json JSON-coordinate vs the raw <point>
                    # tag agree (they previously could disagree when
                    # the JSON had Box ID + the raw text had a point).
                    nx, ny = _extract_click_coord(raw, action_json)

                    # Scale from 1000-normalized or image space to screen space
                    try:
                        import pyautogui as _pag
                        _sw, _sh = _pag.size()
                        if nx <= 1000 and ny <= 1000:
                            # Normalized 0-1000 coords
                            screen_x = int(nx * _sw / 1000)
                            screen_y = int(ny * _sh / 1000)
                        else:
                            # Image pixel coords
                            screen_x = int(nx * _sw / VLM_IMG_W)
                            screen_y = int(ny * _sh / VLM_IMG_H)
                    except Exception as _scale_err:
                        logger.debug(f"coord scale to screen failed: {_scale_err}")
                        screen_x, screen_y = nx, ny
                    action_json['coordinate'] = [screen_x, screen_y]
                    logger.info(f"Action: {next_action} at ({screen_x},{screen_y}) "
                                f"norm=({nx},{ny})")

                    # Bias-detection + elimination retry — additive
                    # restoration of point_and_act's strategy 3 that
                    # 8fa6e97 dropped when this loop adopted its own
                    # inline prompt. Catches center/bottom/top-edge
                    # hallucinations in the 0-1000 normalized coords
                    # the loop just produced and reissues the VLM with
                    # an elimination prompt that explicitly forbids
                    # the suspect region. Skipped when the action
                    # came from taskbar_pre_check (its coords are
                    # already lookup-grounded, no need to retry).
                    # All wrapped in try/except so a retry-time error
                    # NEVER takes down the iteration — original coords
                    # remain in action_json.
                    if action_json.get('_strategy') != 'taskbar_list':
                        try:
                            _bias = qwen3vl.detect_grounding_bias(
                                nx, ny, 'left_click', enhanced,
                            )
                            if _bias:
                                _retry = qwen3vl.retry_with_elimination(
                                    screenshot_b64, enhanced,
                                    VLM_IMG_W, VLM_IMG_H, _bias,
                                )
                                if _retry is not None:
                                    _r_dict, _enx, _eny = _retry
                                    nx, ny = _enx, _eny
                                    # Re-scale retry coords to screen
                                    # space using the same rule as the
                                    # original (0-1000 vs image-pixel).
                                    try:
                                        import pyautogui as _pag_r
                                        _swr, _shr = _pag_r.size()
                                        if nx <= 1000 and ny <= 1000:
                                            screen_x = int(nx * _swr / 1000)
                                            screen_y = int(ny * _shr / 1000)
                                        else:
                                            screen_x = int(nx * _swr / VLM_IMG_W)
                                            screen_y = int(ny * _shr / VLM_IMG_H)
                                    except Exception:
                                        screen_x, screen_y = nx, ny
                                    action_json['coordinate'] = [
                                        screen_x, screen_y,
                                    ]
                                    action_json['_strategy'] = (
                                        'elimination_retry'
                                    )
                                    logger.info(
                                        f"Loop bias retry ({_bias}) → "
                                        f"({screen_x},{screen_y}) "
                                        f"norm=({nx},{ny})"
                                    )
                        except Exception as _bias_err:
                            logger.debug(
                                f"bias retry failed (non-fatal, "
                                f"keeping original coords): {_bias_err}"
                            )
                    # Sanity check: flag clicks in the likely taskbar region.
                    # If the VLM's reasoning talks about a Start menu item or
                    # app window but the coordinate lands in the bottom 50px,
                    # the grounding probably drifted onto the taskbar strip.
                    # We log a warning and let the verify step catch it; the
                    # router will see exit_reason=action_error if this pattern
                    # keeps happening, so it can respond honestly.
                    try:
                        import pyautogui as _pag2
                        _sw2, _sh2 = _pag2.size()
                        reasoning_lc = (action_json.get('Reasoning') or '').lower()
                        if (screen_y >= _sh2 - 50
                                and any(t in reasoning_lc for t in
                                        ('start menu', 'menu item', 'recommended', 'pinned'))):
                            logger.warning(
                                f"VLM click ({screen_x},{screen_y}) is in taskbar "
                                f"region (screen height={_sh2}), but reasoning "
                                f"mentions Start menu — probable grounding drift"
                            )
                    except Exception:
                        pass
                else:
                    # Non-click action: coordinate is meaningless, null it out.
                    # NOTE(review): if 'value' is explicitly None in action_json,
                    # the [:50] slice below would raise — confirm the parser
                    # always emits a string here.
                    action_json['coordinate'] = None
                    logger.info(f"Action: {next_action} "
                                f"value='{action_json.get('value', '')[:50]}'")

                # Unified path has no OmniParser output; keep the shape
                # _build_action_payload expects (empty element list).
                parsed = {'screen_info': '', 'parsed_content_list': []}
            else:
                # ── Legacy path: OmniParser + separate LLM call ──
                # 2. Parse UI elements
                parsed = parse_screen(screenshot_b64, tier)
                screen_info = parsed.get('screen_info', '')

                # 3. Build LLM prompt with current screen state
                user_content = _build_vision_prompt(screen_info, screenshot_b64, iteration)
                messages.append({"role": "user", "content": user_content})

                # 4. Call local LLM for reasoning
                llm_response = _call_local_llm(messages)
                action_json = _parse_vlm_response(llm_response)

                # Record the assistant response
                messages.append({"role": "assistant", "content": llm_response})

            logger.info(f"VLM action: {action_json.get('Next Action', 'None')}")

            # Check if task is complete
            next_action = action_json.get('Next Action', 'None')
            status = action_json.get('Status', 'IN_PROGRESS')

            if next_action == 'None' or next_action is None or status == 'DONE':
                logger.info("VLM task completed")
                extracted_responses.append({
                    "type": "completion",
                    "content": action_json.get('Reasoning', 'Task completed'),
                    "iteration": iteration + 1,
                })
                exit_reason = 'done'
                break

            # 5. Execute the action.
            # Phase 6 wire-up: pass safety=True so the per-session cap
            # + window blocklist + audit JSONL fire on every loop click.
            # Verify=True triggers the post-click pre/post diff + 50px
            # nudge retry from Phase 4. Both default-tunable via env
            # but ON in the loop is the right safe default — solo
            # /visual_agent calls keep their existing behaviour.
            action_payload = _build_action_payload(action_json, parsed)
            _safety_on = os.environ.get(
                'HEVOLVE_VLM_LOOP_SAFETY', '1').lower() not in ('0', 'false', 'no')
            _verify_on = os.environ.get(
                'HEVOLVE_VLM_LOOP_VERIFY', '0').lower() in ('1', 'true', 'yes')
            result = execute_action(
                action_payload, tier,
                safety=_safety_on, verify=_verify_on)
            action_ok = result.get('status') != 'error'
            if action_ok:
                consecutive_action_errors = 0
            else:
                consecutive_action_errors += 1

            # Surface coordinate + strategy in the response content so
            # observers (benchmark, audit, /visual_agent telemetry,
            # post-hoc replay) can reconstruct what the VLM actually
            # decided this iteration without re-parsing action_json.
            # Was missing - vlm_grounding_benchmark.py:loop_one_iter
            # path always read content['coordinate'] = None and scored
            # all 6 targets as FAIL, hiding any real grounding regression
            # behind a fixed metric.
            extracted_responses.append({
                "type": "action",
                "content": {
                    "action": next_action,
                    "reasoning": action_json.get('Reasoning', ''),
                    "result": result.get('output', ''),
                    "ok": action_ok,
                    "coordinate": action_json.get('coordinate'),
                    "_strategy": action_json.get('_strategy', 'inline_prompt'),
                },
                "iteration": iteration + 1,
            })

            # Bail after 3 consecutive action errors — something is structurally
            # broken (bad coordinates, action type mismatch, subprocess dead)
            # and more iterations won't help.
            if consecutive_action_errors >= 3:
                logger.warning("VLM loop: 3 consecutive action errors, aborting")
                exit_reason = 'action_error'
                break

            # Small delay between iterations (let UI update)
            time.sleep(0.5)

        except Exception as e:
            logger.error(f"VLM loop iteration {iteration + 1} error: {e}")
            extracted_responses.append({
                "type": "error",
                "content": str(e),
                "iteration": iteration + 1,
            })
            consecutive_action_errors += 1
            if consecutive_action_errors >= 3:
                logger.warning("VLM loop: 3 consecutive iteration errors, aborting")
                exit_reason = 'action_error'
                break
            # Continue to next iteration rather than aborting
            continue

    execution_time = time.time() - start_time
    logger.info(
        f"VLM loop finished: {len(extracted_responses)} actions in "
        f"{execution_time:.1f}s (exit_reason={exit_reason})"
    )

    # Drop this session's stop flag so the registry doesn't grow
    # across runs. Pairs with _register_session above.
    _unregister_session(user_id, prompt_id)

    # status mirrors exit_reason: only 'done' is a real success. Callers
    # (LangChain router, autogen) can inspect exit_reason to craft an honest
    # response instead of confidently lying when the loop timed out.
    # 'stopped' is its own honest exit_reason — Nunba's indicator UX
    # reads it to render the right "Stopped" badge instead of a
    # generic "incomplete".
    return {
        "status": "success" if exit_reason == 'done' else "incomplete",
        "exit_reason": exit_reason,
        "extracted_responses": extracted_responses,
        "execution_time_seconds": execution_time,
    }

633 

634 

635def _build_vision_prompt(screen_info: str, screenshot_b64: str, iteration: int) -> list: 

636 """Build multimodal prompt with screen info + screenshot image.""" 

637 content = [] 

638 

639 if iteration == 0: 

640 content.append({ 

641 "type": "text", 

642 "text": ( 

643 "Here is the current screen state. " 

644 "Analyze the UI elements and decide the next action.\n\n" 

645 f"UI Elements:\n{screen_info}" 

646 ), 

647 }) 

648 else: 

649 content.append({ 

650 "type": "text", 

651 "text": ( 

652 "Here is the updated screen after the previous action. " 

653 "Verify the previous action succeeded, then decide the next action.\n\n" 

654 f"UI Elements:\n{screen_info}" 

655 ), 

656 }) 

657 

658 # Add screenshot as image 

659 content.append({ 

660 "type": "image_url", 

661 "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}, 

662 }) 

663 

664 return content 

665 

666 

def _call_local_llm(messages: list) -> str:
    """
    Call local LLM using the same config as create_recipe.py:285-300.

    Uses OpenAI-compatible API (llama.cpp / Qwen3-VL / cloud endpoint).
    Endpoint selection cascades: VLM-specific env override, then the
    global AutoGen LLM config, then OpenAI, then the local llama.cpp /
    Qwen3-VL server from the port registry. Raises on any HTTP or
    transport failure after logging it.
    """
    import requests as _req

    # NOTE(review): node_tier is read but not used below — kept for parity
    # with the original; confirm whether it should influence selection.
    node_tier = os.environ.get('HEVOLVE_NODE_TIER', 'flat')

    env = os.environ
    if env.get('HEVOLVE_VLM_ENDPOINT_URL'):
        # VLM-specific override takes priority.
        base_url = env['HEVOLVE_VLM_ENDPOINT_URL']
        model = env.get('HEVOLVE_VLM_MODEL_NAME',
                        env.get('HEVOLVE_LLM_MODEL_NAME', 'gpt-4.1-mini'))
        api_key = env.get('HEVOLVE_VLM_API_KEY',
                          env.get('HEVOLVE_LLM_API_KEY', 'dummy'))
    elif env.get('HEVOLVE_LLM_ENDPOINT_URL'):
        # Same LLM config as AutoGen (user's configured model).
        base_url = env['HEVOLVE_LLM_ENDPOINT_URL']
        model = env.get('HEVOLVE_LLM_MODEL_NAME', 'gpt-4.1-mini')
        api_key = env.get('HEVOLVE_LLM_API_KEY', 'dummy')
    elif env.get('OPENAI_API_KEY'):
        # Fall back to OpenAI API if configured (common for standalone).
        base_url = 'https://api.openai.com/v1'
        model = env.get('HEVOLVE_LLM_MODEL_NAME', 'gpt-4.1-mini')
        api_key = env['OPENAI_API_KEY']
    else:
        # Last resort: local llama.cpp / Qwen3-VL.
        from core.port_registry import get_local_llm_url
        base_url = get_local_llm_url()
        model = 'Qwen3-VL-4B-Instruct'
        api_key = 'dummy'

    request_body = {
        'model': model,
        'messages': messages,
        'max_tokens': 4096,
        'temperature': 0.0,
    }
    try:
        resp = _req.post(
            f'{base_url.rstrip("/")}/chat/completions',
            json=request_body,
            headers={'Authorization': f'Bearer {api_key}'},
            timeout=60,
        )
        resp.raise_for_status()
        payload = resp.json()
        return payload['choices'][0]['message']['content']
    except Exception as e:
        logger.error(f"Local LLM call failed: {e}")
        raise

720 

721 

722def _point_action_to_action_json(point_action: dict) -> dict: 

723 """Convert a point_and_act-shaped result (from 

724 Qwen3VLBackend.try_taskbar_pre_check / point_and_act / retry_with_ 

725 elimination) into the action_json shape the loop's post-action 

726 handler expects. 

727 

728 Single source of truth for the shape transformation - was 

729 duplicated inline in the iteration body, flagged by reviewer 

730 as remaining DRY violation after the Phase 5 parser cleanup. 

731 

732 Both shapes are documented: 

733 point_action: {action, screen_x, screen_y, norm_x, norm_y, 

734 text, done, reasoning, raw, strategy?} 

735 action_json: {Reasoning, Next Action, coordinate, value, 

736 Status, _strategy?} 

737 """ 

738 return { 

739 'Reasoning': point_action.get('reasoning', ''), 

740 'Next Action': point_action.get('action', 'left_click'), 

741 'coordinate': [ 

742 point_action.get('screen_x'), 

743 point_action.get('screen_y'), 

744 ], 

745 'value': point_action.get('text', ''), 

746 'Status': 'DONE' if point_action.get('done') else 'IN_PROGRESS', 

747 '_strategy': point_action.get('strategy', 'taskbar_list'), 

748 } 

749 

750 

def _extract_click_coord(raw: str, action_json: dict) -> tuple:
    """Pull the click target coord from the VLM response.

    Single source of truth for "where in 0-1000 norm space did the VLM
    say to click?" Delegates the ``<point>`` parsing to
    :func:`integrations.vlm.parser.parse_vlm_action`, falls back to
    ``action_json['coordinate']`` when both components are present, and
    finally to dead center (500, 500).

    Returns ``(nx, ny)`` always — never raises, never returns None.
    The center fallback is the historical behaviour the VLM loop has
    relied on since 2026-04-10.
    """
    from integrations.vlm.parser import parse_vlm_action

    parsed = parse_vlm_action(raw or '', expected_shape='point_only')
    if parsed.norm_x is not None and parsed.norm_y is not None:
        return parsed.norm_x, parsed.norm_y

    coord = action_json.get('coordinate')
    coord_usable = (
        isinstance(coord, list)
        and len(coord) == 2
        and coord[0] is not None
        and coord[1] is not None
    )
    if coord_usable:
        return coord[0], coord[1]

    return 500, 500

774 

775 

def _parse_vlm_response(response_text: str) -> dict:
    """
    Parse VLM JSON response, handling markdown code blocks and partial JSON.

    Matches OmniParser vlm_agent.py extract_data() pattern.

    Thin shim (since Phase 5) onto the canonical parser in
    :mod:`integrations.vlm.parser`: returns the historical dict shape
    (``{Next Action, Status, Reasoning, ...}``) via
    :meth:`ParsedAction.to_action_json_dict`, with the byte-equivalent
    fallback for empty / unparseable input preserved.
    """
    from integrations.vlm.parser import parse_vlm_action

    parsed = parse_vlm_action(response_text or '', expected_shape='action_json')
    return parsed.to_action_json_dict()

791 

792 

793def _build_action_payload(action_json: dict, parsed_screen: dict) -> dict: 

794 """ 

795 Convert VLM response JSON into action payload for local_computer_tool. 

796 

797 Resolves Box ID → coordinate using parsed_screen bounding boxes. 

798 """ 

799 next_action = action_json.get('Next Action', '') 

800 coordinate = action_json.get('coordinate') 

801 text = action_json.get('value', '') 

802 box_id = action_json.get('Box ID') 

803 

804 # Resolve Box ID to coordinate if no explicit coordinate given 

805 if coordinate is None and box_id is not None: 

806 parsed_list = parsed_screen.get('parsed_content_list', []) 

807 for item in parsed_list: 

808 if item.get('idx') == box_id or item.get('id') == box_id: 

809 bbox = item.get('bbox', []) 

810 if len(bbox) == 4: 

811 # Center of bounding box 

812 coordinate = [ 

813 int((bbox[0] + bbox[2]) / 2), 

814 int((bbox[1] + bbox[3]) / 2), 

815 ] 

816 break 

817 

818 payload = {'action': next_action} 

819 if coordinate: 

820 payload['coordinate'] = coordinate 

821 if text: 

822 payload['text'] = text 

823 

824 # Pass through extra keys for file/shell operations. 'command' is for 

825 # the 'shell' action and 'path' covers 'open_file_gui' — both already 

826 # live in SUPPORTED_ACTIONS so _execute_inprocess handles them natively. 

827 for key in ('path', 'source_path', 'destination_path', 'content', 

828 'duration', 'command'): 

829 if key in action_json: 

830 payload[key] = action_json[key] 

831 

832 return payload