Coverage for integrations / vlm / local_loop.py: 72.5%

258 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2local_loop.py - Synchronous agentic loop for VLM execution. 

3 

4Equivalent to OmniParser's sampling_loop_sync() (loop.py) but without Twisted. 

5Orchestrates: screenshot → parse → LLM reason → execute action → repeat. 

6 

7Uses the same LLM config as create_recipe.py:285-300 (HEVOLVE_NODE_TIER aware). 

8Produces the same response format as Crossbar: {status, extracted_responses, ...}. 

9""" 

10 

import os
import json
import platform
import time
import logging
import re

# Module-scoped logger under the project's 'hevolve.vlm' hierarchy so handlers
# configured for the parent logger pick these records up.
logger = logging.getLogger('hevolve.vlm.local_loop')

# NOTE(review): `json` and `re` are not referenced in this chunk of the file —
# confirm they are used elsewhere before assuming they are dead imports.

# Max iterations to prevent infinite loops (same safeguard as OmniParser)
MAX_ITERATIONS = 30

22 

23# Action list — single source of truth for both the legacy SYSTEM_PROMPT 

24# and the unified-mode combined_prompt. Keeping one string means the 

25# legacy OmniParser path and the unified Qwen3-VL path can never drift 

26# on which actions the model is allowed to emit. 

# Action list — single source of truth for both the legacy SYSTEM_PROMPT
# and the unified-mode combined_prompt. Keeping one string means the
# legacy OmniParser path and the unified Qwen3-VL path can never drift
# on which actions the model is allowed to emit.
# Runtime text: embedded verbatim in every prompt sent to the model, so
# any wording change here changes model behavior on both paths.
_VLM_ACTION_LIST: str = (
    "Available actions:\n"
    "- GUI: left_click, right_click, double_click, type, key, hotkey, hover, "
    "mouse_move, wait, scroll_up, scroll_down\n"
    "- Deterministic (PREFER these when the task is expressible as a "
    "command — they're 100x faster than GUI grounding):\n"
    " * shell: run any shell/PowerShell/bash command. Use for launching "
    "apps (command='notepad'), opening files in specific apps "
    "(command='notepad hello.txt'), running git/npm/python, file ops, etc. "
    "Put the full command in the 'command' field.\n"
    " * open_file_gui: open a file or app in the OS default handler. "
    "Put the target in the 'path' field (e.g. path='notepad' or "
    "path='C:\\\\Users\\\\foo\\\\doc.pdf').\n"
    "- File: list_folders_and_files, Open_file_and_copy_paste, write_file, "
    "read_file_and_understand\n"
)

43 

# System prompt matching OmniParser vlm_agent.py _get_system_prompt()
# Captured once at import time — the loop always reports the host OS
# this process runs on, not the OS named in message['os_to_control'].
_os_name = platform.system()  # 'Windows', 'Linux', 'Darwin', etc.
# Used only by the legacy (non-unified) path; the unified Qwen3-VL path
# builds its own combined_prompt inline but shares _VLM_ACTION_LIST above.
SYSTEM_PROMPT: str = (
    "You are using a " + _os_name + " device.\n"
    "You are able to use a mouse and keyboard to interact with the computer "
    "based on the given task and screenshot.\n"
    "You have access to every app running in the device via the mouse and "
    "keyboard interfaces mentioned above for GUI actions.\n"
    "\n"
    + _VLM_ACTION_LIST +
    "\n"
    "IMPORTANT: Prefer deterministic actions (shell, open_file_gui) over "
    "clicking when the task is expressible as a command. Only fall back to "
    "clicks for things that MUST be done visually (e.g. clicking a specific "
    "button inside an already-running app's UI that has no keyboard "
    "shortcut). After the first action, verify the expected outcome on screen "
    "before taking any new action.\n"
    "\n"
    "Output your response in JSON format:\n"
    '{\n'
    ' "Reasoning": "Brief explanation of what you see and why this action is needed",\n'
    ' "Next Action": "action_name or None if task is complete",\n'
    ' "Box ID": <element_id if clicking an element>,\n'
    ' "coordinate": [x, y],\n'
    ' "value": "text for type/hotkey actions",\n'
    ' "command": "shell command string when Next Action is shell",\n'
    ' "path": "file or app name when Next Action is open_file_gui",\n'
    ' "Status": "IN_PROGRESS or DONE"\n'
    '}\n'
    "\n"
    'When the task is complete, set "Next Action": "None" and "Status": "DONE".\n'
)

76 

77 

# ─── Stop registry — port of OmniParser agentic_rpc.app_state["active_sessions"] ───
# When the VLM is mid-loop on the user's screen and the user clicks
# the indicator window's Stop button, Nunba POSTs to /api/vlm/stop on
# HARTOS. That handler calls request_stop() below, which sets the
# user's threading.Event. The next iteration of run_local_agentic_loop
# checks the event via _is_stop_requested() and exits cleanly with
# exit_reason='stopped' instead of running another action on the user's
# screen.
#
# Why threading.Event: pyautogui actions inside an iteration are
# already synchronous on the loop's thread, so we can't preempt mid-
# action. But every action has natural seams (between iterations and
# after each pyautogui call), and Event.is_set() is a cheap atomic
# check we can sprinkle there without locking.
#
# Why per-(user_id, prompt_id) key: same instance can have multiple
# concurrent VLM sessions if more than one user is connected. Stop
# fires on a specific session, not globally, mirroring OmniParser's
# active_sessions dict shape.
import threading as _threading

# Registry of live sessions. Every access goes through _vlm_stop_lock;
# see _register_session/_unregister_session/_is_stop_requested/request_stop.
_vlm_stop_flags: dict = {}  # f"{user_id}:{prompt_id}" -> Event
_vlm_stop_lock = _threading.Lock()

101 

102 

103def _stop_key(user_id: str, prompt_id: str) -> str: 

104 return f"{user_id}:{prompt_id}" 

105 

106 

def _register_session(user_id: str, prompt_id: str) -> _threading.Event:
    """Create or reset the stop Event for this session.

    Called by run_local_agentic_loop on entry so that a later
    /api/vlm/stop POST can flip the flag. Reuses an Event left over
    from a prior run (clearing it first) so no stale stop leaks in.
    """
    key = _stop_key(user_id, prompt_id)
    with _vlm_stop_lock:
        existing = _vlm_stop_flags.get(key)
        if existing is not None:
            # Prior session's flag — reset so this run starts un-stopped
            # while preserving the one-Event-per-key invariant.
            existing.clear()
            return existing
        fresh = _threading.Event()
        _vlm_stop_flags[key] = fresh
        return fresh

122 

123 

def _unregister_session(user_id: str, prompt_id: str) -> None:
    """Drop this session's stop Event on loop exit (success or stop).

    Keeps the registry from growing without bound across runs. Safe to
    call when the session was never registered.
    """
    with _vlm_stop_lock:
        _vlm_stop_flags.pop(_stop_key(user_id, prompt_id), None)

130 

131 

def _is_stop_requested(user_id: str, prompt_id: str) -> bool:
    """Return True when /api/vlm/stop has flagged this session.

    Cheap — called at iteration boundaries inside the loop.
    """
    with _vlm_stop_lock:
        flag = _vlm_stop_flags.get(_stop_key(user_id, prompt_id))
        return flag is not None and flag.is_set()

138 

139 

def request_stop(user_id: str, prompt_id: str) -> bool:
    """Public API — called by /api/vlm/stop in hart_intelligence_entry.py.

    Sets the stop flag on a registered session. Returns True when a
    matching session was found, False when the user has no active VLM
    loop (caller logs accordingly so the UI can distinguish "stopped"
    from "nothing to stop").

    Pairs with the loop's iteration-boundary check at the top of every
    iteration. Stop becomes visible to the loop on its NEXT iteration
    — typically within 1-3 seconds depending on which step is in
    flight (screenshot, LLM call, action execution).
    """
    with _vlm_stop_lock:
        flag = _vlm_stop_flags.get(_stop_key(user_id, prompt_id))
        if flag is None:
            # No live loop under this key — nothing to signal.
            return False
        flag.set()
        return True

160 

161 

def list_active_sessions() -> list:
    """Return [(user_id, prompt_id), ...] for currently-running VLM loops.

    Used by /api/vlm/stop with no payload to bulk-stop, and by
    diagnostics. Splits each registry key on its first ':' only, so a
    user_id containing ':' still round-trips through _stop_key.
    """
    with _vlm_stop_lock:
        keys = list(_vlm_stop_flags)
    return [tuple(key.split(':', 1)) for key in keys]

168 

169 

def run_local_agentic_loop(
    message: dict,
    tier: str,
    max_iterations: int = MAX_ITERATIONS
) -> dict:
    """
    Local agentic loop: screenshot → parse → LLM reason → execute → repeat.

    Supports two modes:
    - Legacy (default): OmniParser screen parsing + separate LLM reasoning call
    - Unified (HEVOLVE_VLM_UNIFIED=true): Single Qwen3-VL call for parsing + reasoning

    Args:
        message: dict with keys from execute_windows_or_android_command:
            - instruction_to_vlm_agent: str
            - enhanced_instruction: str (optional, from recipe matching)
            - user_id: str
            - prompt_id: str
            - os_to_control: str
            - max_ETA_in_seconds: int
        tier: 'inprocess' or 'http'
    Returns:
        dict matching Crossbar response format:
        {status, exit_reason, extracted_responses, execution_time_seconds}
        where status is 'success' only when exit_reason == 'done';
        exit_reason is one of 'done', 'stopped', 'timeout',
        'action_error', 'max_iterations'.
    """
    from integrations.vlm.local_computer_tool import take_screenshot, execute_action

    instruction = message.get('instruction_to_vlm_agent', '')
    # enhanced defaults to the raw instruction when no recipe enhancement exists.
    enhanced = message.get('enhanced_instruction', instruction)
    user_id = message.get('user_id', '')
    prompt_id = message.get('prompt_id', '')
    max_eta = message.get('max_ETA_in_seconds', 1800)

    # exit_reason is overwritten as the loop progresses. Defaults to max_iterations
    # so a loop that runs to the iteration cap without a DONE signal is honest
    # about it to the caller (instead of pretending status='success').
    exit_reason = 'max_iterations'
    consecutive_action_errors = 0

    # Detect unified Qwen3-VL mode
    use_unified = os.environ.get('HEVOLVE_VLM_UNIFIED', '').lower() in ('1', 'true')

    if use_unified:
        from integrations.vlm.qwen3vl_backend import get_qwen3vl_backend
        qwen3vl = get_qwen3vl_backend()
        logger.info(
            f"Starting unified VLM loop (Qwen3-VL, tier={tier}, user={user_id}, "
            f"prompt={prompt_id}): {instruction[:100]}"
        )
    else:
        from integrations.vlm.local_omniparser import parse_screen
        qwen3vl = None
        logger.info(
            f"Starting local VLM loop (tier={tier}, user={user_id}, "
            f"prompt={prompt_id}): {instruction[:100]}"
        )

    # Phase 3.5 wire-up: classify the task with the complementary path
    # router and use it to size the iteration budget. Single-shot
    # tasks ("click X") shouldn't burn the full 30-iter budget when
    # one click satisfies the goal — the multi-iter loop's overhead
    # is real (per-iter screenshot + VLM call ~3-5s). Multi-step
    # tasks get the full caller-supplied max_iterations.
    _route = 'multi_step'  # safe default — never over-cap a real loop
    try:
        if qwen3vl is not None:
            _route = qwen3vl.route_task(instruction or enhanced)
            logger.info(f"VLM loop route_task: '{instruction[:60]}' → {_route}")
            if _route == 'single_shot' and max_iterations > 3:
                # Cap at 3 — gives one nudge-retry + one followup
                # if the click misses without burning the full budget.
                max_iterations = 3
            elif _route == 'enumerate' and max_iterations > 1:
                # Enumerate = parse_and_reason snapshot, no follow-up
                # iter needed.
                max_iterations = 1
    except Exception as e:
        logger.debug(f'route_task wire-up skipped: {e}')

    # Build conversation messages for LLM (legacy path only appends to this;
    # the unified path builds its own single-turn prompt each iteration).
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": enhanced},
    ]

    extracted_responses = []
    start_time = time.time()

    # Register this session in the stop registry so /api/vlm/stop can
    # signal it. Cleanup happens just before the final return below
    # (no try/finally — the existing iteration body wraps every error
    # in its own try/continue so exceptions never escape this scope).
    _register_session(user_id, prompt_id)

    for iteration in range(max_iterations):
        # User-requested stop wins over every other exit condition.
        # Check FIRST so a stop fired during the previous iteration's
        # action lands at this seam without one more click happening.
        if _is_stop_requested(user_id, prompt_id):
            logger.info(
                f"VLM loop stopped by /api/vlm/stop at iteration "
                f"{iteration + 1} (user={user_id}, prompt={prompt_id})"
            )
            exit_reason = 'stopped'
            break

        # Wall-clock budget check — max_eta comes from the caller's message.
        elapsed = time.time() - start_time
        if elapsed > max_eta:
            logger.warning(f"VLM loop hit ETA limit ({max_eta}s) at iteration {iteration}")
            exit_reason = 'timeout'
            break

        logger.info(f"VLM loop iteration {iteration + 1}/{max_iterations}")

        try:
            # 1. Take screenshot
            screenshot_b64 = take_screenshot(tier)

            if use_unified and qwen3vl is not None:
                # ── Single VLM call: plan step + ground coordinates in one prompt ──
                # One image encoding (~500 visual tokens) instead of two.
                # Halves latency: ~10s per step instead of ~20s.
                from integrations.vlm.local_computer_tool import VLM_IMG_W, VLM_IMG_H

                # Taskbar pre-check (additive — restores point_and_act's
                # smart strategy that 8fa6e97 dropped when this loop
                # adopted its own inline prompt). When the task targets
                # a taskbar item ("open Chrome", "click Start", etc.),
                # _taskbar_list_lookup short-circuits the VLM call
                # entirely and returns a click coord direct from the
                # taskbar enumeration — typically <1s vs the 5-10s a
                # full VLM grounding takes. On miss, returns None and
                # the existing inline prompt path runs unchanged.
                _step_started = time.time()
                try:
                    import pyautogui as _pag_pre
                    _sw_pre, _sh_pre = _pag_pre.size()
                except Exception:
                    # Headless / no display — skip the pre-check entirely.
                    _sw_pre = _sh_pre = None
                _taskbar_action = None
                if _sw_pre and _sh_pre:
                    try:
                        _taskbar_action = qwen3vl.try_taskbar_pre_check(
                            screenshot_b64, enhanced,
                            _sw_pre, _sh_pre, _step_started,
                        )
                    except Exception as _tb_err:
                        logger.debug(
                            f"taskbar_pre_check failed (non-fatal): {_tb_err}")
                if _taskbar_action is not None:
                    # Single source of truth for "point_and_act result
                    # -> action_json shape" conversion. Was inline
                    # 14 lines duplicating the dict construction.
                    action_json = _point_action_to_action_json(_taskbar_action)
                    raw = _taskbar_action.get('raw', '')
                    logger.info(
                        f"Loop: taskbar_list shortcut → "
                        f"({_taskbar_action.get('screen_x')},"
                        f"{_taskbar_action.get('screen_y')})"
                    )
                    # Fall through to the existing post-action handling
                    # below (which executes action_json + records it).
                    # Skip the combined_prompt + _call_api block.
                    _skip_combined_prompt = True
                else:
                    _skip_combined_prompt = False

                # Skip the heavy combined-prompt VLM call entirely when
                # taskbar_pre_check above already produced a click —
                # the taskbar lookup is the authoritative grounding for
                # taskbar tasks (point_and_act has used the same
                # short-circuit since cb92a2e). Without this guard the
                # _call_api below would overwrite action_json with a
                # less-grounded result.
                if not _skip_combined_prompt:
                    combined_prompt = (
                        f"You are a computer use agent on {_os_name}.\n"
                        f"Task: {enhanced}\n\n"
                    )
                    if extracted_responses:
                        # Feed the previous action back so the model can
                        # verify its outcome on the new screenshot.
                        last = extracted_responses[-1].get('content', '')
                        if isinstance(last, dict):
                            combined_prompt += (
                                f"Previous action: {last.get('action', '?')} — "
                                f"{last.get('reasoning', '')[:80]}.\n"
                                f"Check the screenshot: did it succeed?\n\n"
                            )
                    combined_prompt += (
                        _VLM_ACTION_LIST +
                        "\n"
                        "What is the SINGLE next action? Respond in JSON ONLY:\n"
                        "{\n"
                        ' "Reasoning": "What you see and why this action",\n'
                        ' "Next Action": "left_click|right_click|double_click|'
                        'type|key|hotkey|scroll_up|scroll_down|wait|shell|'
                        'open_file_gui|None",\n'
                        ' "coordinate": [x, y],\n'
                        ' "value": "text to type or key name",\n'
                        ' "command": "shell command when Next Action is shell",\n'
                        ' "path": "file or app name when Next Action is open_file_gui",\n'
                        ' "Status": "IN_PROGRESS|DONE"\n'
                        "}\n\n"
                        "For click actions: provide <point>x,y</point> normalized "
                        "0-1000 coordinates.\n"
                        "For type/key/hotkey: set coordinate to null, put text in value.\n"
                        "Only fall back to clicks when the task requires interacting "
                        "with something already visible on screen that cannot be "
                        "done via a command.\n"
                        'When task is complete: "Next Action": "None", "Status": "DONE".'
                    )

                    raw = qwen3vl._call_api([{
                        "role": "user",
                        "content": [
                            {"type": "text", "text": combined_prompt},
                            {"type": "image_url", "image_url": {
                                "url": f"data:image/jpeg;base64,{screenshot_b64}"}},
                        ]
                    }])
                    # Guard against None (e.g. thinking-only response with no content)
                    if raw is None:
                        raw = ''
                    action_json = _parse_vlm_response(raw)

                # Extract coordinates from <point>x,y</point> if present in raw
                next_action = action_json.get('Next Action', 'None')
                _CLICK_ACTIONS = {'left_click', 'right_click', 'double_click',
                                  'middle_click', 'hover', 'mouse_move'}

                if next_action in _CLICK_ACTIONS:
                    # Phase 5 follow-through: was a 4th inline <point>
                    # regex parser duplicating parser._parse_point_shape.
                    # Now delegates to the canonical parser so the
                    # action_json JSON-coordinate vs the raw <point>
                    # tag agree (they previously could disagree when
                    # the JSON had Box ID + the raw text had a point).
                    nx, ny = _extract_click_coord(raw, action_json)

                    # Scale from 1000-normalized or image space to screen space
                    try:
                        import pyautogui as _pag
                        _sw, _sh = _pag.size()
                        if nx <= 1000 and ny <= 1000:
                            # Normalized 0-1000 coords
                            screen_x = int(nx * _sw / 1000)
                            screen_y = int(ny * _sh / 1000)
                        else:
                            # Image pixel coords
                            screen_x = int(nx * _sw / VLM_IMG_W)
                            screen_y = int(ny * _sh / VLM_IMG_H)
                    except Exception as _scale_err:
                        logger.debug(f"coord scale to screen failed: {_scale_err}")
                        screen_x, screen_y = nx, ny
                    action_json['coordinate'] = [screen_x, screen_y]
                    logger.info(f"Action: {next_action} at ({screen_x},{screen_y}) "
                                f"norm=({nx},{ny})")

                    # Bias-detection + elimination retry — additive
                    # restoration of point_and_act's strategy 3 that
                    # 8fa6e97 dropped when this loop adopted its own
                    # inline prompt. Catches center/bottom/top-edge
                    # hallucinations in the 0-1000 normalized coords
                    # the loop just produced and reissues the VLM with
                    # an elimination prompt that explicitly forbids
                    # the suspect region. Skipped when the action
                    # came from taskbar_pre_check (its coords are
                    # already lookup-grounded, no need to retry).
                    # All wrapped in try/except so a retry-time error
                    # NEVER takes down the iteration — original coords
                    # remain in action_json.
                    if action_json.get('_strategy') != 'taskbar_list':
                        try:
                            _bias = qwen3vl.detect_grounding_bias(
                                nx, ny, 'left_click', enhanced,
                            )
                            if _bias:
                                _retry = qwen3vl.retry_with_elimination(
                                    screenshot_b64, enhanced,
                                    VLM_IMG_W, VLM_IMG_H, _bias,
                                )
                                if _retry is not None:
                                    _r_dict, _enx, _eny = _retry
                                    nx, ny = _enx, _eny
                                    # Re-scale retry coords to screen
                                    # space using the same rule as the
                                    # original (0-1000 vs image-pixel).
                                    try:
                                        import pyautogui as _pag_r
                                        _swr, _shr = _pag_r.size()
                                        if nx <= 1000 and ny <= 1000:
                                            screen_x = int(nx * _swr / 1000)
                                            screen_y = int(ny * _shr / 1000)
                                        else:
                                            screen_x = int(nx * _swr / VLM_IMG_W)
                                            screen_y = int(ny * _shr / VLM_IMG_H)
                                    except Exception:
                                        screen_x, screen_y = nx, ny
                                    action_json['coordinate'] = [
                                        screen_x, screen_y,
                                    ]
                                    action_json['_strategy'] = (
                                        'elimination_retry'
                                    )
                                    logger.info(
                                        f"Loop bias retry ({_bias}) → "
                                        f"({screen_x},{screen_y}) "
                                        f"norm=({nx},{ny})"
                                    )
                        except Exception as _bias_err:
                            logger.debug(
                                f"bias retry failed (non-fatal, "
                                f"keeping original coords): {_bias_err}"
                            )
                    # Sanity check: flag clicks in the likely taskbar region.
                    # If the VLM's reasoning talks about a Start menu item or
                    # app window but the coordinate lands in the bottom 50px,
                    # the grounding probably drifted onto the taskbar strip.
                    # We log a warning and let the verify step catch it; the
                    # router will see exit_reason=action_error if this pattern
                    # keeps happening, so it can respond honestly.
                    try:
                        import pyautogui as _pag2
                        _sw2, _sh2 = _pag2.size()
                        reasoning_lc = (action_json.get('Reasoning') or '').lower()
                        if (screen_y >= _sh2 - 50
                                and any(t in reasoning_lc for t in
                                        ('start menu', 'menu item', 'recommended', 'pinned'))):
                            logger.warning(
                                f"VLM click ({screen_x},{screen_y}) is in taskbar "
                                f"region (screen height={_sh2}), but reasoning "
                                f"mentions Start menu — probable grounding drift"
                            )
                    except Exception:
                        pass
                else:
                    # Non-click action: coordinate is meaningless, null it out.
                    # NOTE(review): if 'value' is explicitly None in action_json,
                    # the [:50] slice below would raise — confirm the parser
                    # always emits a string here.
                    action_json['coordinate'] = None
                    logger.info(f"Action: {next_action} "
                                f"value='{action_json.get('value', '')[:50]}'")

                # Unified path has no OmniParser output; keep the shape
                # _build_action_payload expects (empty element list).
                parsed = {'screen_info': '', 'parsed_content_list': []}
            else:
                # ── Legacy path: OmniParser + separate LLM call ──
                # 2. Parse UI elements
                parsed = parse_screen(screenshot_b64, tier)
                screen_info = parsed.get('screen_info', '')

                # 3. Build LLM prompt with current screen state
                user_content = _build_vision_prompt(screen_info, screenshot_b64, iteration)
                messages.append({"role": "user", "content": user_content})

                # 4. Call local LLM for reasoning
                llm_response = _call_local_llm(messages)
                action_json = _parse_vlm_response(llm_response)

                # Record the assistant response
                messages.append({"role": "assistant", "content": llm_response})

            logger.info(f"VLM action: {action_json.get('Next Action', 'None')}")

            # Check if task is complete
            next_action = action_json.get('Next Action', 'None')
            status = action_json.get('Status', 'IN_PROGRESS')

            if next_action == 'None' or next_action is None or status == 'DONE':
                logger.info("VLM task completed")
                extracted_responses.append({
                    "type": "completion",
                    "content": action_json.get('Reasoning', 'Task completed'),
                    "iteration": iteration + 1,
                })
                exit_reason = 'done'
                break

            # 5. Execute the action.
            # Phase 6 wire-up: pass safety=True so the per-session cap
            # + window blocklist + audit JSONL fire on every loop click.
            # Verify=True triggers the post-click pre/post diff + 50px
            # nudge retry from Phase 4. Both default-tunable via env
            # but ON in the loop is the right safe default — solo
            # /visual_agent calls keep their existing behaviour.
            action_payload = _build_action_payload(action_json, parsed)
            _safety_on = os.environ.get(
                'HEVOLVE_VLM_LOOP_SAFETY', '1').lower() not in ('0', 'false', 'no')
            _verify_on = os.environ.get(
                'HEVOLVE_VLM_LOOP_VERIFY', '0').lower() in ('1', 'true', 'yes')
            result = execute_action(
                action_payload, tier,
                safety=_safety_on, verify=_verify_on)
            action_ok = result.get('status') != 'error'
            if action_ok:
                consecutive_action_errors = 0
            else:
                consecutive_action_errors += 1

            # Surface coordinate + strategy in the response content so
            # observers (benchmark, audit, /visual_agent telemetry,
            # post-hoc replay) can reconstruct what the VLM actually
            # decided this iteration without re-parsing action_json.
            # Was missing - vlm_grounding_benchmark.py:loop_one_iter
            # path always read content['coordinate'] = None and scored
            # all 6 targets as FAIL, hiding any real grounding regression
            # behind a fixed metric.
            extracted_responses.append({
                "type": "action",
                "content": {
                    "action": next_action,
                    "reasoning": action_json.get('Reasoning', ''),
                    "result": result.get('output', ''),
                    "ok": action_ok,
                    "coordinate": action_json.get('coordinate'),
                    "_strategy": action_json.get('_strategy', 'inline_prompt'),
                },
                "iteration": iteration + 1,
            })

            # Bail after 3 consecutive action errors — something is structurally
            # broken (bad coordinates, action type mismatch, subprocess dead)
            # and more iterations won't help.
            if consecutive_action_errors >= 3:
                logger.warning("VLM loop: 3 consecutive action errors, aborting")
                exit_reason = 'action_error'
                break

            # Small delay between iterations (let UI update)
            time.sleep(0.5)

        except Exception as e:
            logger.error(f"VLM loop iteration {iteration + 1} error: {e}")
            extracted_responses.append({
                "type": "error",
                "content": str(e),
                "iteration": iteration + 1,
            })
            consecutive_action_errors += 1
            if consecutive_action_errors >= 3:
                logger.warning("VLM loop: 3 consecutive iteration errors, aborting")
                exit_reason = 'action_error'
                break
            # Continue to next iteration rather than aborting
            continue

    execution_time = time.time() - start_time
    logger.info(
        f"VLM loop finished: {len(extracted_responses)} actions in "
        f"{execution_time:.1f}s (exit_reason={exit_reason})"
    )

    # Drop this session's stop flag so the registry doesn't grow
    # across runs. Pairs with _register_session above.
    _unregister_session(user_id, prompt_id)

    # status mirrors exit_reason: only 'done' is a real success. Callers
    # (LangChain router, autogen) can inspect exit_reason to craft an honest
    # response instead of confidently lying when the loop timed out.
    # 'stopped' is its own honest exit_reason — Nunba's indicator UX
    # reads it to render the right "Stopped" badge instead of a
    # generic "incomplete".
    return {
        "status": "success" if exit_reason == 'done' else "incomplete",
        "exit_reason": exit_reason,
        "extracted_responses": extracted_responses,
        "execution_time_seconds": execution_time,
    }

633 

634 

635def _build_vision_prompt(screen_info: str, screenshot_b64: str, iteration: int) -> list: 

636 """Build multimodal prompt with screen info + screenshot image.""" 

637 content = [] 

638 

639 if iteration == 0: 

640 content.append({ 

641 "type": "text", 

642 "text": ( 

643 "Here is the current screen state. " 

644 "Analyze the UI elements and decide the next action.\n\n" 

645 f"UI Elements:\n{screen_info}" 

646 ), 

647 }) 

648 else: 

649 content.append({ 

650 "type": "text", 

651 "text": ( 

652 "Here is the updated screen after the previous action. " 

653 "Verify the previous action succeeded, then decide the next action.\n\n" 

654 f"UI Elements:\n{screen_info}" 

655 ), 

656 }) 

657 

658 # Add screenshot as image 

659 content.append({ 

660 "type": "image_url", 

661 "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}, 

662 }) 

663 

664 return content 

665 

666 

def _call_local_llm(messages: list) -> str:
    """
    Call local LLM using the same config as create_recipe.py:285-300.

    Uses OpenAI-compatible API (llama.cpp / Qwen3-VL / cloud endpoint).
    Endpoint selection cascades: VLM-specific env override, then the
    global AutoGen LLM config, then OpenAI, then the local llama.cpp /
    Qwen3-VL server from the port registry. Raises on any HTTP or
    transport failure after logging it.
    """
    import requests as _req

    # NOTE(review): node_tier is read but not used below — kept for parity
    # with the original; confirm whether it should influence selection.
    node_tier = os.environ.get('HEVOLVE_NODE_TIER', 'flat')

    env = os.environ
    if env.get('HEVOLVE_VLM_ENDPOINT_URL'):
        # VLM-specific override takes priority.
        base_url = env['HEVOLVE_VLM_ENDPOINT_URL']
        model = env.get('HEVOLVE_VLM_MODEL_NAME',
                        env.get('HEVOLVE_LLM_MODEL_NAME', 'gpt-4.1-mini'))
        api_key = env.get('HEVOLVE_VLM_API_KEY',
                          env.get('HEVOLVE_LLM_API_KEY', 'dummy'))
    elif env.get('HEVOLVE_LLM_ENDPOINT_URL'):
        # Same LLM config as AutoGen (user's configured model).
        base_url = env['HEVOLVE_LLM_ENDPOINT_URL']
        model = env.get('HEVOLVE_LLM_MODEL_NAME', 'gpt-4.1-mini')
        api_key = env.get('HEVOLVE_LLM_API_KEY', 'dummy')
    elif env.get('OPENAI_API_KEY'):
        # Fall back to OpenAI API if configured (common for standalone).
        base_url = 'https://api.openai.com/v1'
        model = env.get('HEVOLVE_LLM_MODEL_NAME', 'gpt-4.1-mini')
        api_key = env['OPENAI_API_KEY']
    else:
        # Last resort: local llama.cpp / Qwen3-VL.
        from core.port_registry import get_local_llm_url
        base_url = get_local_llm_url()
        model = 'Qwen3-VL-4B-Instruct'
        api_key = 'dummy'

    request_body = {
        'model': model,
        'messages': messages,
        'max_tokens': 4096,
        'temperature': 0.0,
    }
    try:
        resp = _req.post(
            f'{base_url.rstrip("/")}/chat/completions',
            json=request_body,
            headers={'Authorization': f'Bearer {api_key}'},
            timeout=60,
        )
        resp.raise_for_status()
        payload = resp.json()
        return payload['choices'][0]['message']['content']
    except Exception as e:
        logger.error(f"Local LLM call failed: {e}")
        raise

720 

721 

722def _point_action_to_action_json(point_action: dict) -> dict: 

723 """Convert a point_and_act-shaped result (from 

724 Qwen3VLBackend.try_taskbar_pre_check / point_and_act / retry_with_ 

725 elimination) into the action_json shape the loop's post-action 

726 handler expects. 

727 

728 Single source of truth for the shape transformation - was 

729 duplicated inline in the iteration body, flagged by reviewer 

730 as remaining DRY violation after the Phase 5 parser cleanup. 

731 

732 Both shapes are documented: 

733 point_action: {action, screen_x, screen_y, norm_x, norm_y, 

734 text, done, reasoning, raw, strategy?} 

735 action_json: {Reasoning, Next Action, coordinate, value, 

736 Status, _strategy?} 

737 """ 

738 return { 

739 'Reasoning': point_action.get('reasoning', ''), 

740 'Next Action': point_action.get('action', 'left_click'), 

741 'coordinate': [ 

742 point_action.get('screen_x'), 

743 point_action.get('screen_y'), 

744 ], 

745 'value': point_action.get('text', ''), 

746 'Status': 'DONE' if point_action.get('done') else 'IN_PROGRESS', 

747 '_strategy': point_action.get('strategy', 'taskbar_list'), 

748 } 

749 

750 

def _extract_click_coord(raw: str, action_json: dict) -> tuple:
    """Pull the click target coord from the VLM response.

    Single source of truth for "where in 0-1000 norm space did the VLM
    say to click?" Delegates the ``<point>`` parsing to
    :func:`integrations.vlm.parser.parse_vlm_action`, falls back to
    ``action_json['coordinate']`` when both components are present, and
    finally to dead center (500, 500).

    Returns ``(nx, ny)`` always — never raises, never returns None.
    The center fallback is the historical behaviour the VLM loop has
    relied on since 2026-04-10.
    """
    from integrations.vlm.parser import parse_vlm_action

    parsed = parse_vlm_action(raw or '', expected_shape='point_only')
    if parsed.norm_x is not None and parsed.norm_y is not None:
        return parsed.norm_x, parsed.norm_y

    coord = action_json.get('coordinate')
    coord_usable = (
        isinstance(coord, list)
        and len(coord) == 2
        and coord[0] is not None
        and coord[1] is not None
    )
    if coord_usable:
        return coord[0], coord[1]

    return 500, 500

774 

775 

def _parse_vlm_response(response_text: str) -> dict:
    """
    Parse VLM JSON response, handling markdown code blocks and partial JSON.

    Matches OmniParser vlm_agent.py extract_data() pattern.

    Thin shim (since Phase 5) onto the canonical parser in
    :mod:`integrations.vlm.parser`: returns the historical dict shape
    (``{Next Action, Status, Reasoning, ...}``) via
    :meth:`ParsedAction.to_action_json_dict`, with the byte-equivalent
    fallback for empty / unparseable input preserved.
    """
    from integrations.vlm.parser import parse_vlm_action

    parsed = parse_vlm_action(response_text or '', expected_shape='action_json')
    return parsed.to_action_json_dict()

791 

792 

793def _build_action_payload(action_json: dict, parsed_screen: dict) -> dict: 

794 """ 

795 Convert VLM response JSON into action payload for local_computer_tool. 

796 

797 Resolves Box ID → coordinate using parsed_screen bounding boxes. 

798 """ 

799 next_action = action_json.get('Next Action', '') 

800 coordinate = action_json.get('coordinate') 

801 text = action_json.get('value', '') 

802 box_id = action_json.get('Box ID') 

803 

804 # Resolve Box ID to coordinate if no explicit coordinate given 

805 if coordinate is None and box_id is not None: 

806 parsed_list = parsed_screen.get('parsed_content_list', []) 

807 for item in parsed_list: 

808 if item.get('idx') == box_id or item.get('id') == box_id: 

809 bbox = item.get('bbox', []) 

810 if len(bbox) == 4: 

811 # Center of bounding box 

812 coordinate = [ 

813 int((bbox[0] + bbox[2]) / 2), 

814 int((bbox[1] + bbox[3]) / 2), 

815 ] 

816 break 

817 

818 payload = {'action': next_action} 

819 if coordinate: 

820 payload['coordinate'] = coordinate 

821 if text: 

822 payload['text'] = text 

823 

824 # Pass through extra keys for file/shell operations. 'command' is for 

825 # the 'shell' action and 'path' covers 'open_file_gui' — both already 

826 # live in SUPPORTED_ACTIONS so _execute_inprocess handles them natively. 

827 for key in ('path', 'source_path', 'destination_path', 'content', 

828 'duration', 'command'): 

829 if key in action_json: 

830 payload[key] = action_json[key] 

831 

832 return payload