Coverage for integrations/vlm/local_loop.py: 72.5%
258 statements
coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1"""
2local_loop.py - Synchronous agentic loop for VLM execution.
4Equivalent to OmniParser's sampling_loop_sync() (loop.py) but without Twisted.
5Orchestrates: screenshot → parse → LLM reason → execute action → repeat.
7Uses the same LLM config as create_recipe.py:285-300 (HEVOLVE_NODE_TIER aware).
8Produces the same response format as Crossbar: {status, extracted_responses, ...}.
9"""
11import os
12import json
13import platform
14import time
15import logging
16import re
18logger = logging.getLogger('hevolve.vlm.local_loop')
20# Max iterations to prevent infinite loops (same safeguard as OmniParser)
21MAX_ITERATIONS = 30
# Action list — single source of truth for both the legacy SYSTEM_PROMPT
# and the unified-mode combined_prompt. Keeping one string means the
# legacy OmniParser path and the unified Qwen3-VL path can never drift
# on which actions the model is allowed to emit.
_VLM_ACTION_LIST = (
    "Available actions:\n"
    "- GUI: left_click, right_click, double_click, type, key, hotkey, hover, "
    "mouse_move, wait, scroll_up, scroll_down\n"
    "- Deterministic (PREFER these when the task is expressible as a "
    "command — they're 100x faster than GUI grounding):\n"
    "  * shell: run any shell/PowerShell/bash command. Use for launching "
    "apps (command='notepad'), opening files in specific apps "
    "(command='notepad hello.txt'), running git/npm/python, file ops, etc. "
    "Put the full command in the 'command' field.\n"
    "  * open_file_gui: open a file or app in the OS default handler. "
    "Put the target in the 'path' field (e.g. path='notepad' or "
    "path='C:\\\\Users\\\\foo\\\\doc.pdf').\n"
    "- File: list_folders_and_files, Open_file_and_copy_paste, write_file, "
    "read_file_and_understand\n"
)
# System prompt matching OmniParser vlm_agent.py _get_system_prompt()
_os_name = platform.system()  # 'Windows', 'Linux', 'Darwin', etc.
SYSTEM_PROMPT = (
    "You are using a " + _os_name + " device.\n"
    "You are able to use a mouse and keyboard to interact with the computer "
    "based on the given task and screenshot.\n"
    "You have access to every app running in the device via the mouse and "
    "keyboard interfaces mentioned above for GUI actions.\n"
    "\n"
    + _VLM_ACTION_LIST +
    "\n"
    "IMPORTANT: Prefer deterministic actions (shell, open_file_gui) over "
    "clicking when the task is expressible as a command. Only fall back to "
    "clicks for things that MUST be done visually (e.g. clicking a specific "
    "button inside an already-running app's UI that has no keyboard "
    "shortcut). After the first action, verify the expected outcome on screen "
    "before taking any new action.\n"
    "\n"
    "Output your response in JSON format:\n"
    '{\n'
    '  "Reasoning": "Brief explanation of what you see and why this action is needed",\n'
    '  "Next Action": "action_name or None if task is complete",\n'
    '  "Box ID": <element_id if clicking an element>,\n'
    '  "coordinate": [x, y],\n'
    '  "value": "text for type/hotkey actions",\n'
    '  "command": "shell command string when Next Action is shell",\n'
    '  "path": "file or app name when Next Action is open_file_gui",\n'
    '  "Status": "IN_PROGRESS or DONE"\n'
    '}\n'
    "\n"
    'When the task is complete, set "Next Action": "None" and "Status": "DONE".\n'
)
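
# Illustrative example (an assumed typical model output, not something this
# module emits) of a well-formed response under SYSTEM_PROMPT.
# _parse_vlm_response() below turns exactly this shape into the action_json
# dict the loop executes:
#
#   {
#     "Reasoning": "Notepad is not open; launching via shell is faster",
#     "Next Action": "shell",
#     "coordinate": null,
#     "value": "",
#     "command": "notepad",
#     "Status": "IN_PROGRESS"
#   }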

# ─── Stop registry — port of OmniParser agentic_rpc.app_state["active_sessions"] ───
# When the VLM is mid-loop on the user's screen and the user clicks
# the indicator window's Stop button, Nunba POSTs to /api/vlm/stop on
# HARTOS. That handler calls request_stop() below, which sets the
# user's threading.Event. The next iteration of run_local_agentic_loop
# checks the event via _is_stop_requested() and exits cleanly with
# exit_reason='stopped' instead of running another action on the user's
# screen.
#
# Why threading.Event: pyautogui actions inside an iteration are
# already synchronous on the loop's thread, so we can't preempt mid-
# action. But every action has natural seams (between iterations and
# after each pyautogui call), and Event.is_set() is a cheap atomic
# check we can sprinkle there without locking.
#
# Why per-(user_id, prompt_id) key: the same instance can have multiple
# concurrent VLM sessions if more than one user is connected. Stop
# fires on a specific session, not globally, mirroring OmniParser's
# active_sessions dict shape.
import threading as _threading

_vlm_stop_flags: dict = {}  # f"{user_id}:{prompt_id}" -> Event
_vlm_stop_lock = _threading.Lock()


def _stop_key(user_id: str, prompt_id: str) -> str:
    return f"{user_id}:{prompt_id}"


def _register_session(user_id: str, prompt_id: str) -> _threading.Event:
    """Called by run_local_agentic_loop on entry — creates the Event so
    a /api/vlm/stop POST can later flip it."""
    key = _stop_key(user_id, prompt_id)
    with _vlm_stop_lock:
        ev = _vlm_stop_flags.get(key)
        if ev is None:
            ev = _threading.Event()
            _vlm_stop_flags[key] = ev
        else:
            # Existing flag from a prior session — clear it so this run
            # starts un-stopped. Preserves the singleton-Event pattern
            # without leaking state across runs.
            ev.clear()
    return ev


def _unregister_session(user_id: str, prompt_id: str) -> None:
    """Called by run_local_agentic_loop on exit (success or stop) —
    drops the Event so the dict doesn't grow unbounded."""
    key = _stop_key(user_id, prompt_id)
    with _vlm_stop_lock:
        _vlm_stop_flags.pop(key, None)


def _is_stop_requested(user_id: str, prompt_id: str) -> bool:
    """Cheap check called at iteration boundaries inside the loop."""
    key = _stop_key(user_id, prompt_id)
    with _vlm_stop_lock:
        ev = _vlm_stop_flags.get(key)
        return bool(ev and ev.is_set())


def request_stop(user_id: str, prompt_id: str) -> bool:
    """Public API — called by /api/vlm/stop in hart_intelligence_entry.py.

    Sets the stop flag on a registered session. Returns True when a
    matching session was found, False when the user has no active VLM
    loop (caller logs accordingly so the UI can distinguish "stopped"
    from "nothing to stop").

    Pairs with the loop's iteration-boundary check at the top of every
    iteration. Stop becomes visible to the loop on its NEXT iteration
    — typically within 1-3 seconds depending on which step is in
    flight (screenshot, LLM call, action execution).
    """
    key = _stop_key(user_id, prompt_id)
    with _vlm_stop_lock:
        ev = _vlm_stop_flags.get(key)
        if ev is None:
            return False
        ev.set()
        return True


def list_active_sessions() -> list:
    """Return [(user_id, prompt_id), ...] of currently-running VLM
    loops. Used by /api/vlm/stop with no payload to bulk-stop, and by
    diagnostics."""
    with _vlm_stop_lock:
        return [tuple(k.split(':', 1)) for k in _vlm_stop_flags.keys()]
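

# Minimal usage sketch (illustrative only; the real HTTP handler lives in
# hart_intelligence_entry.py, and the payload shape below is an assumption):
#
#   def handle_vlm_stop(payload: dict) -> dict:
#       if payload.get('user_id') and payload.get('prompt_id'):
#           return {'stopped': request_stop(payload['user_id'],
#                                           payload['prompt_id'])}
#       # Empty payload: bulk-stop every active session.
#       return {'stopped': [(u, p) for u, p in list_active_sessions()
#                           if request_stop(u, p)]}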


def run_local_agentic_loop(
    message: dict,
    tier: str,
    max_iterations: int = MAX_ITERATIONS
) -> dict:
    """
    Local agentic loop: screenshot → parse → LLM reason → execute → repeat.

    Supports two modes:
    - Legacy (default): OmniParser screen parsing + separate LLM reasoning call
    - Unified (HEVOLVE_VLM_UNIFIED=true): Single Qwen3-VL call for parsing + reasoning

    Args:
        message: dict with keys from execute_windows_or_android_command:
            - instruction_to_vlm_agent: str
            - enhanced_instruction: str (optional, from recipe matching)
            - user_id: str
            - prompt_id: str
            - os_to_control: str
            - max_ETA_in_seconds: int
        tier: 'inprocess' or 'http'

    Returns:
        dict matching Crossbar response format:
        {status, exit_reason, extracted_responses, execution_time_seconds}
    """
    from integrations.vlm.local_computer_tool import take_screenshot, execute_action

    instruction = message.get('instruction_to_vlm_agent', '')
    enhanced = message.get('enhanced_instruction', instruction)
    user_id = message.get('user_id', '')
    prompt_id = message.get('prompt_id', '')
    max_eta = message.get('max_ETA_in_seconds', 1800)

    # exit_reason is overwritten as the loop progresses. Defaults to max_iterations
    # so a loop that runs to the iteration cap without a DONE signal is honest
    # about it to the caller (instead of pretending status='success').
    exit_reason = 'max_iterations'
    consecutive_action_errors = 0

    # Detect unified Qwen3-VL mode
    use_unified = os.environ.get('HEVOLVE_VLM_UNIFIED', '').lower() in ('1', 'true')

    if use_unified:
        from integrations.vlm.qwen3vl_backend import get_qwen3vl_backend
        qwen3vl = get_qwen3vl_backend()
        logger.info(
            f"Starting unified VLM loop (Qwen3-VL, tier={tier}, user={user_id}, "
            f"prompt={prompt_id}): {instruction[:100]}"
        )
    else:
        from integrations.vlm.local_omniparser import parse_screen
        qwen3vl = None
        logger.info(
            f"Starting local VLM loop (tier={tier}, user={user_id}, "
            f"prompt={prompt_id}): {instruction[:100]}"
        )

    # Phase 3.5 wire-up: classify the task with the complementary path
    # router and use it to size the iteration budget. Single-shot
    # tasks ("click X") shouldn't burn the full 30-iter budget when
    # one click satisfies the goal — the multi-iter loop's overhead
    # is real (per-iter screenshot + VLM call ~3-5s). Multi-step
    # tasks get the full caller-supplied max_iterations.
    _route = 'multi_step'  # safe default — never over-cap a real loop
    try:
        if qwen3vl is not None:
            _route = qwen3vl.route_task(instruction or enhanced)
            logger.info(f"VLM loop route_task: '{instruction[:60]}' → {_route}")
            if _route == 'single_shot' and max_iterations > 3:
                # Cap at 3 — gives one nudge-retry + one followup
                # if the click misses without burning the full budget.
                max_iterations = 3
            elif _route == 'enumerate' and max_iterations > 1:
                # Enumerate = parse_and_reason snapshot, no follow-up
                # iter needed.
                max_iterations = 1
    except Exception as e:
        logger.debug(f'route_task wire-up skipped: {e}')

    # Build conversation messages for LLM
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": enhanced},
    ]

    extracted_responses = []
    start_time = time.time()

    # Register this session in the stop registry so /api/vlm/stop can
    # signal it. Cleanup happens just before the final return below
    # (no try/finally — the existing iteration body wraps every error
    # in its own try/continue so exceptions never escape this scope).
    _register_session(user_id, prompt_id)

    for iteration in range(max_iterations):
        # User-requested stop wins over every other exit condition.
        # Check FIRST so a stop fired during the previous iteration's
        # action lands at this seam without one more click happening.
        if _is_stop_requested(user_id, prompt_id):
            logger.info(
                f"VLM loop stopped by /api/vlm/stop at iteration "
                f"{iteration + 1} (user={user_id}, prompt={prompt_id})"
            )
            exit_reason = 'stopped'
            break

        elapsed = time.time() - start_time
        if elapsed > max_eta:
            logger.warning(f"VLM loop hit ETA limit ({max_eta}s) at iteration {iteration}")
            exit_reason = 'timeout'
            break

        logger.info(f"VLM loop iteration {iteration + 1}/{max_iterations}")

        try:
            # 1. Take screenshot
            screenshot_b64 = take_screenshot(tier)

            if use_unified and qwen3vl is not None:
                # ── Single VLM call: plan step + ground coordinates in one prompt ──
                # One image encoding (~500 visual tokens) instead of two.
                # Halves latency: ~10s per step instead of ~20s.
                from integrations.vlm.local_computer_tool import VLM_IMG_W, VLM_IMG_H

                # Taskbar pre-check (additive — restores point_and_act's
                # smart strategy that 8fa6e97 dropped when this loop
                # adopted its own inline prompt). When the task targets
                # a taskbar item ("open Chrome", "click Start", etc.),
                # _taskbar_list_lookup short-circuits the VLM call
                # entirely and returns a click coord direct from the
                # taskbar enumeration — typically <1s vs the 5-10s a
                # full VLM grounding takes. On miss, returns None and
                # the existing inline prompt path runs unchanged.
                _step_started = time.time()
                try:
                    import pyautogui as _pag_pre
                    _sw_pre, _sh_pre = _pag_pre.size()
                except Exception:
                    _sw_pre = _sh_pre = None
                _taskbar_action = None
                if _sw_pre and _sh_pre:
                    try:
                        _taskbar_action = qwen3vl.try_taskbar_pre_check(
                            screenshot_b64, enhanced,
                            _sw_pre, _sh_pre, _step_started,
                        )
                    except Exception as _tb_err:
                        logger.debug(
                            f"taskbar_pre_check failed (non-fatal): {_tb_err}")
                if _taskbar_action is not None:
                    # Single source of truth for "point_and_act result
                    # -> action_json shape" conversion. Was inline
                    # 14 lines duplicating the dict construction.
                    action_json = _point_action_to_action_json(_taskbar_action)
                    raw = _taskbar_action.get('raw', '')
                    logger.info(
                        f"Loop: taskbar_list shortcut → "
                        f"({_taskbar_action.get('screen_x')},"
                        f"{_taskbar_action.get('screen_y')})"
                    )
                    # Fall through to the existing post-action handling
                    # below (which executes action_json + records it).
                    # Skip the combined_prompt + _call_api block.
                    _skip_combined_prompt = True
                else:
                    _skip_combined_prompt = False

                # Skip the heavy combined-prompt VLM call entirely when
                # taskbar_pre_check above already produced a click —
                # the taskbar lookup is the authoritative grounding for
                # taskbar tasks (point_and_act has used the same
                # short-circuit since cb92a2e). Without this guard the
                # _call_api below would overwrite action_json with a
                # less-grounded result.
                if not _skip_combined_prompt:
                    combined_prompt = (
                        f"You are a computer use agent on {_os_name}.\n"
                        f"Task: {enhanced}\n\n"
                    )
                    if extracted_responses:
                        last = extracted_responses[-1].get('content', '')
                        if isinstance(last, dict):
                            combined_prompt += (
                                f"Previous action: {last.get('action', '?')} — "
                                f"{last.get('reasoning', '')[:80]}.\n"
                                f"Check the screenshot: did it succeed?\n\n"
                            )
                    combined_prompt += (
                        _VLM_ACTION_LIST +
                        "\n"
                        "What is the SINGLE next action? Respond in JSON ONLY:\n"
                        "{\n"
                        '  "Reasoning": "What you see and why this action",\n'
                        '  "Next Action": "left_click|right_click|double_click|'
                        'type|key|hotkey|scroll_up|scroll_down|wait|shell|'
                        'open_file_gui|None",\n'
                        '  "coordinate": [x, y],\n'
                        '  "value": "text to type or key name",\n'
                        '  "command": "shell command when Next Action is shell",\n'
                        '  "path": "file or app name when Next Action is open_file_gui",\n'
                        '  "Status": "IN_PROGRESS|DONE"\n'
                        "}\n\n"
                        "For click actions: provide <point>x,y</point> normalized "
                        "0-1000 coordinates.\n"
                        "For type/key/hotkey: set coordinate to null, put text in value.\n"
                        "Only fall back to clicks when the task requires interacting "
                        "with something already visible on screen that cannot be "
                        "done via a command.\n"
                        'When task is complete: "Next Action": "None", "Status": "DONE".'
                    )

                    raw = qwen3vl._call_api([{
                        "role": "user",
                        "content": [
                            {"type": "text", "text": combined_prompt},
                            {"type": "image_url", "image_url": {
                                "url": f"data:image/jpeg;base64,{screenshot_b64}"}},
                        ]
                    }])
                    # Guard against None (e.g. thinking-only response with no content)
                    if raw is None:
                        raw = ''
                    action_json = _parse_vlm_response(raw)

                # Extract coordinates from <point>x,y</point> if present in raw
                next_action = action_json.get('Next Action', 'None')
                _CLICK_ACTIONS = {'left_click', 'right_click', 'double_click',
                                  'middle_click', 'hover', 'mouse_move'}

                if next_action in _CLICK_ACTIONS:
                    # Phase 5 follow-through: was a 4th inline <point>
                    # regex parser duplicating parser._parse_point_shape.
                    # Now delegates to the canonical parser so the
                    # action_json JSON-coordinate vs the raw <point>
                    # tag agree (they previously could disagree when
                    # the JSON had Box ID + the raw text had a point).
                    nx, ny = _extract_click_coord(raw, action_json)

                    # Scale from 1000-normalized or image space to screen space
                    try:
                        import pyautogui as _pag
                        _sw, _sh = _pag.size()
                        if nx <= 1000 and ny <= 1000:
                            # Normalized 0-1000 coords
                            screen_x = int(nx * _sw / 1000)
                            screen_y = int(ny * _sh / 1000)
                        else:
                            # Image pixel coords
                            screen_x = int(nx * _sw / VLM_IMG_W)
                            screen_y = int(ny * _sh / VLM_IMG_H)
                    except Exception as _scale_err:
                        logger.debug(f"coord scale to screen failed: {_scale_err}")
                        screen_x, screen_y = nx, ny
                    action_json['coordinate'] = [screen_x, screen_y]
                    logger.info(f"Action: {next_action} at ({screen_x},{screen_y}) "
                                f"norm=({nx},{ny})")
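
                    # Worked example of the scaling rule above (numbers are
                    # illustrative; VLM_IMG_W=1280, VLM_IMG_H=720 assumed):
                    # on a 1920x1080 screen, normalized <point>500,980</point>
                    # lands at (960, 1058), while raw image-pixel coords
                    # (1200, 700) land at (1800, 1050).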

                    # Bias-detection + elimination retry — additive
                    # restoration of point_and_act's strategy 3 that
                    # 8fa6e97 dropped when this loop adopted its own
                    # inline prompt. Catches center/bottom/top-edge
                    # hallucinations in the 0-1000 normalized coords
                    # the loop just produced and reissues the VLM with
                    # an elimination prompt that explicitly forbids
                    # the suspect region. Skipped when the action
                    # came from taskbar_pre_check (its coords are
                    # already lookup-grounded, no need to retry).
                    # All wrapped in try/except so a retry-time error
                    # NEVER takes down the iteration — original coords
                    # remain in action_json.
                    if action_json.get('_strategy') != 'taskbar_list':
                        try:
                            _bias = qwen3vl.detect_grounding_bias(
                                nx, ny, 'left_click', enhanced,
                            )
                            if _bias:
                                _retry = qwen3vl.retry_with_elimination(
                                    screenshot_b64, enhanced,
                                    VLM_IMG_W, VLM_IMG_H, _bias,
                                )
                                if _retry is not None:
                                    _r_dict, _enx, _eny = _retry
                                    nx, ny = _enx, _eny
                                    # Re-scale retry coords to screen
                                    # space using the same rule as the
                                    # original (0-1000 vs image-pixel).
                                    try:
                                        import pyautogui as _pag_r
                                        _swr, _shr = _pag_r.size()
                                        if nx <= 1000 and ny <= 1000:
                                            screen_x = int(nx * _swr / 1000)
                                            screen_y = int(ny * _shr / 1000)
                                        else:
                                            screen_x = int(nx * _swr / VLM_IMG_W)
                                            screen_y = int(ny * _shr / VLM_IMG_H)
                                    except Exception:
                                        screen_x, screen_y = nx, ny
                                    action_json['coordinate'] = [
                                        screen_x, screen_y,
                                    ]
                                    action_json['_strategy'] = (
                                        'elimination_retry'
                                    )
                                    logger.info(
                                        f"Loop bias retry ({_bias}) → "
                                        f"({screen_x},{screen_y}) "
                                        f"norm=({nx},{ny})"
                                    )
                        except Exception as _bias_err:
                            logger.debug(
                                f"bias retry failed (non-fatal, "
                                f"keeping original coords): {_bias_err}"
                            )
                    # Sanity check: flag clicks in the likely taskbar region.
                    # If the VLM's reasoning talks about a Start menu item or
                    # app window but the coordinate lands in the bottom 50px,
                    # the grounding probably drifted onto the taskbar strip.
                    # We log a warning and let the verify step catch it; the
                    # router will see exit_reason=action_error if this pattern
                    # keeps happening, so it can respond honestly.
                    try:
                        import pyautogui as _pag2
                        _sw2, _sh2 = _pag2.size()
                        reasoning_lc = (action_json.get('Reasoning') or '').lower()
                        if (screen_y >= _sh2 - 50
                                and any(t in reasoning_lc for t in
                                        ('start menu', 'menu item', 'recommended', 'pinned'))):
                            logger.warning(
                                f"VLM click ({screen_x},{screen_y}) is in taskbar "
                                f"region (screen height={_sh2}), but reasoning "
                                f"mentions Start menu — probable grounding drift"
                            )
                    except Exception:
                        pass
                else:
                    action_json['coordinate'] = None
                    # `or ''` guards against an explicit "value": null in the
                    # model JSON, which .get()'s default would not catch.
                    logger.info(f"Action: {next_action} "
                                f"value='{(action_json.get('value') or '')[:50]}'")

                parsed = {'screen_info': '', 'parsed_content_list': []}
            else:
                # ── Legacy path: OmniParser + separate LLM call ──
                # 2. Parse UI elements
                parsed = parse_screen(screenshot_b64, tier)
                screen_info = parsed.get('screen_info', '')

                # 3. Build LLM prompt with current screen state
                user_content = _build_vision_prompt(screen_info, screenshot_b64, iteration)
                messages.append({"role": "user", "content": user_content})

                # 4. Call local LLM for reasoning
                llm_response = _call_local_llm(messages)
                action_json = _parse_vlm_response(llm_response)

                # Record the assistant response
                messages.append({"role": "assistant", "content": llm_response})

            logger.info(f"VLM action: {action_json.get('Next Action', 'None')}")

            # Check if task is complete
            next_action = action_json.get('Next Action', 'None')
            status = action_json.get('Status', 'IN_PROGRESS')

            if next_action == 'None' or next_action is None or status == 'DONE':
                logger.info("VLM task completed")
                extracted_responses.append({
                    "type": "completion",
                    "content": action_json.get('Reasoning', 'Task completed'),
                    "iteration": iteration + 1,
                })
                exit_reason = 'done'
                break

            # 5. Execute the action.
            # Phase 6 wire-up: pass safety=True so the per-session cap
            # + window blocklist + audit JSONL fire on every loop click.
            # verify=True triggers the post-click pre/post diff + 50px
            # nudge retry from Phase 4. Both are env-tunable: safety is
            # ON by default in the loop (the right safe default), while
            # verify is opt-in via HEVOLVE_VLM_LOOP_VERIFY. Solo
            # /visual_agent calls keep their existing behaviour.
            action_payload = _build_action_payload(action_json, parsed)
            _safety_on = os.environ.get(
                'HEVOLVE_VLM_LOOP_SAFETY', '1').lower() not in ('0', 'false', 'no')
            _verify_on = os.environ.get(
                'HEVOLVE_VLM_LOOP_VERIFY', '0').lower() in ('1', 'true', 'yes')
            result = execute_action(
                action_payload, tier,
                safety=_safety_on, verify=_verify_on)
            action_ok = result.get('status') != 'error'
            if action_ok:
                consecutive_action_errors = 0
            else:
                consecutive_action_errors += 1

            # Surface coordinate + strategy in the response content so
            # observers (benchmark, audit, /visual_agent telemetry,
            # post-hoc replay) can reconstruct what the VLM actually
            # decided this iteration without re-parsing action_json.
            # Previously missing: the vlm_grounding_benchmark.py
            # loop_one_iter path always read content['coordinate'] as
            # None and scored all 6 targets as FAIL, hiding any real
            # grounding regression behind a fixed metric.
            extracted_responses.append({
                "type": "action",
                "content": {
                    "action": next_action,
                    "reasoning": action_json.get('Reasoning', ''),
                    "result": result.get('output', ''),
                    "ok": action_ok,
                    "coordinate": action_json.get('coordinate'),
                    "_strategy": action_json.get('_strategy', 'inline_prompt'),
                },
                "iteration": iteration + 1,
            })

            # Bail after 3 consecutive action errors — something is structurally
            # broken (bad coordinates, action type mismatch, subprocess dead)
            # and more iterations won't help.
            if consecutive_action_errors >= 3:
                logger.warning("VLM loop: 3 consecutive action errors, aborting")
                exit_reason = 'action_error'
                break

            # Small delay between iterations (let UI update)
            time.sleep(0.5)

        except Exception as e:
            logger.error(f"VLM loop iteration {iteration + 1} error: {e}")
            extracted_responses.append({
                "type": "error",
                "content": str(e),
                "iteration": iteration + 1,
            })
            consecutive_action_errors += 1
            if consecutive_action_errors >= 3:
                logger.warning("VLM loop: 3 consecutive iteration errors, aborting")
                exit_reason = 'action_error'
                break
            # Continue to next iteration rather than aborting
            continue

    execution_time = time.time() - start_time
    logger.info(
        f"VLM loop finished: {len(extracted_responses)} actions in "
        f"{execution_time:.1f}s (exit_reason={exit_reason})"
    )

    # Drop this session's stop flag so the registry doesn't grow
    # across runs. Pairs with _register_session above.
    _unregister_session(user_id, prompt_id)

    # status mirrors exit_reason: only 'done' is a real success. Callers
    # (LangChain router, autogen) can inspect exit_reason to craft an honest
    # response instead of confidently lying when the loop timed out.
    # 'stopped' is its own honest exit_reason — Nunba's indicator UX
    # reads it to render the right "Stopped" badge instead of a
    # generic "incomplete".
    return {
        "status": "success" if exit_reason == 'done' else "incomplete",
        "exit_reason": exit_reason,
        "extracted_responses": extracted_responses,
        "execution_time_seconds": execution_time,
    }
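

# Minimal caller sketch (illustrative; the literal field values are
# assumptions, and real callers build `message` inside
# execute_windows_or_android_command):
#
#   result = run_local_agentic_loop(
#       {
#           'instruction_to_vlm_agent': 'open notepad and type hello',
#           'user_id': 'u1',
#           'prompt_id': 'p1',
#           'os_to_control': 'Windows',
#           'max_ETA_in_seconds': 300,
#       },
#       tier='inprocess',
#   )
#   if result['exit_reason'] != 'done':
#       # one of: 'stopped' / 'timeout' / 'max_iterations' / 'action_error'
#       handle_incomplete(result)  # hypothetical caller-side handler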


def _build_vision_prompt(screen_info: str, screenshot_b64: str, iteration: int) -> list:
    """Build multimodal prompt with screen info + screenshot image."""
    content = []

    if iteration == 0:
        content.append({
            "type": "text",
            "text": (
                "Here is the current screen state. "
                "Analyze the UI elements and decide the next action.\n\n"
                f"UI Elements:\n{screen_info}"
            ),
        })
    else:
        content.append({
            "type": "text",
            "text": (
                "Here is the updated screen after the previous action. "
                "Verify the previous action succeeded, then decide the next action.\n\n"
                f"UI Elements:\n{screen_info}"
            ),
        })

    # Add screenshot as image
    content.append({
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
    })

    return content


def _call_local_llm(messages: list) -> str:
    """
    Call local LLM using the same config as create_recipe.py:285-300.

    Uses OpenAI-compatible API (llama.cpp / Qwen3-VL / cloud endpoint).
    """
    import requests as _req

    node_tier = os.environ.get('HEVOLVE_NODE_TIER', 'flat')

    # VLM-specific override takes priority, then global AutoGen LLM config,
    # then node-tier aware defaults (same model the user configured)
    if os.environ.get('HEVOLVE_VLM_ENDPOINT_URL'):
        base_url = os.environ['HEVOLVE_VLM_ENDPOINT_URL']
        model = os.environ.get('HEVOLVE_VLM_MODEL_NAME',
                               os.environ.get('HEVOLVE_LLM_MODEL_NAME', 'gpt-4.1-mini'))
        api_key = os.environ.get('HEVOLVE_VLM_API_KEY',
                                 os.environ.get('HEVOLVE_LLM_API_KEY', 'dummy'))
    elif os.environ.get('HEVOLVE_LLM_ENDPOINT_URL'):
        # Use the same LLM config as AutoGen (user's configured model)
        base_url = os.environ['HEVOLVE_LLM_ENDPOINT_URL']
        model = os.environ.get('HEVOLVE_LLM_MODEL_NAME', 'gpt-4.1-mini')
        api_key = os.environ.get('HEVOLVE_LLM_API_KEY', 'dummy')
    elif os.environ.get('OPENAI_API_KEY'):
        # Fall back to OpenAI API if configured (common for standalone)
        base_url = 'https://api.openai.com/v1'
        model = os.environ.get('HEVOLVE_LLM_MODEL_NAME', 'gpt-4.1-mini')
        api_key = os.environ['OPENAI_API_KEY']
    else:
        # Last resort: local llama.cpp / Qwen3-VL
        from core.port_registry import get_local_llm_url
        base_url = get_local_llm_url()
        model = 'Qwen3-VL-4B-Instruct'
        api_key = 'dummy'

    try:
        resp = _req.post(
            f'{base_url.rstrip("/")}/chat/completions',
            json={
                'model': model,
                'messages': messages,
                'max_tokens': 4096,
                'temperature': 0.0,
            },
            headers={'Authorization': f'Bearer {api_key}'},
            timeout=60,
        )
        resp.raise_for_status()
        data = resp.json()
        return data['choices'][0]['message']['content']
    except Exception as e:
        logger.error(f"Local LLM call failed: {e}")
        raise
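
# Endpoint resolution precedence, illustrated (URLs and ports are example
# values only, not defaults shipped by this module):
#   HEVOLVE_VLM_ENDPOINT_URL=http://127.0.0.1:8033/v1  -> VLM-specific override
#   HEVOLVE_LLM_ENDPOINT_URL=http://127.0.0.1:8030/v1  -> shared AutoGen config
#   OPENAI_API_KEY=sk-...                              -> https://api.openai.com/v1
#   (none set)                                         -> core.port_registry.get_local_llm_url()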


def _point_action_to_action_json(point_action: dict) -> dict:
    """Convert a point_and_act-shaped result (from
    Qwen3VLBackend.try_taskbar_pre_check / point_and_act /
    retry_with_elimination) into the action_json shape the loop's
    post-action handler expects.

    Single source of truth for the shape transformation: it was
    duplicated inline in the iteration body and flagged by a reviewer
    as the remaining DRY violation after the Phase 5 parser cleanup.

    Both shapes are documented:
        point_action: {action, screen_x, screen_y, norm_x, norm_y,
                       text, done, reasoning, raw, strategy?}
        action_json:  {Reasoning, Next Action, coordinate, value,
                       Status, _strategy?}
    """
    return {
        'Reasoning': point_action.get('reasoning', ''),
        'Next Action': point_action.get('action', 'left_click'),
        'coordinate': [
            point_action.get('screen_x'),
            point_action.get('screen_y'),
        ],
        'value': point_action.get('text', ''),
        'Status': 'DONE' if point_action.get('done') else 'IN_PROGRESS',
        '_strategy': point_action.get('strategy', 'taskbar_list'),
    }
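
# Shape example (illustrative values):
#   _point_action_to_action_json({
#       'action': 'left_click', 'screen_x': 24, 'screen_y': 1060,
#       'norm_x': 12, 'norm_y': 981, 'text': '', 'done': False,
#       'reasoning': 'Start button', 'raw': '<point>12,981</point>',
#   })
#   returns:
#   {'Reasoning': 'Start button', 'Next Action': 'left_click',
#    'coordinate': [24, 1060], 'value': '', 'Status': 'IN_PROGRESS',
#    '_strategy': 'taskbar_list'}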


def _extract_click_coord(raw: str, action_json: dict) -> tuple:
    """Pull the click target coord from the VLM response.

    Single source of truth for "where in 0-1000 norm space did the
    VLM say to click?" — was a 4th parallel parser inline in the
    iteration body. Now delegates to
    :func:`integrations.vlm.parser.parse_vlm_action` for the
    ``<point>`` regex, then falls back to ``action_json['coordinate']``,
    then to dead center (500, 500).

    Returns ``(nx, ny)`` always — never raises, never returns None.
    Center fallback is the historical behaviour the VLM loop has
    relied on since 2026-04-10.
    """
    from integrations.vlm.parser import parse_vlm_action
    pa = parse_vlm_action(raw or '', expected_shape='point_only')
    if pa.norm_x is not None and pa.norm_y is not None:
        return pa.norm_x, pa.norm_y
    coord = action_json.get('coordinate')
    if coord and isinstance(coord, list) and len(coord) == 2 \
            and coord[0] is not None and coord[1] is not None:
        return coord[0], coord[1]
    return 500, 500
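
# Fallback order, illustrated (assumes parse_vlm_action extracts the
# <point> tag as documented above):
#   _extract_click_coord('<point>500,250</point>', {})              -> (500, 250)
#   _extract_click_coord('no tag', {'coordinate': [300, 400]})      -> (300, 400)
#   _extract_click_coord('', {})                                    -> (500, 500)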


def _parse_vlm_response(response_text: str) -> dict:
    """
    Parse VLM JSON response, handling markdown code blocks and partial JSON.

    Matches OmniParser vlm_agent.py extract_data() pattern.

    Phase 5: thin shim onto the canonical parser in
    :mod:`integrations.vlm.parser`. Returns the same dict shape this
    function always has (``{Next Action, Status, Reasoning, ...}``)
    via :meth:`ParsedAction.to_action_json_dict`. The byte-equivalent
    fallback for empty / unparseable input is preserved.
    """
    from integrations.vlm.parser import parse_vlm_action
    pa = parse_vlm_action(response_text or '', expected_shape='action_json')
    return pa.to_action_json_dict()


def _build_action_payload(action_json: dict, parsed_screen: dict) -> dict:
    """
    Convert VLM response JSON into action payload for local_computer_tool.

    Resolves Box ID → coordinate using parsed_screen bounding boxes.
    """
    next_action = action_json.get('Next Action', '')
    coordinate = action_json.get('coordinate')
    text = action_json.get('value', '')
    box_id = action_json.get('Box ID')

    # Resolve Box ID to coordinate if no explicit coordinate given
    if coordinate is None and box_id is not None:
        parsed_list = parsed_screen.get('parsed_content_list', [])
        for item in parsed_list:
            if item.get('idx') == box_id or item.get('id') == box_id:
                bbox = item.get('bbox', [])
                if len(bbox) == 4:
                    # Center of bounding box
                    coordinate = [
                        int((bbox[0] + bbox[2]) / 2),
                        int((bbox[1] + bbox[3]) / 2),
                    ]
                break

    payload = {'action': next_action}
    if coordinate:
        payload['coordinate'] = coordinate
    if text:
        payload['text'] = text

    # Pass through extra keys for file/shell operations. 'command' is for
    # the 'shell' action and 'path' covers 'open_file_gui' — both already
    # live in SUPPORTED_ACTIONS so _execute_inprocess handles them natively.
    for key in ('path', 'source_path', 'destination_path', 'content',
                'duration', 'command'):
        if key in action_json:
            payload[key] = action_json[key]

    return payload
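
# Example conversion (illustrative values): a Box ID resolved against a
# parsed screen whose only element has bbox [100, 200, 300, 260]:
#   _build_action_payload(
#       {'Next Action': 'left_click', 'Box ID': 7},
#       {'parsed_content_list': [{'idx': 7, 'bbox': [100, 200, 300, 260]}]},
#   )
#   returns {'action': 'left_click', 'coordinate': [200, 230]}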