Coverage for integrations / vlm / local_computer_tool.py: 68.0%
331 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2local_computer_tool.py - Synchronous pyautogui/HTTP wrapper for VLM actions.
4Replaces OmniParser's Crossbar RPC-based ComputerTool with direct local execution.
5Supports same action types as OmniParser computer.py (key, type, left_click, etc.).
7Tier 'inprocess': direct pyautogui calls (no network)
8Tier 'http': HTTP to localhost:5001 (omnitool-gui Flask server)
9"""
11import os
12import io
13import sys
14import time
15import base64
16import logging
17from typing import Optional
# VLM screenshot long edge — aspect ratio is PRESERVED during resize.
# The old behavior (forced 1024×576) squished 16:10 screens into 16:9, so
# the VLM's vertical coordinates drifted accordingly. Qwen3-VL handles a
# 1280px long edge comfortably; a longer edge grounds better, a shorter
# one is faster. HEVOLVE_VLM_IMG_LONG_EDGE lets callers tune this.
_DEFAULT_VLM_IMG_LONG_EDGE = 1280


def _parse_long_edge(raw, default: int = _DEFAULT_VLM_IMG_LONG_EDGE) -> int:
    """Return *raw* as a positive pixel count, or *default* if unusable.

    A malformed HEVOLVE_VLM_IMG_LONG_EDGE must not make this module
    un-importable, and a zero/negative edge would collapse every
    screenshot to a 1x1 image, so both cases fall back to *default*.
    """
    try:
        value = int(str(raw).strip())
    except (TypeError, ValueError):
        return default
    return value if value > 0 else default


VLM_IMG_LONG_EDGE = _parse_long_edge(
    os.environ.get('HEVOLVE_VLM_IMG_LONG_EDGE', '1280'))
# Legacy constants kept for backward compatibility with existing call
# sites and for tests that reference them. The *real* dimensions are
# computed per screenshot from the actual screen aspect ratio.
VLM_IMG_W = VLM_IMG_LONG_EDGE
VLM_IMG_H = int(VLM_IMG_LONG_EDGE * 9 / 16)
31logger = logging.getLogger('hevolve.vlm.computer_tool')
33# Module-level imports for mockability (pyautogui is optional)
34try:
35 import pyautogui
36except ImportError:
37 pyautogui = None
40# Single source of truth for SetProcessDpiAwareness — see
41# core/dpi_awareness.py for the rationale (was duplicated in
42# remote_desktop/window_capture.py until 2026-05-03 DRY pass).
43from core.dpi_awareness import ensure_dpi_aware as _ensure_dpi_aware
45# Call at import time so every screenshot/click path is DPI-consistent
46_ensure_dpi_aware()
48try:
49 import pyperclip
50except ImportError:
51 pyperclip = None
53from core.http_pool import pooled_get, pooled_post
# Action vocabulary, mirroring the ``Action`` literal in OmniParser's
# computer.py. 'shell' is a Nunba extension: it lets the VLM loop run a
# deterministic command instead of grounding a GUI interaction when the
# task can be done programmatically (e.g. launching an app, or opening
# a file in its default handler).
SUPPORTED_ACTIONS = {
    # Keyboard
    'key',
    'type',
    'hotkey',
    # Pointer
    'mouse_move',
    'hover',
    'left_click',
    'left_click_drag',
    'right_click',
    'middle_click',
    'double_click',
    'cursor_position',
    # Screen / timing
    'screenshot',
    'wait',
    # Files
    'list_folders_and_files',
    'Open_file_and_copy_paste',
    'open_file_gui',
    'write_file',
    'read_file_and_understand',
    # Command execution
    'shell',
}
def take_screenshot(tier: str) -> str:
    """
    Capture the screen and return it as a base64 string.

    In-process captures are downscaled so the long edge is at most
    VLM_IMG_LONG_EDGE while PRESERVING aspect ratio, so the VLM's
    normalized coordinates map back to the physical screen without
    distortion. Images already within the limit are not upscaled.
    DPI awareness is enabled at import (see _ensure_dpi_aware()).

    Args:
        tier: 'inprocess' (pyautogui direct); any other value is treated
            as 'http' and fetched from localhost:5001.
    Returns:
        Base64-encoded JPEG for the in-process tier. For the HTTP tier
        the server's 'base64_image' (or 'image') field is returned
        verbatim — it is not resized or re-encoded here — and '' when
        neither field is present.
    Raises:
        ImportError: in-process tier requested but pyautogui is missing.
        Exception: HTTP/JSON errors from the HTTP tier propagate.
    """
    if tier != 'inprocess':
        resp = pooled_get('http://localhost:5001/screenshot', timeout=15)
        resp.raise_for_status()
        data = resp.json()
        return data.get('base64_image', data.get('image', ''))

    if pyautogui is None:
        raise ImportError("pyautogui is required for in-process screenshots")
    img = pyautogui.screenshot()
    from PIL import Image

    w, h = img.size
    long_edge = max(w, h)
    if long_edge > VLM_IMG_LONG_EDGE:
        scale = VLM_IMG_LONG_EDGE / long_edge
        # max(1, ...) keeps a very thin image from rounding down to 0 px.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        img = img.resize(new_size, Image.LANCZOS)

    # JPEG has no alpha channel: Pillow raises when asked to save an
    # RGBA/P/LA image as JPEG, so normalise the mode before encoding.
    if img.mode != 'RGB':
        img = img.convert('RGB')

    buf = io.BytesIO()
    img.save(buf, format='JPEG', quality=70)
    return base64.b64encode(buf.getvalue()).decode('ascii')
def get_active_window_info():
    """Describe the actual foreground window as reported by the OS.

    Used to stop the VLM misidentifying windows (e.g. Claude Code as
    MobaXterm). What the string contains depends on the platform:

    * Windows: ``"<ProcessName>: <MainWindowTitle>"`` via PowerShell.
    * Linux: the active window's name via ``xdotool``.
    * macOS: the name of the frontmost process via ``osascript``.

    Returns:
        The stripped description string, or None when the platform is
        unsupported, the probe exits non-zero, or anything raises (tool
        missing, 3-second timeout, ...). Best-effort: never raises.
    """
    try:
        import platform
        import subprocess
        _os = platform.system()
        if _os == 'Windows':
            # Imported here because only the Windows probe needs it; a
            # missing helper must not disable the Linux/macOS probes.
            from core.subprocess_safe import hidden_popen_kwargs
            # CREATE_NO_WINDOW prevents the powershell child from popping a
            # cmd console on every call (this fires per VLM probe).
            r = subprocess.run(
                ['powershell', '-Command',
                 '(Get-Process | Where-Object {$_.MainWindowHandle -eq '
                 '(Add-Type -MemberDefinition \'[DllImport("user32.dll")] '
                 'public static extern IntPtr GetForegroundWindow();\' '
                 '-Name W -PassThru)::GetForegroundWindow()}).ProcessName + '
                 '": " + (Get-Process | Where-Object {$_.MainWindowHandle -eq '
                 '(Add-Type -MemberDefinition \'[DllImport("user32.dll")] '
                 'public static extern IntPtr GetForegroundWindow();\' '
                 '-Name W2 -PassThru)::GetForegroundWindow()}).MainWindowTitle'],
                capture_output=True, text=True, timeout=3,
                **hidden_popen_kwargs())
            if r.returncode == 0 and r.stdout.strip():
                return r.stdout.strip()
        elif _os == 'Linux':
            r = subprocess.run(['xdotool', 'getactivewindow', 'getwindowname'],
                               capture_output=True, text=True, timeout=3)
            if r.returncode == 0:
                return r.stdout.strip()
        elif _os == 'Darwin':
            r = subprocess.run(
                ['osascript', '-e',
                 'tell application "System Events" to get name of first process whose frontmost is true'],
                capture_output=True, text=True, timeout=3)
            if r.returncode == 0:
                return r.stdout.strip()
    except Exception as e:
        # Still best-effort, but leave a trace so a permanently broken
        # probe (missing xdotool, blocked PowerShell) is diagnosable.
        logger.debug(f"active-window probe failed: {e}")
    return None
#: Keyword pairs watched by the reasoning-mismatch detector, each as
#: ``(reasoning_substring, foreground_window_substring)``. An action is
#: flagged when the VLM's reasoning contains the first element while the
#: real foreground window description lacks the second. The first
#: matching pair wins, so list more specific patterns earlier. To extend,
#: append another tuple.
_REASONING_MISMATCH_PATTERNS = (
    ('mobaxt', 'mobaxt'),
    ('notepad', 'notepad'),
)

#: Phrases in the VLM's reasoning that suggest it is acting on a specific
#: window. The (slow) get_active_window_info probe only runs when one of
#: these appears — not when the VLM is typing or generic-clicking
#: somewhere mid-screen.
_WINDOW_TARGETED_VERBS = (
    'minimize',
    'close',
    'switch to',
    'click on',
)
def _check_reasoning_mismatch(action: dict) -> Optional[str]:
    """Detect when the VLM's stated reasoning contradicts the actual
    foreground window.

    The reasoning text is read from ``action['Reasoning']``, falling back
    to ``action['reasoning']`` when the former is missing or empty. VLM
    output can be malformed, so a missing, empty, or non-string value is
    treated as "no reasoning" instead of raising.

    The OS probe (get_active_window_info) only runs when the reasoning
    contains one of ``_WINDOW_TARGETED_VERBS``. Patterns come from the
    module-level ``_REASONING_MISMATCH_PATTERNS``; adding a new pattern
    is one tuple append.

    Returns:
        A human-readable mismatch description, or None when there is no
        detectable disagreement (including when the foreground window
        could not be determined).
    """
    raw = action.get('Reasoning') or action.get('reasoning')
    if not isinstance(raw, str) or not raw:
        return None
    reasoning = raw.lower()
    if not any(verb in reasoning for verb in _WINDOW_TARGETED_VERBS):
        return None
    active = get_active_window_info()
    if not active:
        return None
    active_lower = active.lower()
    for reasoning_kw, window_kw in _REASONING_MISMATCH_PATTERNS:
        if reasoning_kw in reasoning and window_kw not in active_lower:
            return (f"VLM thinks {reasoning_kw.title()} but active window "
                    f"is: {active}")
    return None
def execute_action(action: dict, tier: str, *,
                   window_handle: Optional[int] = None,
                   verify: bool = False,
                   if_occluded: str = 'skip',
                   safety: bool = False) -> dict:
    """
    Execute a single VLM action (click, type, key, etc.).

    Includes active window validation — if the VLM's reasoning mentions
    a window name that doesn't match the actual foreground window,
    the action is flagged via 'window_mismatch' and a warning is logged
    (the action itself is still executed).

    Phase 4 of vlm_best_of_all_worlds_plan.md §3 added the per-window
    keyword arguments below. All are backward-compatible — a caller
    passing only ``(action, tier)`` gets none of the optional behaviour.

    Args:
        action: dict with 'action', optionally 'coordinate' (in
            window-local 0-1000 norm space when ``window_handle`` is
            set; in screen-pixel space otherwise), 'text', 'value',
            'path', 'reasoning'. NOTE: when ``window_handle`` is set
            this dict is mutated in place by the coordinate translation.
        tier: 'inprocess'; any other value is dispatched over HTTP.
        window_handle: HWND from
            :func:`integrations.remote_desktop.window_capture.list_windows`.
            When set, ``coordinate`` is treated as window-local 0-1000
            normalized space and translated to current screen coords
            via the window's freshly-snapshotted rect (handles windows
            moved between capture and click).
        verify: when True, take a pre/post screenshot diff and retry
            once with a nudge if no visible change occurred. Only
            effective for tier == 'inprocess': the pre-screenshot is
            not taken for other tiers, so no verification runs.
        if_occluded: policy for minimized / non-visible windows:
            ``'skip'`` (default) — return status='window_minimized'
            ``'foreground'`` — SetForegroundWindow first, then click
            ``'force'``      — click regardless (PrintWindow-captured
                               click target may underlie another window)
        safety: opt-in safety layer (Phase 6 of vlm_best_of_all_worlds_plan
            §5). When True, runs the action through the SessionGuard
            (per-session cap + per-second throttle) and the
            WindowBlocklist before executing, and writes audit records
            for refusals and for successful actions. Call sites that
            don't pass safety=True are unchanged.

    Returns:
        dict with 'output' and optionally 'error', 'window_mismatch',
        'status', 'window', 'verify_diff', 'verify_retried',
        'verify_error', 'verify_nudge_to', and 'safety_block' (when
        safety=True and a guard refused).
        NOTE(review): earlier docs also listed 'translated_from' /
        'translated_to'; in this file those are only written onto
        ``action`` (as '_translated_from' / '_translated_to'), not onto
        the result — confirm against callers.
    """
    # Probe first: compares the VLM's reasoning with the real foreground
    # window. The outcome is only attached after the action has run.
    _mismatch = _check_reasoning_mismatch(action)

    # Phase 4: per-window translation + occlusion handling. Mutates
    # action['coordinate'] in place when needed; returns an early
    # status dict when the window can't be acted on safely.
    _window_meta = None
    if window_handle is not None:
        _window_meta, _early = _prepare_window_for_action(
            window_handle, action, if_occluded)
        if _early is not None:
            if safety:
                # Audit the refusal; the early status doubles as reason.
                _emit_audit(action, _early, _window_meta, None,
                            block_reason=_early.get('status'))
            return _early

    # Phase 6: safety guards run BEFORE any pyautogui call so a refusal
    # never reaches the user's screen. Order matters — session-level
    # rate cap is cheapest, run first; window blocklist needs window
    # metadata so runs second.
    if safety:
        _block = _check_safety(_window_meta)
        if _block is not None:
            _result = {
                'output': '', 'status': 'safety_blocked',
                'error': _block, 'safety_block': _block,
            }
            if _window_meta is not None:
                _result['window'] = _window_meta
            _emit_audit(action, _result, _window_meta, None,
                        block_reason=_block)
            return _result

    # Phase 4: pre-action screenshot for verify=True diff.
    _pre_b64 = None
    if verify and tier == 'inprocess':
        try:
            _pre_b64 = take_screenshot('inprocess')
        except Exception as e:
            # No baseline means verification is silently skipped below.
            logger.debug(f"verify pre-screenshot skipped: {e}")

    if tier == 'inprocess':
        result = _execute_inprocess(action)
    else:
        result = _execute_http(action)

    if _mismatch:
        result['window_mismatch'] = _mismatch
        # Logged on the parent 'hevolve.vlm' logger rather than this
        # module's own logger.
        import logging
        logging.getLogger('hevolve.vlm').warning(f"[WINDOW-MISMATCH] {_mismatch}")

    # Phase 4: surface window metadata so the loop's caller can audit.
    if _window_meta is not None:
        result.setdefault('window', _window_meta)

    # Phase 4: post-click verify with one nudge retry. Skipped when the
    # action reported an error or no pre-screenshot was captured.
    if _pre_b64 is not None and result.get('error') is None:
        result = _post_click_verify(
            action, result, _pre_b64,
            tier=tier, window_meta=_window_meta)

    # Phase 6: record the action in the session guard + audit log.
    # Only record on a successful (non-error) attempt — refusals were
    # logged above and don't count against the session cap.
    if safety and result.get('error') is None:
        try:
            from integrations.vlm.safety import get_session_guard
            get_session_guard().record()
        except Exception as e:
            logger.debug(f"safety: session guard record failed: {e}")
        _emit_audit(action, result, _window_meta, _pre_b64)

    return result
313# ─── Phase 6 helper plumbing ──────────────────────────────────────────
def _check_safety(window_meta):
    """Consult the session rate guard, then the window blocklist.

    Returns the first block-reason string produced, or None when the
    action may proceed. If the safety module cannot be imported the
    check is skipped and None is returned (logged at debug level).
    """
    try:
        from integrations.vlm.safety import (
            get_session_guard, is_window_blocked)
    except Exception as exc:
        logger.debug(f"safety module unavailable: {exc}")
        return None
    else:
        # Rate guard first; the blocklist is only consulted if it passes.
        guard_reason = get_session_guard().check()
        if guard_reason is None:
            return is_window_blocked(window_meta)
        return guard_reason
def _emit_audit(action, result, window_meta, screenshot_b64,
                block_reason=None):
    """Write one audit record for an action attempt, best-effort.

    Any failure (safety module missing, logger error) is logged at debug
    level and swallowed — auditing must NOT break the action path.
    """
    try:
        from integrations.vlm.safety import get_audit_logger
        audit = get_audit_logger()
        audit.log(
            action,
            result,
            window_meta=window_meta,
            screenshot_b64=screenshot_b64,
            block_reason=block_reason,
        )
    except Exception as exc:
        logger.debug(f"audit log failed: {exc}")
344# ─── Phase 4 helpers (per-window translation + post-click verify) ────
def _prepare_window_for_action(window_handle: int, action: dict,
                               if_occluded: str):
    """Refresh the window's rect, decide if it can be acted on, and
    translate action's window-local 0-1000 coords into screen pixels
    in place. Returns ``(window_meta, early_result_or_None)``.

    When the second tuple element is non-None, ``execute_action``
    returns it immediately without touching pyautogui — the window
    can't be acted on safely. Early statuses produced here:
    'window_capture_unavailable', 'window_destroyed',
    'window_offscreen' and 'window_minimized'.

    Args:
        window_handle: native window handle to refresh.
        action: action dict; on translation 'coordinate' is replaced and
            '_translated_from' / '_translated_to' are added IN PLACE.
        if_occluded: 'skip' returns early for a minimized / non-visible
            window; 'foreground' and 'force' both attempt to bring it
            forward first. Any other value proceeds without doing so.
    """
    try:
        from integrations.remote_desktop.window_capture import (
            WindowEnumerator, WindowInfo)
    except ImportError as e:
        logger.debug(f"window_capture unavailable: {e}")
        return None, {
            'output': '', 'status': 'window_capture_unavailable',
            'error': f'window_capture import failed: {e}',
        }

    enum = WindowEnumerator()
    # Only the handle is known here; the placeholder fields are expected
    # to be filled in by the refresh.
    fresh = enum.refresh_window_info(WindowInfo(
        hwnd=window_handle, title='', process_name='',
        pid=0, rect=(0, 0, 0, 0)))
    if fresh is None:
        return None, {
            'output': '', 'status': 'window_destroyed',
            'error': f'hwnd={window_handle} no longer exists',
        }
    # rect is used as (x, y, width, height) throughout this function.
    wx, wy, ww, wh = fresh.rect
    if ww <= 0 or wh <= 0:
        return fresh.to_dict(), {
            'output': '', 'status': 'window_offscreen',
            'error': f'window rect collapsed to {fresh.rect}',
            'window': fresh.to_dict(),
        }
    # Occlusion / minimized handling per policy.
    needs_foreground = fresh.minimized or not fresh.visible
    if needs_foreground:
        if if_occluded == 'skip':
            return fresh.to_dict(), {
                'output': '', 'status': 'window_minimized',
                'error': 'window minimized; pass if_occluded="foreground" '
                         'to bring it forward first',
                'window': fresh.to_dict(),
            }
        if if_occluded in ('foreground', 'force'):
            _bring_foreground(window_handle)
    # Translate window-local 0-1000 normalized coords → screen pixels.
    coord = action.get('coordinate')
    if coord and isinstance(coord, (list, tuple)) and len(coord) >= 2:
        nx, ny = coord[0], coord[1]
        # VLM output can be malformed (e.g. strings); comparing those
        # against ints would raise TypeError, so leave them untouched
        # for the executor's own validation / error reporting.
        numeric = (isinstance(nx, (int, float))
                   and isinstance(ny, (int, float)))
        if numeric and 0 <= nx <= 1000 and 0 <= ny <= 1000:
            # Clamp to the last pixel INSIDE the window: norm 1000
            # would otherwise land on wx + ww / wy + wh, one pixel past
            # the right / bottom edge.
            sx = wx + min(ww - 1, int(nx * ww / 1000))
            sy = wy + min(wh - 1, int(ny * wh / 1000))
            action['_translated_from'] = (nx, ny)
            action['coordinate'] = [sx, sy]
            action['_translated_to'] = (sx, sy)
        # else: out-of-range norm coords → caller passed screen pixels;
        # leave alone and let the action execute as-is.
    return fresh.to_dict(), None
def _bring_foreground(hwnd: int) -> None:
    """Restore and foreground a window so it can receive the click.

    Calls ShowWindow(SW_RESTORE) followed by SetForegroundWindow. This
    is a no-op off Windows and best-effort on it — Windows blocks
    SetForegroundWindow from non-foreground processes in many cases, so
    callers shouldn't assume it always works. Failures are logged at
    debug level and never raised.
    """
    if sys.platform == 'win32':
        try:
            import ctypes
            SW_RESTORE = 9
            user32 = ctypes.windll.user32
            handle = int(hwnd)
            user32.ShowWindow(handle, SW_RESTORE)
            user32.SetForegroundWindow(handle)
            # Short pause: SetForegroundWindow is async, so the click
            # could otherwise arrive before the new foreground window
            # has been composited.
            time.sleep(0.10)
        except Exception as exc:
            logger.debug(f"bring-foreground hwnd={hwnd} failed: {exc}")
# Tunables for _post_click_verify, named so that reviewers (and tests)
# don't have to guess what 0.005 / 16 / 50 mean.

#: Fraction of changed pixels below which the screen counts as
#: "unchanged", which triggers a nudge retry. 0.5% absorbs JPEG noise
#: on a static frame and small cursor sprites without false-triggering
#: on real UI updates (button press → dialog → > 5%).
VERIFY_DIFF_THRESHOLD: float = 0.005

#: Per-pixel grayscale delta above which a pixel counts as "changed".
#: Chosen to absorb JPEG-quality-70 quantization noise (typically < 8).
VERIFY_PIXEL_NOISE_FLOOR: int = 16

#: Distance, in screen px, to shift the click on a no-change retry.
#: Half a typical button width — far enough to escape a missed edge,
#: near enough to stay inside the same UI element.
VERIFY_NUDGE_PX: int = 50
def _post_click_verify(action: dict, result: dict, pre_b64: str, *,
                       tier: str, window_meta: Optional[dict] = None) -> dict:
    """Take a post-action screenshot, diff it against the pre-action
    one, and if no visible change occurred, retry the action once with
    the x coordinate nudged by VERIFY_NUDGE_PX.

    Annotates and returns ``result``:

    * 'verify_diff' — changed-pixel fraction rounded to 3 places, or
      None when the post-screenshot could not be taken.
    * 'verify_retried' — whether a nudged retry was issued.
    * 'verify_nudge_to' — the nudged coordinate (only when retried).
    * 'verify_error' — set only when the post-screenshot failed.

    The retry's own result is discarded and the screen is not
    re-verified afterwards. ``window_meta`` is accepted for interface
    compatibility and is not used here.
    """
    try:
        time.sleep(0.20)  # let the GUI settle before re-snapshot
        post_b64 = take_screenshot(tier)
    except Exception as e:
        # Surface the failure loudly — verification is a contract,
        # not a courtesy. WARNING (not debug) so users notice when
        # the screenshot path is broken; downstream callers can read
        # verify_error and decide whether to trust the action result.
        logger.warning(
            f"verify post-screenshot failed - cannot detect no-op clicks "
            f"this iteration: {e}")
        result['verify_diff'] = None
        result['verify_error'] = f'post-screenshot failed: {e}'
        result['verify_retried'] = False
        return result

    diff = _quick_image_diff(pre_b64, post_b64)
    result['verify_diff'] = round(diff, 3)

    # A nudge only makes sense when nothing visibly changed AND the
    # action carries a coordinate to shift.
    coord = action.get('coordinate')
    has_coord = bool(
        coord and isinstance(coord, (list, tuple)) and len(coord) >= 2)
    if not (diff < VERIFY_DIFF_THRESHOLD and has_coord):
        result['verify_retried'] = False
        return result

    nudged = [int(coord[0]) + VERIFY_NUDGE_PX, int(coord[1])]
    # Shallow copy so the caller's action keeps its original coordinate.
    nudged_action = dict(action, coordinate=nudged)
    # Report the configured distance rather than a hard-coded "50" so
    # the log stays truthful when VERIFY_NUDGE_PX is tuned.
    logger.info(
        f"verify: no visible change after click @ {coord}; "
        f"retrying with {VERIFY_NUDGE_PX}-px nudge → {nudged}")
    try:
        if tier == 'inprocess':
            _execute_inprocess(nudged_action)
        else:
            _execute_http(nudged_action)
    except Exception as e:
        logger.debug(f"verify-retry failed: {e}")
    # Marked as retried even if the retry raised: an attempt was made.
    result['verify_retried'] = True
    result['verify_nudge_to'] = nudged
    return result
def _quick_image_diff(b64_a: str, b64_b: str) -> float:
    """Fraction of significantly-changed pixels between two base64
    images. Both are downsized to 64×64 grayscale for speed (each
    image → 4096 bytes → 4096 cheap subtractions).

    Returns:
        0.0 (identical) to 1.0 (every pixel differs by more than
        VERIFY_PIXEL_NOISE_FLOOR). When the comparison itself fails
        (Pillow missing, undecodable input) 1.0 is returned: the caller
        treats a LOW diff as "the click did nothing" and re-clicks with
        a nudge, so an unknown diff must not look like "no change".
    """
    try:
        from PIL import Image
        ima = Image.open(io.BytesIO(base64.b64decode(b64_a))).convert('L').resize((64, 64))
        imb = Image.open(io.BytesIO(base64.b64decode(b64_b))).convert('L').resize((64, 64))
        ba = ima.tobytes()
        bb = imb.tobytes()
        # Per-pixel noise floor absorbs JPEG-compression noise on
        # unchanged regions (see VERIFY_PIXEL_NOISE_FLOOR).
        changed = sum(1 for a, b in zip(ba, bb)
                      if abs(a - b) > VERIFY_PIXEL_NOISE_FLOOR)
        return changed / len(ba)
    except Exception:
        # Conservative: report "changed" so an undecodable screenshot
        # never triggers a spurious nudge re-click.
        return 1.0
def _execute_inprocess(action: dict) -> dict:
    """Execute one action locally: pyautogui for GUI actions, plain
    Python / the shared shell handler for file, wait and shell actions.

    Args:
        action: dict with 'action' plus, depending on the action:
            'coordinate' ([x, y] screen pixels), 'text' (or 'value'),
            'startCoordinate' / 'endCoordinate' (or 'coordinate_end')
            for drags, 'duration' for wait, 'path' / 'content' /
            'source_path' / 'destination_path' for file actions and
            'command' for shell.

    Returns:
        dict with 'output' and, on failure, 'error'. 'shell' and
        non-Windows 'open_file_gui' also set 'status' ('ok' / 'error');
        'screenshot' adds 'base64_image'. Never raises: unexpected
        exceptions are logged and returned as 'error'.
    """
    act = action.get('action', '')
    coord = action.get('coordinate')
    text = action.get('text', action.get('value', ''))

    # Validate coordinate format (VLM output can be malformed)
    if coord is not None:
        if not isinstance(coord, (list, tuple)) or len(coord) < 2:
            return {'output': '', 'error': f'Invalid coordinate format: {coord}'}

    # File/wait/shell actions don't need pyautogui
    _NO_GUI_ACTIONS = {
        'list_folders_and_files', 'read_file_and_understand', 'write_file',
        'Open_file_and_copy_paste', 'open_file_gui', 'wait', 'shell',
    }

    if act not in _NO_GUI_ACTIONS and pyautogui is None:
        return {'output': '', 'error': 'pyautogui not installed'}

    try:
        # NOTE(review): for the pointer actions below, a missing
        # coordinate performs NO click/move yet still returns a
        # success-looking output ("Clicked at None"). Kept as-is;
        # confirm whether callers rely on it.
        if act == 'left_click':
            if coord:
                pyautogui.click(coord[0], coord[1])
            return {'output': f'Clicked at {coord}'}

        elif act == 'right_click':
            if coord:
                pyautogui.rightClick(coord[0], coord[1])
            return {'output': f'Right-clicked at {coord}'}

        elif act == 'double_click':
            if coord:
                pyautogui.doubleClick(coord[0], coord[1])
            return {'output': f'Double-clicked at {coord}'}

        elif act == 'middle_click':
            if coord:
                pyautogui.middleClick(coord[0], coord[1])
            return {'output': f'Middle-clicked at {coord}'}

        elif act == 'hover' or act == 'mouse_move':
            if coord:
                pyautogui.moveTo(coord[0], coord[1])
            return {'output': f'Moved to {coord}'}

        elif act == 'type':
            if text:
                # Use clipboard for reliability (same as OmniParser)
                if pyperclip is not None:
                    pyperclip.copy(text)
                    # Paste is Cmd+V on macOS; Ctrl+V there pastes
                    # nothing, silently dropping the text.
                    paste_modifier = (
                        'command' if sys.platform == 'darwin' else 'ctrl')
                    pyautogui.hotkey(paste_modifier, 'v')
                else:
                    pyautogui.typewrite(text, interval=0.012)
            return {'output': f'Typed: {text[:50]}...'}

        elif act == 'key':
            if text:
                pyautogui.press(text)
            return {'output': f'Pressed key: {text}'}

        elif act == 'hotkey':
            if text:
                # Accept either a list of keys or a 'ctrl+shift+s' string.
                if isinstance(text, list):
                    keys = [str(k).strip() for k in text]
                else:
                    keys = [k.strip() for k in str(text).split('+')]
                pyautogui.hotkey(*keys)
            return {'output': f'Hotkey: {text}'}

        elif act == 'left_click_drag':
            start = action.get('startCoordinate', coord)
            end = action.get('endCoordinate', action.get('coordinate_end'))
            if start and end:
                pyautogui.moveTo(start[0], start[1])
                # drag() takes a relative offset, hence end - start.
                pyautogui.drag(end[0] - start[0], end[1] - start[1], duration=0.5)
            return {'output': f'Dragged from {start} to {end}'}

        elif act == 'screenshot':
            return {'output': 'Screenshot taken', 'base64_image': take_screenshot('inprocess')}

        elif act == 'wait':
            wait_time = action.get('duration', 2)
            time.sleep(wait_time)
            return {'output': f'Waited {wait_time}s'}

        elif act == 'cursor_position':
            pos = pyautogui.position()
            return {'output': f'Cursor at ({pos.x}, {pos.y})'}

        elif act == 'list_folders_and_files':
            path = action.get('path', '.')
            try:
                entries = os.listdir(path)
                # Cap the listing at 100 entries.
                return {'output': '\n'.join(entries[:100])}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        elif act == 'read_file_and_understand':
            path = action.get('path', '')
            try:
                with open(path, 'r', encoding='utf-8', errors='replace') as f:
                    # Only the first 10000 characters are returned.
                    content = f.read(10000)
                return {'output': content}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        elif act == 'write_file':
            path = action.get('path', '')
            content = action.get('content', text)
            try:
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(content)
                return {'output': f'Written to {path}'}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        elif act == 'open_file_gui':
            # Open a file / app in the OS default handler. On Windows this is
            # os.startfile (uses ShellExecute). On Linux/Mac the equivalent is
            # `xdg-open` / `open`, which aren't available as a Python API —
            # route through the shell handler so the same denylist applies.
            path = action.get('path', '') or text
            if not path:
                return {'output': '', 'error': 'open_file_gui needs a path'}
            if sys.platform == 'win32':
                try:
                    os.startfile(path)  # type: ignore[attr-defined]
                    return {'output': f'Opened {path}'}
                except OSError as e:
                    return {'output': '', 'error': f'open_file_gui failed: {e}'}
            # Non-Windows: delegate to shell so we reuse the denylist.
            # The path comes from VLM output, so quote it: unquoted it
            # breaks on spaces and lets `;` / `$(...)` smuggle extra
            # commands into the shell string.
            import shlex
            quoted_path = shlex.quote(str(path))
            shell_cmd = (
                f'open {quoted_path}' if sys.platform == 'darwin'
                else f'xdg-open {quoted_path}'
            )
            from core.safe_hartos_attr import safe_hartos_attr
            _handle_shell_command_tool = safe_hartos_attr(
                '_handle_shell_command_tool')
            if _handle_shell_command_tool is None:
                logger.info(
                    "open_file_gui blocked: HARTOS _handle_shell_command_tool "
                    "not yet resolvable (loader still init). Failing closed "
                    "to preserve denylist guarantees.",
                )
                return {
                    'output': '',
                    'error': 'open_file_gui unavailable: HARTOS still loading',
                    'status': 'error',
                }
            result_text = _handle_shell_command_tool(shell_cmd)
            logger.info(
                "open_file_gui dispatched: cmd=%r exit_signature=%r",
                shell_cmd, (result_text or '')[:40],
            )
            ok = isinstance(result_text, str) and result_text.startswith('Exit code: 0')
            return {
                'output': result_text,
                'status': 'ok' if ok else 'error',
            }

        elif act == 'shell':
            # Deterministic command execution inside the VLM loop. The ONLY
            # implementation lives in hart_intelligence_entry._handle_shell_command_tool
            # so the denylist + timeout + truncation + shell-selector parsing all
            # apply identically to Shell_Command and this VLM-emitted action. If
            # that import fails (stripped frozen build / circular import), we
            # fail CLOSED rather than falling back to a bare subprocess.run —
            # a bare fallback would skip the denylist and expose a command
            # injection channel that silently weakens safety posture.
            cmd = action.get('command', text)
            if not cmd:
                return {'output': '', 'error': 'shell action needs command string'}
            from core.safe_hartos_attr import safe_hartos_attr
            _handle_shell_command_tool = safe_hartos_attr(
                '_handle_shell_command_tool')
            if _handle_shell_command_tool is None:
                logger.info(
                    "VLM shell action blocked: HARTOS "
                    "_handle_shell_command_tool not yet resolvable. "
                    "Failing closed (denylist unavailable) — cmd=%r",
                    (cmd or '')[:80],
                )
                return {
                    'output': '',
                    'error': (
                        "shell action unavailable: HARTOS still loading. "
                        "Refusing to run without the shared denylist."
                    ),
                    'status': 'error',
                }
            logger.info(
                "VLM shell action dispatching: cmd=%r",
                (cmd or '')[:80],
            )
            result_text = _handle_shell_command_tool(cmd)
            # _handle_shell_command_tool returns 'Exit code: N\n<body>' on
            # success and 'Shell_Command refused: ...' / 'Shell_Command error: ...'
            # on refusal or failure. Classify anything other than a clean
            # 'Exit code: 0' prefix as a non-success so the VLM loop's
            # consecutive-action-error counter can back off.
            ok = isinstance(result_text, str) and result_text.startswith('Exit code: 0')
            return {
                'output': result_text,
                'status': 'ok' if ok else 'error',
            }

        elif act == 'Open_file_and_copy_paste':
            src = action.get('source_path', '')
            dst = action.get('destination_path', '')
            try:
                # Text-mode copy: undecodable bytes are replaced, so
                # this is not a byte-exact copy for binary files.
                with open(src, 'r', encoding='utf-8', errors='replace') as f:
                    content = f.read()
                with open(dst, 'w', encoding='utf-8') as f:
                    f.write(content)
                return {'output': f'Copied {src} → {dst}'}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        else:
            return {'output': '', 'error': f'Unknown action: {act}'}

    except Exception as e:
        logger.error(f"Action execution error ({act}): {e}")
        return {'output': '', 'error': str(e)}
def _execute_http(action: dict) -> dict:
    """POST the action as JSON to localhost:5001/execute and return the
    server's decoded JSON reply.

    Never raises: any failure (connection, non-2xx status, invalid JSON)
    is logged and returned as ``{'output': '', 'error': <message>}``.
    """
    endpoint = 'http://localhost:5001/execute'
    try:
        response = pooled_post(endpoint, json=action, timeout=30)
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:
        logger.error(f"HTTP action execution error: {exc}")
        return {'output': '', 'error': str(exc)}
    return payload