Coverage for integrations / vlm / local_computer_tool.py: 68.0%

331 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2local_computer_tool.py - Synchronous pyautogui/HTTP wrapper for VLM actions. 

3 

4Replaces OmniParser's Crossbar RPC-based ComputerTool with direct local execution. 

5Supports same action types as OmniParser computer.py (key, type, left_click, etc.). 

6 

7Tier 'inprocess': direct pyautogui calls (no network) 

8Tier 'http': HTTP to localhost:5001 (omnitool-gui Flask server) 

9""" 

10 

11import os 

12import io 

13import sys 

14import time 

15import base64 

16import logging 

17from typing import Optional 

18 

logger = logging.getLogger('hevolve.vlm.computer_tool')

# Default long edge (pixels) for screenshots handed to the VLM.
_DEFAULT_VLM_IMG_LONG_EDGE = 1280


def _read_vlm_long_edge(default: int = _DEFAULT_VLM_IMG_LONG_EDGE) -> int:
    """Read HEVOLVE_VLM_IMG_LONG_EDGE from the environment.

    Returns ``default`` when the variable is unset, is not an integer,
    or is not strictly positive. A malformed value is logged as a
    warning instead of raising, so a typo in the environment cannot
    make this module fail at import time.
    """
    raw = os.environ.get('HEVOLVE_VLM_IMG_LONG_EDGE')
    if raw is None:
        return default
    try:
        value = int(raw)
    except ValueError:
        logger.warning(
            "Ignoring non-integer HEVOLVE_VLM_IMG_LONG_EDGE=%r; using %d",
            raw, default)
        return default
    if value <= 0:
        logger.warning(
            "Ignoring non-positive HEVOLVE_VLM_IMG_LONG_EDGE=%r; using %d",
            raw, default)
        return default
    return value


# VLM screenshot long-edge — aspect ratio is PRESERVED during resize.
# Old behavior (1024×576 forced) squished 16:10 screens into 16:9 and the
# VLM's vertical coordinates drifted accordingly. Qwen3-VL handles 1280px
# long edge comfortably; longer is better grounding, shorter is faster.
# HEVOLVE_VLM_IMG_LONG_EDGE lets callers tune this.
VLM_IMG_LONG_EDGE = _read_vlm_long_edge()
# Legacy constants kept for backward compat with existing call sites
# and for tests that reference them. The *real* dimensions are computed
# per-screenshot from the actual screen aspect ratio.
VLM_IMG_W = VLM_IMG_LONG_EDGE
VLM_IMG_H = int(VLM_IMG_LONG_EDGE * 9 / 16)

32 

33# Module-level imports for mockability (pyautogui is optional) 

34try: 

35 import pyautogui 

36except ImportError: 

37 pyautogui = None 

38 

39 

40# Single source of truth for SetProcessDpiAwareness — see 

41# core/dpi_awareness.py for the rationale (was duplicated in 

42# remote_desktop/window_capture.py until 2026-05-03 DRY pass). 

43from core.dpi_awareness import ensure_dpi_aware as _ensure_dpi_aware 

44 

45# Call at import time so every screenshot/click path is DPI-consistent 

46_ensure_dpi_aware() 

47 

48try: 

49 import pyperclip 

50except ImportError: 

51 pyperclip = None 

52 

53from core.http_pool import pooled_get, pooled_post 

54 

# Names of every action the VLM loop may emit. The set mirrors the
# Action literal of OmniParser's computer.py, plus one Nunba extension:
# 'shell', which lets the loop run a deterministic command instead of
# grounding a GUI interaction for work that can be done programmatically
# (e.g., launching an app, opening a file in its default handler).
SUPPORTED_ACTIONS = {
    # keyboard
    'key',
    'type',
    'hotkey',
    # pointer
    'mouse_move',
    'hover',
    'left_click',
    'left_click_drag',
    'right_click',
    'middle_click',
    'double_click',
    'cursor_position',
    # screen / timing
    'screenshot',
    'wait',
    # files
    'list_folders_and_files',
    'Open_file_and_copy_paste',
    'open_file_gui',
    'write_file',
    'read_file_and_understand',
    # Nunba extension
    'shell',
}

66 

67 

def take_screenshot(tier: str) -> str:
    """
    Capture the screen and return it as a base64-encoded JPEG.

    In-process captures are downscaled so the long edge is at most
    VLM_IMG_LONG_EDGE while PRESERVING aspect ratio, so the VLM's
    normalized coordinates map back to the physical screen without
    distortion. Images already at or below that size are not upscaled.

    Args:
        tier: 'inprocess' captures via pyautogui directly; any other
            value fetches the image from the HTTP server at
            localhost:5001.
    Returns:
        Base64-encoded JPEG screenshot string. For the HTTP tier this is
        the response's 'base64_image' field, falling back to 'image',
        falling back to an empty string.
    Raises:
        ImportError: tier is 'inprocess' and pyautogui is not installed.
        Exception: whatever the HTTP client raises for a failed request
            (``raise_for_status`` is called on the response).
    """
    if tier != 'inprocess':
        resp = pooled_get('http://localhost:5001/screenshot', timeout=15)
        resp.raise_for_status()
        data = resp.json()
        return data.get('base64_image', data.get('image', ''))

    if pyautogui is None:
        raise ImportError("pyautogui is required for in-process screenshots")
    img = pyautogui.screenshot()
    from PIL import Image

    w, h = img.size
    long_edge = max(w, h)
    if long_edge > VLM_IMG_LONG_EDGE:
        scale = VLM_IMG_LONG_EDGE / long_edge
        # max(1, ...) keeps a degenerate axis from collapsing to 0 px.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        img = img.resize(new_size, Image.LANCZOS)

    # JPEG has no alpha channel or palette: Pillow refuses to save
    # RGBA / LA / P images as JPEG. Normalize to RGB first so a capture
    # backend that returns one of those modes doesn't make the save fail.
    if img.mode != 'RGB':
        img = img.convert('RGB')

    buf = io.BytesIO()
    img.save(buf, format='JPEG', quality=70)
    return base64.b64encode(buf.getvalue()).decode('ascii')

103 

104 

def get_active_window_info():
    """Ask the OS which window/process is in the foreground.

    Used to prevent the VLM misidentifying windows (e.g. Claude Code as
    MobaXterm).

    Returns:
        A non-empty string describing the foreground window, or None when
        it cannot be determined. The content depends on the platform:
        Windows yields ``"<process name>: <window title>"`` (PowerShell),
        Linux yields the window name reported by ``xdotool``, macOS yields
        the name of the frontmost process (``osascript``). Any other
        platform, a non-zero exit status, empty output, a timeout (3 s),
        or any exception all yield None.
    """
    try:
        import platform
        import subprocess
        from core.subprocess_safe import hidden_popen_kwargs

        system = platform.system()
        extra_kwargs = {}
        if system == 'Windows':
            cmd = ['powershell', '-Command',
                   '(Get-Process | Where-Object {$_.MainWindowHandle -eq '
                   '(Add-Type -MemberDefinition \'[DllImport("user32.dll")] '
                   'public static extern IntPtr GetForegroundWindow();\' '
                   '-Name W -PassThru)::GetForegroundWindow()}).ProcessName + '
                   '": " + (Get-Process | Where-Object {$_.MainWindowHandle -eq '
                   '(Add-Type -MemberDefinition \'[DllImport("user32.dll")] '
                   'public static extern IntPtr GetForegroundWindow();\' '
                   '-Name W2 -PassThru)::GetForegroundWindow()}).MainWindowTitle']
            # CREATE_NO_WINDOW prevents the powershell child from popping a
            # cmd console on every call (this fires per VLM probe).
            extra_kwargs = hidden_popen_kwargs()
        elif system == 'Linux':
            cmd = ['xdotool', 'getactivewindow', 'getwindowname']
        elif system == 'Darwin':
            cmd = ['osascript', '-e',
                   'tell application "System Events" to get name of first '
                   'process whose frontmost is true']
        else:
            return None

        r = subprocess.run(cmd, capture_output=True, text=True, timeout=3,
                           **extra_kwargs)
        title = r.stdout.strip()
        # Empty output is treated as "unknown" on every platform so callers
        # get a single falsy sentinel (None) rather than '' on some OSes.
        if r.returncode == 0 and title:
            return title
    except Exception:
        # Deliberately best-effort: this is an advisory probe and must
        # never break the action path (missing tool, timeout, etc.).
        return None
    return None

144 

145 

#: Keyword pairs consulted by the reasoning-mismatch detector, each
#: written as ``(reasoning_substring, foreground_window_substring)``.
#: An action is flagged when the VLM's (lower-cased) reasoning contains
#: the first element while the (lower-cased) foreground-window string
#: does not contain the second. The first matching pair wins, so list
#: more specific patterns earlier. To extend, append another tuple.
_REASONING_MISMATCH_PATTERNS = (
    ('mobaxt', 'mobaxt'),
    ('notepad', 'notepad'),
)

#: Phrases in the VLM's reasoning that suggest it is acting on a
#: particular window. The (slow) get_active_window_info probe only
#: runs when one of these appears — plain typing or a generic click
#: somewhere mid-screen skips the probe entirely.
_WINDOW_TARGETED_VERBS = (
    'minimize',
    'close',
    'switch to',
    'click on',
)

161 

162 

def _check_reasoning_mismatch(action: dict) -> Optional[str]:
    """Detect when the VLM's stated reasoning contradicts the actual
    foreground window.

    The reasoning text is read from ``action['Reasoning']``, falling back
    to ``action['reasoning']`` when the former is missing or empty. VLM
    output can be malformed, so a ``None`` or non-string value is
    tolerated (coerced via ``str``) instead of raising.

    The foreground-window probe only runs when the reasoning contains one
    of ``_WINDOW_TARGETED_VERBS``; the comparison itself is driven by
    ``_REASONING_MISMATCH_PATTERNS`` (first matching pair wins).

    Args:
        action: the VLM action dict.
    Returns:
        A human-readable mismatch description, or None when there is no
        reasoning, no window-targeted verb, no foreground-window info, or
        no detectable disagreement.
    """
    raw = action.get('Reasoning') or action.get('reasoning') or ''
    if not isinstance(raw, str):
        raw = str(raw)
    reasoning = raw.lower()
    if not reasoning:
        return None
    if not any(verb in reasoning for verb in _WINDOW_TARGETED_VERBS):
        return None
    active = get_active_window_info()
    if not active:
        return None
    active_lower = active.lower()
    for reasoning_kw, window_kw in _REASONING_MISMATCH_PATTERNS:
        if reasoning_kw in reasoning and window_kw not in active_lower:
            return (f"VLM thinks {reasoning_kw.title()} but active window "
                    f"is: {active}")
    return None

189 

190 

def execute_action(action: dict, tier: str, *,
                   window_handle: Optional[int] = None,
                   verify: bool = False,
                   if_occluded: str = 'skip',
                   safety: bool = False) -> dict:
    """
    Execute a single VLM action (click, type, key, etc.).

    Includes active window validation — if the VLM's reasoning mentions
    a window name that doesn't match the actual foreground window,
    the action is flagged via 'window_mismatch' in the result (the
    action still runs; the flag is advisory).

    All keyword arguments are opt-in and backward-compatible: a caller
    passing only ``(action, tier)`` gets no window translation, no
    verification, no safety guards and no audit records.

    Args:
        action: dict with 'action', optionally 'coordinate' (in
            window-local 0-1000 norm space when ``window_handle`` is
            set; in screen-pixel space otherwise), 'text', 'value',
            'path', 'reasoning'. NOTE: when ``window_handle`` is set the
            dict is mutated in place (its 'coordinate' is rewritten to
            screen pixels).
        tier: 'inprocess' runs the action locally; any other value
            forwards it over HTTP.
        window_handle: HWND from
            :func:`integrations.remote_desktop.window_capture.list_windows`.
            When set, ``coordinate`` is treated as window-local 0-1000
            normalized space and translated to current screen coords
            via the window's freshly-snapshotted rect (handles windows
            moved between capture and click).
        verify: when True (and tier is 'inprocess'), take a pre/post
            screenshot diff and retry once with a nudge if no visible
            change occurred.
        if_occluded: policy for minimized / non-visible windows:
            ``'skip'`` (default) — return an early status dict
            ``'foreground'`` — bring the window forward first, then act
            ``'force'`` — act regardless
        safety: opt-in safety layer. When True, the action is checked by
            the session guard and the window blocklist before anything
            is executed, and one audit record is written per attempt
            (refused, failed or successful). Only successful attempts
            are recorded against the session guard.

    Returns:
        dict with 'output' and optionally 'error', 'window_mismatch',
        'status', 'window', 'verify_diff', 'verify_retried',
        'safety_block' (when safety=True and a guard refused).
    """
    _mismatch = _check_reasoning_mismatch(action)

    # Per-window translation + occlusion handling. Mutates
    # action['coordinate'] in place when needed; returns an early
    # status dict when the window can't be acted on safely.
    _window_meta = None
    if window_handle is not None:
        _window_meta, _early = _prepare_window_for_action(
            window_handle, action, if_occluded)
        if _early is not None:
            if safety:
                _emit_audit(action, _early, _window_meta, None,
                            block_reason=_early.get('status'))
            return _early

    # Safety guards run BEFORE any executor call so a refusal never
    # reaches the user's screen.
    if safety:
        _block = _check_safety(_window_meta)
        if _block is not None:
            _result = {
                'output': '', 'status': 'safety_blocked',
                'error': _block, 'safety_block': _block,
            }
            if _window_meta is not None:
                _result['window'] = _window_meta
            _emit_audit(action, _result, _window_meta, None,
                        block_reason=_block)
            return _result

    # Pre-action screenshot for the verify=True diff. Failure to capture
    # simply disables verification for this action.
    _pre_b64 = None
    if verify and tier == 'inprocess':
        try:
            _pre_b64 = take_screenshot('inprocess')
        except Exception as e:
            logger.debug(f"verify pre-screenshot skipped: {e}")

    if tier == 'inprocess':
        result = _execute_inprocess(action)
    else:
        result = _execute_http(action)

    if _mismatch:
        result['window_mismatch'] = _mismatch
        logging.getLogger('hevolve.vlm').warning(
            f"[WINDOW-MISMATCH] {_mismatch}")

    # Surface window metadata so the loop's caller can audit.
    if _window_meta is not None:
        result.setdefault('window', _window_meta)

    # Post-click verify with one nudge retry (skipped on executor error).
    if _pre_b64 is not None and result.get('error') is None:
        result = _post_click_verify(
            action, result, _pre_b64,
            tier=tier, window_meta=_window_meta)

    if safety:
        # Only a successful (non-error) attempt counts against the
        # session cap — refusals were audited above and returned early.
        if result.get('error') is None:
            try:
                from integrations.vlm.safety import get_session_guard
                get_session_guard().record()
            except Exception as e:
                logger.debug(f"safety: session guard record failed: {e}")
        # One audit record per attempt, success or failure, and never
        # when the caller did not opt into the safety layer.
        _emit_audit(action, result, _window_meta, _pre_b64)

    return result

311 

312 

313# ─── Phase 6 helper plumbing ────────────────────────────────────────── 

314 

def _check_safety(window_meta):
    """Consult the session guard, then the window blocklist.

    Args:
        window_meta: window metadata dict (or None) handed to the
            blocklist check.
    Returns:
        The block reason from whichever check refuses first (session
        guard is asked first), or None when both allow the action.
        NOTE: when the safety module cannot be imported this also returns
        None — i.e. the action is allowed — after a debug log line.
    """
    try:
        from integrations.vlm.safety import (
            get_session_guard, is_window_blocked)
    except Exception as exc:
        logger.debug(f"safety module unavailable: {exc}")
        return None

    guard_verdict = get_session_guard().check()
    if guard_verdict is None:
        # Session guard allows it; the blocklist has the final say.
        return is_window_blocked(window_meta)
    return guard_verdict

328 

329 

def _emit_audit(action, result, window_meta, screenshot_b64,
                block_reason=None):
    """Write one audit record through the safety module's audit logger.

    Strictly best-effort: any exception (import failure included) is
    logged at debug level and swallowed, so auditing can never break the
    action path.
    """
    try:
        from integrations.vlm.safety import get_audit_logger
        audit = get_audit_logger()
        audit.log(
            action,
            result,
            window_meta=window_meta,
            screenshot_b64=screenshot_b64,
            block_reason=block_reason,
        )
    except Exception as exc:
        logger.debug(f"audit log failed: {exc}")

342 

343 

344# ─── Phase 4 helpers (per-window translation + post-click verify) ──── 

345 

346 

def _prepare_window_for_action(window_handle: int, action: dict,
                               if_occluded: str):
    """Refresh the window's rect, decide if it can be acted on, and
    translate the action's window-local 0-1000 coords into screen pixels
    in place.

    Args:
        window_handle: handle of the target window.
        action: VLM action dict. When its 'coordinate' holds two numbers
            in the 0-1000 range, it is rewritten to screen pixels and
            '_translated_from' / '_translated_to' are added.
        if_occluded: 'skip' returns an early result for a minimized or
            non-visible window; 'foreground' and 'force' both try to
            bring it forward first. Any other value proceeds without
            touching the window.
    Returns:
        ``(window_meta, early_result_or_None)``. When the second element
        is non-None the caller must return it immediately without
        executing anything — the window can't be acted on safely.
    """
    try:
        from integrations.remote_desktop.window_capture import (
            WindowEnumerator, WindowInfo)
    except ImportError as e:
        logger.debug(f"window_capture unavailable: {e}")
        return None, {
            'output': '', 'status': 'window_capture_unavailable',
            'error': f'window_capture import failed: {e}',
        }

    enum = WindowEnumerator()
    fresh = enum.refresh_window_info(WindowInfo(
        hwnd=window_handle, title='', process_name='',
        pid=0, rect=(0, 0, 0, 0)))
    if fresh is None:
        return None, {
            'output': '', 'status': 'window_destroyed',
            'error': f'hwnd={window_handle} no longer exists',
        }
    wx, wy, ww, wh = fresh.rect
    if ww <= 0 or wh <= 0:
        return fresh.to_dict(), {
            'output': '', 'status': 'window_offscreen',
            'error': f'window rect collapsed to {fresh.rect}',
            'window': fresh.to_dict(),
        }
    # Occlusion / minimized handling per policy.
    needs_foreground = fresh.minimized or not fresh.visible
    if needs_foreground:
        if if_occluded == 'skip':
            return fresh.to_dict(), {
                'output': '', 'status': 'window_minimized',
                'error': 'window minimized; pass if_occluded="foreground" '
                         'to bring it forward first',
                'window': fresh.to_dict(),
            }
        if if_occluded in ('foreground', 'force'):
            _bring_foreground(window_handle)
    # Translate window-local 0-1000 normalized coords → screen pixels.
    coord = action.get('coordinate')
    if coord and isinstance(coord, (list, tuple)) and len(coord) >= 2:
        nx, ny = coord[0], coord[1]
        # VLM output can be malformed: only range-check real numbers
        # (bool is excluded on purpose). Anything else is left untouched
        # for the executor's own validation to reject.
        numeric = all(
            isinstance(v, (int, float)) and not isinstance(v, bool)
            for v in (nx, ny))
        if numeric and 0 <= nx <= 1000 and 0 <= ny <= 1000:
            # Clamp to the last pixel inside the rect: a normalized
            # value of exactly 1000 would otherwise land on x + width
            # (or y + height), one pixel OUTSIDE the window.
            sx = wx + min(ww - 1, int(nx * ww / 1000))
            sy = wy + min(wh - 1, int(ny * wh / 1000))
            action['_translated_from'] = (nx, ny)
            action['coordinate'] = [sx, sy]
            action['_translated_to'] = (sx, sy)
        # Otherwise: out-of-range norm coords → caller passed screen
        # pixels; leave alone and let the action execute as-is.
    return fresh.to_dict(), None

410 

411 

def _bring_foreground(hwnd: int) -> None:
    """Restore a window and request foreground focus for it (Windows only).

    Calls user32 ShowWindow(SW_RESTORE) followed by SetForegroundWindow,
    then pauses 100 ms. Best-effort: Windows blocks SetForegroundWindow
    from non-foreground processes in many cases, so callers shouldn't
    assume it always works. On any other platform this is a no-op, and
    any exception is logged at debug level and swallowed.
    """
    if sys.platform == 'win32':
        try:
            import ctypes
            user32 = ctypes.windll.user32
            handle = int(hwnd)
            restore_flag = 9  # SW_RESTORE
            user32.ShowWindow(handle, restore_flag)
            user32.SetForegroundWindow(int(hwnd))
            # SetForegroundWindow is async — without a short pause the
            # click can arrive before the new foreground window has been
            # composited.
            time.sleep(0.10)
        except Exception as exc:
            logger.debug(f"bring-foreground hwnd={hwnd} failed: {exc}")

429 

430 

# Tunables for _post_click_verify / _quick_image_diff. They are named
# constants so that neither reviewers nor tests need to guess what the
# bare numbers 0.005 / 16 / 50 stand for.

#: Changed-pixel fraction under which the screen counts as "unchanged",
#: which is what triggers the nudge retry. 0.5% is meant to cover JPEG
#: noise on a static frame and small cursor sprites without
#: false-triggering on real UI updates (button press → dialog → > 5%).
VERIFY_DIFF_THRESHOLD: float = 0.005

#: Grayscale delta a single pixel must exceed to count as "changed".
#: Chosen to absorb JPEG-quality-70 quantization noise (typically < 8).
VERIFY_PIXEL_NOISE_FLOOR: int = 16

#: Horizontal offset, in screen pixels, applied to the click on a
#: no-change retry. About half a typical button width: large enough to
#: escape a missed edge, small enough to stay inside the same UI element.
VERIFY_NUDGE_PX: int = 50

447 

448 

def _post_click_verify(action: dict, result: dict, pre_b64: str, *,
                       tier: str, window_meta: dict = None) -> dict:
    """Take a post-action screenshot, diff it against the pre-action one,
    and if no visible change occurred retry the action once, nudged
    VERIFY_NUDGE_PX pixels to the right.

    Args:
        action: the action that was just executed (not mutated here).
        result: the executor's result dict; annotated in place.
        pre_b64: base64 JPEG captured before the action ran.
        tier: executor tier, used for both the screenshot and the retry.
        window_meta: accepted for interface compatibility; not read here.
    Returns:
        ``result`` with these keys added:
        'verify_diff' (0.0–1.0, or None if the post-screenshot failed),
        'verify_retried' (bool), and when applicable 'verify_error',
        'verify_nudge_to', 'verify_retry_error'.
    """
    try:
        time.sleep(0.20)  # let the GUI settle before re-snapshot
        post_b64 = take_screenshot(tier)
    except Exception as e:
        # Surface the failure loudly — verification is a contract,
        # not a courtesy. WARNING (not debug) so users notice when
        # the screenshot path is broken; downstream callers can read
        # verify_error and decide whether to trust the action result.
        logger.warning(
            f"verify post-screenshot failed - cannot detect no-op clicks "
            f"this iteration: {e}")
        result['verify_diff'] = None
        result['verify_error'] = f'post-screenshot failed: {e}'
        result['verify_retried'] = False
        return result

    diff = _quick_image_diff(pre_b64, post_b64)
    result['verify_diff'] = round(diff, 3)
    result['verify_retried'] = False
    if diff >= VERIFY_DIFF_THRESHOLD:
        return result

    # No visible change — try one nudge. Only meaningful for actions
    # that carry a usable numeric coordinate.
    coord = action.get('coordinate')
    if not (coord and isinstance(coord, (list, tuple)) and len(coord) >= 2):
        return result
    try:
        nudged = [int(coord[0]) + VERIFY_NUDGE_PX, int(coord[1])]
    except (TypeError, ValueError):
        # Coordinate members aren't numbers; nothing sensible to nudge.
        return result

    nudged_action = dict(action, coordinate=nudged)
    logger.info(
        f"verify: no visible change after click @ {coord}; "
        f"retrying with {VERIFY_NUDGE_PX}-px nudge → {nudged}")
    retry_error = None
    try:
        if tier == 'inprocess':
            retry_result = _execute_inprocess(nudged_action)
        else:
            retry_result = _execute_http(nudged_action)
        if isinstance(retry_result, dict):
            retry_error = retry_result.get('error')
    except Exception as e:
        logger.debug(f"verify-retry failed: {e}")
        retry_error = str(e)
    result['verify_retried'] = True
    result['verify_nudge_to'] = nudged
    if retry_error:
        # Let callers see that the nudge attempt itself failed instead
        # of silently discarding the retry's outcome.
        result['verify_retry_error'] = retry_error
    return result

497 

498 

def _quick_image_diff(b64_a: str, b64_b: str) -> float:
    """Fraction of significantly-changed pixels between two base64 images.

    Both images are converted to grayscale and resized to 64×64 for
    speed, then compared pixel by pixel; a pixel counts as changed when
    its grayscale delta exceeds VERIFY_PIXEL_NOISE_FLOOR.

    Returns:
        A float from 0.0 (no pixel changed) to 1.0 (every pixel changed).
        If the comparison cannot be performed at all (PIL missing,
        undecodable input, ...) the result is 1.0: the caller treats a
        value below VERIFY_DIFF_THRESHOLD as "nothing happened" and fires
        a second, nudged click, so an unknown diff must NOT look like
        "unchanged".
    """
    try:
        from PIL import Image
        import base64 as _b64
        ima = Image.open(io.BytesIO(_b64.b64decode(b64_a))).convert('L').resize((64, 64))
        imb = Image.open(io.BytesIO(_b64.b64decode(b64_b))).convert('L').resize((64, 64))
        ba = ima.tobytes()
        bb = imb.tobytes()
        n = len(ba)
        if n == 0:
            return 0.0
        # Per-pixel noise floor absorbs JPEG-compression noise on
        # unchanged regions.
        changed = sum(
            abs(a - b) > VERIFY_PIXEL_NOISE_FLOOR for a, b in zip(ba, bb))
        return changed / n
    except Exception:
        # Conservative: an unmeasurable diff is reported as "changed" so
        # it can never trigger a spurious nudge click.
        return 1.0

523 

524 

def _execute_inprocess(action: dict) -> dict:
    """Execute one VLM action locally (pyautogui, file I/O, or shell).

    Args:
        action: dict with 'action' plus whatever that action needs:
            'coordinate' ([x, y] screen pixels) for pointer actions,
            'text' / 'value' for keyboard actions, 'path' / 'content' /
            'source_path' / 'destination_path' for file actions,
            'duration' for 'wait', 'command' for 'shell'.
    Returns:
        dict with 'output' and, on failure, 'error'. The 'shell' and
        non-Windows 'open_file_gui' actions also set 'status' to 'ok' or
        'error'. Exceptions raised while executing are caught, logged,
        and returned as ``{'output': '', 'error': str(exc)}``.
    """
    act = action.get('action', '')
    coord = action.get('coordinate')
    text = action.get('text', action.get('value', ''))

    # Validate coordinate format (VLM output can be malformed)
    if coord is not None:
        if not isinstance(coord, (list, tuple)) or len(coord) < 2:
            return {'output': '', 'error': f'Invalid coordinate format: {coord}'}

    # File/wait/shell actions don't need pyautogui
    _NO_GUI_ACTIONS = {
        'list_folders_and_files', 'read_file_and_understand', 'write_file',
        'Open_file_and_copy_paste', 'open_file_gui', 'wait', 'shell',
    }

    if act not in _NO_GUI_ACTIONS and pyautogui is None:
        return {'output': '', 'error': 'pyautogui not installed'}

    try:
        if act == 'left_click':
            if coord:
                pyautogui.click(coord[0], coord[1])
            return {'output': f'Clicked at {coord}'}

        elif act == 'right_click':
            if coord:
                pyautogui.rightClick(coord[0], coord[1])
            return {'output': f'Right-clicked at {coord}'}

        elif act == 'double_click':
            if coord:
                pyautogui.doubleClick(coord[0], coord[1])
            return {'output': f'Double-clicked at {coord}'}

        elif act == 'middle_click':
            if coord:
                pyautogui.middleClick(coord[0], coord[1])
            return {'output': f'Middle-clicked at {coord}'}

        elif act == 'hover' or act == 'mouse_move':
            if coord:
                pyautogui.moveTo(coord[0], coord[1])
            return {'output': f'Moved to {coord}'}

        elif act == 'type':
            if text:
                # Use clipboard for reliability (same as OmniParser)
                if pyperclip is not None:
                    pyperclip.copy(text)
                    # Paste is Cmd+V on macOS and Ctrl+V elsewhere; a
                    # hard-coded Ctrl+V does not paste on a Mac.
                    paste_modifier = (
                        'command' if sys.platform == 'darwin' else 'ctrl')
                    pyautogui.hotkey(paste_modifier, 'v')
                else:
                    pyautogui.typewrite(text, interval=0.012)
            return {'output': f'Typed: {text[:50]}...'}

        elif act == 'key':
            if text:
                pyautogui.press(text)
            return {'output': f'Pressed key: {text}'}

        elif act == 'hotkey':
            if text:
                # Accept either a list of keys or a 'ctrl+shift+s' string.
                if isinstance(text, list):
                    keys = [str(k).strip() for k in text]
                else:
                    keys = [k.strip() for k in str(text).split('+')]
                pyautogui.hotkey(*keys)
            return {'output': f'Hotkey: {text}'}

        elif act == 'left_click_drag':
            start = action.get('startCoordinate', coord)
            end = action.get('endCoordinate', action.get('coordinate_end'))
            if start and end:
                pyautogui.moveTo(start[0], start[1])
                # drag() takes a relative offset, hence end - start.
                pyautogui.drag(end[0] - start[0], end[1] - start[1], duration=0.5)
            return {'output': f'Dragged from {start} to {end}'}

        elif act == 'screenshot':
            return {'output': 'Screenshot taken', 'base64_image': take_screenshot('inprocess')}

        elif act == 'wait':
            wait_time = action.get('duration', 2)
            time.sleep(wait_time)
            return {'output': f'Waited {wait_time}s'}

        elif act == 'cursor_position':
            pos = pyautogui.position()
            return {'output': f'Cursor at ({pos.x}, {pos.y})'}

        elif act == 'list_folders_and_files':
            path = action.get('path', '.')
            try:
                entries = os.listdir(path)
                # Cap the listing at 100 entries.
                return {'output': '\n'.join(entries[:100])}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        elif act == 'read_file_and_understand':
            path = action.get('path', '')
            try:
                with open(path, 'r', encoding='utf-8', errors='replace') as f:
                    # Only the first 10000 characters are returned.
                    content = f.read(10000)
                return {'output': content}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        elif act == 'write_file':
            path = action.get('path', '')
            content = action.get('content', text)
            try:
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(content)
                return {'output': f'Written to {path}'}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        elif act == 'open_file_gui':
            # Open a file / app in the OS default handler. On Windows this is
            # os.startfile (uses ShellExecute). On Linux/Mac the equivalent is
            # `xdg-open` / `open`, which aren't available as a Python API —
            # route through the shell handler so the same denylist applies.
            path = action.get('path', '') or text
            if not path:
                return {'output': '', 'error': 'open_file_gui needs a path'}
            if sys.platform == 'win32':
                try:
                    os.startfile(path)  # type: ignore[attr-defined]
                    return {'output': f'Opened {path}'}
                except OSError as e:
                    return {'output': '', 'error': f'open_file_gui failed: {e}'}
            # Non-Windows: delegate to shell so we reuse the denylist.
            # The path comes from VLM output, so quote it: an unquoted
            # path breaks on spaces and lets shell metacharacters in the
            # "path" run as extra commands.
            import shlex
            opener = 'open' if sys.platform == 'darwin' else 'xdg-open'
            shell_cmd = f'{opener} {shlex.quote(str(path))}'
            from core.safe_hartos_attr import safe_hartos_attr
            _handle_shell_command_tool = safe_hartos_attr(
                '_handle_shell_command_tool')
            if _handle_shell_command_tool is None:
                logger.info(
                    "open_file_gui blocked: HARTOS _handle_shell_command_tool "
                    "not yet resolvable (loader still init). Failing closed "
                    "to preserve denylist guarantees.",
                )
                return {
                    'output': '',
                    'error': 'open_file_gui unavailable: HARTOS still loading',
                    'status': 'error',
                }
            result_text = _handle_shell_command_tool(shell_cmd)
            logger.info(
                "open_file_gui dispatched: cmd=%r exit_signature=%r",
                shell_cmd, (result_text or '')[:40],
            )
            ok = isinstance(result_text, str) and result_text.startswith('Exit code: 0')
            return {
                'output': result_text,
                'status': 'ok' if ok else 'error',
            }

        elif act == 'shell':
            # Deterministic command execution inside the VLM loop. The ONLY
            # implementation lives in hart_intelligence_entry._handle_shell_command_tool
            # so the denylist + timeout + truncation + shell-selector parsing all
            # apply identically to Shell_Command and this VLM-emitted action. If
            # that handler cannot be resolved, we fail CLOSED rather than
            # falling back to a bare subprocess.run — a bare fallback would
            # skip the denylist and expose a command injection channel that
            # silently weakens safety posture.
            cmd = action.get('command', text)
            if not cmd:
                return {'output': '', 'error': 'shell action needs command string'}
            from core.safe_hartos_attr import safe_hartos_attr
            _handle_shell_command_tool = safe_hartos_attr(
                '_handle_shell_command_tool')
            if _handle_shell_command_tool is None:
                logger.info(
                    "VLM shell action blocked: HARTOS "
                    "_handle_shell_command_tool not yet resolvable. "
                    "Failing closed (denylist unavailable) — cmd=%r",
                    (cmd or '')[:80],
                )
                return {
                    'output': '',
                    'error': (
                        "shell action unavailable: HARTOS still loading. "
                        "Refusing to run without the shared denylist."
                    ),
                    'status': 'error',
                }
            logger.info(
                "VLM shell action dispatching: cmd=%r",
                (cmd or '')[:80],
            )
            result_text = _handle_shell_command_tool(cmd)
            # _handle_shell_command_tool returns 'Exit code: N\n<body>' on
            # success and 'Shell_Command refused: ...' / 'Shell_Command error: ...'
            # on refusal or failure. Classify anything other than a clean
            # 'Exit code: 0' prefix as a non-success so the VLM loop's
            # consecutive-action-error counter can back off.
            ok = isinstance(result_text, str) and result_text.startswith('Exit code: 0')
            return {
                'output': result_text,
                'status': 'ok' if ok else 'error',
            }

        elif act == 'Open_file_and_copy_paste':
            src = action.get('source_path', '')
            dst = action.get('destination_path', '')
            try:
                # Text-mode copy: undecodable bytes are replaced on read.
                with open(src, 'r', encoding='utf-8', errors='replace') as f:
                    content = f.read()
                with open(dst, 'w', encoding='utf-8') as f:
                    f.write(content)
                return {'output': f'Copied {src} → {dst}'}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        else:
            return {'output': '', 'error': f'Unknown action: {act}'}

    except Exception as e:
        logger.error(f"Action execution error ({act}): {e}")
        return {'output': '', 'error': str(e)}

749 

750 

def _execute_http(action: dict) -> dict:
    """Forward the action to the local HTTP executor and return its reply.

    POSTs ``action`` as JSON to http://localhost:5001/execute with a
    30-second timeout and returns the decoded JSON body. Any failure
    (connection error, non-2xx status, undecodable body) is logged at
    error level and reported as ``{'output': '', 'error': <message>}``
    rather than raised.
    """
    endpoint = 'http://localhost:5001/execute'
    try:
        response = pooled_post(endpoint, json=action, timeout=30)
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:
        logger.error(f"HTTP action execution error: {exc}")
        return {'output': '', 'error': str(exc)}
    return payload