Coverage for integrations/vlm/local_computer

1"""

2local_computer_tool.py - Synchronous pyautogui/HTTP wrapper for VLM actions.

4Replaces OmniParser's Crossbar RPC-based ComputerTool with direct local execution.

5Supports same action types as OmniParser computer.py (key, type, left_click, etc.).

7Tier 'inprocess': direct pyautogui calls (no network)

8Tier 'http': HTTP to localhost:5001 (omnitool-gui Flask server)

9"""

11import os

12import io

13import sys

14import time

15import base64

16import logging

17from typing import Optional

# VLM screenshot long edge — aspect ratio is PRESERVED during resize.
# The old behavior (forced 1024×576) squished 16:10 screens into 16:9 and
# the VLM's vertical coordinates drifted accordingly. Qwen3-VL handles a
# 1280px long edge comfortably; longer gives better grounding, shorter is
# faster. HEVOLVE_VLM_IMG_LONG_EDGE lets callers tune this.
_DEFAULT_VLM_IMG_LONG_EDGE = 1280


def _positive_int_env(name: str, default: int) -> int:
    """Return environment variable *name* parsed as a positive integer.

    Falls back to *default* when the variable is unset. When it is set but
    is not an integer, or is not strictly positive, a warning is logged and
    *default* is returned, so a typo in the environment cannot crash this
    module at import time or shrink every screenshot to a single pixel.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        value = int(raw.strip())
    except ValueError:
        value = 0
    if value <= 0:
        # The module-level ``logger`` is created further down the file, so
        # look the logger up by name here.
        logging.getLogger('hevolve.vlm.computer_tool').warning(
            "Ignoring invalid %s=%r; using default %d", name, raw, default)
        return default
    return value


VLM_IMG_LONG_EDGE = _positive_int_env(
    'HEVOLVE_VLM_IMG_LONG_EDGE', _DEFAULT_VLM_IMG_LONG_EDGE)

# Legacy constants kept for backward compat with existing call sites
# and for tests that reference them. The *real* dimensions are computed
# per-screenshot from the actual screen aspect ratio.
VLM_IMG_W = VLM_IMG_LONG_EDGE
VLM_IMG_H = int(VLM_IMG_LONG_EDGE * 9 / 16)

31logger = logging.getLogger('hevolve.vlm.computer_tool')

33# Module-level imports for mockability (pyautogui is optional)

34try:

35 import pyautogui

36except ImportError:

37 pyautogui = None

40# Single source of truth for SetProcessDpiAwareness — see

41# core/dpi_awareness.py for the rationale (was duplicated in

42# remote_desktop/window_capture.py until 2026-05-03 DRY pass).

43from core.dpi_awareness import ensure_dpi_aware as _ensure_dpi_aware

45# Call at import time so every screenshot/click path is DPI-consistent

46_ensure_dpi_aware()

48try:

49 import pyperclip

50except ImportError:

51 pyperclip = None

53from core.http_pool import pooled_get, pooled_post

# Action vocabulary, mirroring the Action literal in OmniParser's
# computer.py. 'shell' is a Nunba extension: it lets the VLM loop run a
# deterministic command instead of GUI grounding when a task can be done
# programmatically (e.g. launching an app, or opening a file in its
# default handler).
SUPPORTED_ACTIONS = {
    # keyboard
    'key',
    'type',
    'hotkey',
    # pointer
    'mouse_move',
    'hover',
    'left_click',
    'left_click_drag',
    'right_click',
    'middle_click',
    'double_click',
    'cursor_position',
    # screen / timing
    'screenshot',
    'wait',
    # files
    'list_folders_and_files',
    'Open_file_and_copy_paste',
    'open_file_gui',
    'write_file',
    'read_file_and_understand',
    # command execution
    'shell',
}

def take_screenshot(tier: str) -> str:
    """
    Capture the screen and return it as a base64-encoded JPEG string.

    In the 'inprocess' tier the image is downscaled so its long edge is at
    most VLM_IMG_LONG_EDGE while PRESERVING aspect ratio, so the VLM's
    normalized coordinates map back to the physical screen without
    distortion. Images already within the limit are not resized. Screen
    DPI awareness is enabled at import (see _ensure_dpi_aware()).

    Args:
        tier: 'inprocess' captures via pyautogui directly; any other value
            fetches the screenshot over HTTP from localhost:5001.

    Returns:
        Base64-encoded JPEG screenshot string. In the HTTP tier this is the
        server's 'base64_image' (or 'image') field, or '' when the response
        has neither.

    Raises:
        ImportError: tier is 'inprocess' and pyautogui is not installed.
    """
    if tier == 'inprocess':
        if pyautogui is None:
            raise ImportError("pyautogui is required for in-process screenshots")
        img = pyautogui.screenshot()
        from PIL import Image

        w, h = img.size
        long_edge = max(w, h)
        if long_edge > VLM_IMG_LONG_EDGE:
            scale = VLM_IMG_LONG_EDGE / long_edge
            new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
            img = img.resize(new_size, Image.LANCZOS)

        # JPEG has no alpha channel: saving an RGBA (or other non-RGB)
        # capture raises "cannot write mode ... as JPEG", so normalise
        # the mode before encoding.
        if img.mode != 'RGB':
            img = img.convert('RGB')

        buf = io.BytesIO()
        img.save(buf, format='JPEG', quality=70)
        return base64.b64encode(buf.getvalue()).decode('ascii')

    resp = pooled_get('http://localhost:5001/screenshot', timeout=15)
    resp.raise_for_status()
    data = resp.json()
    return data.get('base64_image', data.get('image', ''))

103

104

def get_active_window_info():
    """Ask the OS which window/process currently has the foreground.

    Used to prevent the VLM misidentifying windows (e.g. Claude Code as
    MobaXterm).

    Returns:
        On Windows, ``"<ProcessName>: <MainWindowTitle>"`` from PowerShell;
        on Linux, the active window name reported by ``xdotool``; on macOS,
        the name of the frontmost process reported by ``osascript``.
        ``None`` when the probe fails, raises, or the platform is not one
        of those three. Every probe has a 3-second timeout.
    """
    try:
        import platform
        import subprocess
        from core.subprocess_safe import hidden_popen_kwargs

        system = platform.system()

        if system == 'Windows':
            powershell_expr = (
                '(Get-Process | Where-Object {$_.MainWindowHandle -eq '
                '(Add-Type -MemberDefinition \'[DllImport("user32.dll")] '
                'public static extern IntPtr GetForegroundWindow();\' '
                '-Name W -PassThru)::GetForegroundWindow()}).ProcessName + '
                '": " + (Get-Process | Where-Object {$_.MainWindowHandle -eq '
                '(Add-Type -MemberDefinition \'[DllImport("user32.dll")] '
                'public static extern IntPtr GetForegroundWindow();\' '
                '-Name W2 -PassThru)::GetForegroundWindow()}).MainWindowTitle'
            )
            # hidden_popen_kwargs() (CREATE_NO_WINDOW) keeps the powershell
            # child from popping a cmd console on every call — this fires
            # per VLM probe.
            probe = subprocess.run(
                ['powershell', '-Command', powershell_expr],
                capture_output=True, text=True, timeout=3,
                **hidden_popen_kwargs())
            # Windows only reports a result when there is non-empty output.
            if probe.returncode == 0:
                described = probe.stdout.strip()
                if described:
                    return described

        elif system == 'Linux':
            probe = subprocess.run(
                ['xdotool', 'getactivewindow', 'getwindowname'],
                capture_output=True, text=True, timeout=3)
            if probe.returncode == 0:
                return probe.stdout.strip()

        elif system == 'Darwin':
            applescript = ('tell application "System Events" to get name of '
                           'first process whose frontmost is true')
            probe = subprocess.run(
                ['osascript', '-e', applescript],
                capture_output=True, text=True, timeout=3)
            if probe.returncode == 0:
                return probe.stdout.strip()
    except Exception:
        # Best-effort probe: any failure just means "unknown".
        pass
    return None

144

145

#: Keyword pairs watched by the reasoning-mismatch detector, written as
#: ``(reasoning_substring, foreground_window_substring)``. An action is
#: flagged when the VLM's reasoning contains the first element but the
#: real foreground window description does not contain the second.
#: Patterns are tried in order, so keep the more specific ones first;
#: extend by appending tuples.
_REASONING_MISMATCH_PATTERNS = (
    ('mobaxt', 'mobaxt'),
    ('notepad', 'notepad'),
)

#: Phrases in the VLM's reasoning that suggest it is acting on a specific
#: window. The (slow) get_active_window_info probe only runs when one of
#: these appears, not for typing or generic mid-screen clicks.
_WINDOW_TARGETED_VERBS = (
    'minimize',
    'close',
    'switch to',
    'click on',
)

161

162

def _check_reasoning_mismatch(action: dict) -> Optional[str]:
    """Detect when the VLM's stated reasoning contradicts the actual
    foreground window.

    The reasoning text is read from ``action['Reasoning']``, falling back
    to ``action['reasoning']`` when the former is missing or empty. VLM
    output can be malformed, so a missing, ``None`` or non-string value is
    treated as "no reasoning" instead of raising — this check runs before
    every action and must never abort the dispatch.

    The (slow) foreground-window probe only runs when the reasoning
    contains one of ``_WINDOW_TARGETED_VERBS``. Patterns come from the
    module-level ``_REASONING_MISMATCH_PATTERNS``; adding a new pattern is
    one tuple append.

    Args:
        action: the VLM action dict.

    Returns:
        A human-readable mismatch description, or None when there is no
        detectable disagreement (including when the foreground window
        could not be determined).
    """
    raw = action.get('Reasoning') or action.get('reasoning') or ''
    if not isinstance(raw, str):
        return None
    reasoning = raw.lower()
    if not reasoning:
        return None
    if not any(verb in reasoning for verb in _WINDOW_TARGETED_VERBS):
        return None
    active = get_active_window_info()
    if not active:
        return None
    active_lower = active.lower()
    for reasoning_kw, window_kw in _REASONING_MISMATCH_PATTERNS:
        if reasoning_kw in reasoning and window_kw not in active_lower:
            return (f"VLM thinks {reasoning_kw.title()} but active window "
                    f"is: {active}")
    return None

189

190

def execute_action(action: dict, tier: str, *,
                   window_handle: int = None,
                   verify: bool = False,
                   if_occluded: str = 'skip',
                   safety: bool = False) -> dict:
    """
    Run one VLM-emitted action (click, type, key, ...) and return its result.

    Before dispatching, the VLM's reasoning is compared with the real
    foreground window; a disagreement is attached to the result as
    'window_mismatch' and logged (this guards against clicking the wrong
    app's taskbar icon). The keyword-only arguments were added by Phase 4
    and Phase 6 of vlm_best_of_all_worlds_plan.md and are all
    backward-compatible: callers passing only ``(action, tier)`` get the
    same behaviour as before.

    Args:
        action: dict with 'action' and optionally 'coordinate', 'text',
            'value', 'path', 'reasoning'. 'coordinate' is window-local
            0-1000 normalized space when ``window_handle`` is set, and
            screen pixels otherwise.
        tier: 'inprocess' runs the action locally; any other value posts
            it to the HTTP executor.
        window_handle: HWND from
            :func:`integrations.remote_desktop.window_capture.list_windows`.
            When set, ``coordinate`` is translated in place to current
            screen coordinates using a freshly-read window rect (so a
            window moved between capture and click is still hit).
        verify: when True (in-process tier only), a screenshot is taken
            before the action and diffed against one taken after; if
            nothing visibly changed the action is retried once with a
            nudge.
        if_occluded: policy for minimized / non-visible windows:
            ``'skip'`` (default) — return an early status result
            ``'foreground'`` — bring the window forward first, then act
            ``'force'`` — act regardless
        safety: opt-in safety layer. When True the action is checked by
            the session guard and the window blocklist before anything
            touches the screen, and audit records are written. Call sites
            that don't pass safety=True are unchanged.

    Returns:
        dict with 'output' and optionally 'error', 'window_mismatch',
        'status', 'window', the 'verify_*' keys, and 'safety_block'
        (when safety=True and a guard refused).
    """
    mismatch_note = _check_reasoning_mismatch(action)

    # Phase 4: per-window translation + occlusion handling. This may
    # rewrite action['coordinate'] in place, or hand back an early result
    # when the window cannot be acted on safely.
    window_meta = None
    if window_handle is not None:
        window_meta, early_result = _prepare_window_for_action(
            window_handle, action, if_occluded)
        if early_result is not None:
            if safety:
                _emit_audit(action, early_result, window_meta, None,
                            block_reason=early_result.get('status'))
            return early_result

    # Phase 6: guards run BEFORE any pyautogui call, so a refusal never
    # reaches the user's screen.
    if safety:
        refusal = _check_safety(window_meta)
        if refusal is not None:
            blocked = {
                'output': '', 'status': 'safety_blocked',
                'error': refusal, 'safety_block': refusal,
            }
            if window_meta is not None:
                blocked['window'] = window_meta
            _emit_audit(action, blocked, window_meta, None,
                        block_reason=refusal)
            return blocked

    # Phase 4: baseline screenshot for the verify=True diff.
    before_b64 = None
    if verify and tier == 'inprocess':
        try:
            before_b64 = take_screenshot('inprocess')
        except Exception as exc:
            logger.debug(f"verify pre-screenshot skipped: {exc}")

    run = _execute_inprocess if tier == 'inprocess' else _execute_http
    result = run(action)

    if mismatch_note:
        result['window_mismatch'] = mismatch_note
        logging.getLogger('hevolve.vlm').warning(
            f"[WINDOW-MISMATCH] {mismatch_note}")

    # Phase 4: expose window metadata so the loop's caller can audit.
    if window_meta is not None:
        result.setdefault('window', window_meta)

    # Phase 4: post-action verify (one nudge retry) on error-free results.
    succeeded = result.get('error') is None
    if before_b64 is not None and succeeded:
        result = _post_click_verify(
            action, result, before_b64,
            tier=tier, window_meta=window_meta)

    # Phase 6: count the action in the session guard and write the audit
    # record. Only successful (non-error) attempts are recorded here —
    # refusals were logged above and don't count against the session cap.
    # NOTE(review): the audit call sits inside this success-only branch,
    # following the comment above; confirm that attempts ending in
    # 'error' are meant to go unaudited.
    if safety and result.get('error') is None:
        try:
            from integrations.vlm.safety import get_session_guard
            get_session_guard().record()
        except Exception as exc:
            logger.debug(f"safety: session guard record failed: {exc}")
        _emit_audit(action, result, window_meta, before_b64)

    return result

311

312

313# ─── Phase 6 helper plumbing ──────────────────────────────────────────

314

def _check_safety(window_meta):
    """Consult the session guard, then the window blocklist.

    Args:
        window_meta: window metadata dict (or None) passed through to the
            blocklist check.

    Returns:
        The block-reason string from whichever check refuses first, or
        None when the action may proceed.
    """
    # NOTE(review): if the safety module cannot be imported this returns
    # None, i.e. the action is allowed. The shell action in this file
    # fails closed in the analogous situation — confirm fail-open is the
    # intended posture here.
    try:
        from integrations.vlm.safety import (
            get_session_guard, is_window_blocked)
    except Exception as exc:
        logger.debug(f"safety module unavailable: {exc}")
        return None

    session_refusal = get_session_guard().check()
    if session_refusal is None:
        return is_window_blocked(window_meta)
    return session_refusal

328

329

def _emit_audit(action, result, window_meta, screenshot_b64,
                block_reason=None):
    """Write one audit record, swallowing every failure.

    Auditing is best-effort: an unavailable safety module or a failing
    logger is reported at debug level and must NOT bubble up and break
    the action path.

    Args:
        action: the action dict that was attempted.
        result: the result dict produced (or the refusal dict).
        window_meta: window metadata dict, or None.
        screenshot_b64: screenshot to attach to the record, or None.
        block_reason: why the action was refused, or None.
    """
    try:
        from integrations.vlm.safety import get_audit_logger
        audit = get_audit_logger()
        audit.log(action, result,
                  window_meta=window_meta,
                  screenshot_b64=screenshot_b64,
                  block_reason=block_reason)
    except Exception as exc:
        logger.debug(f"audit log failed: {exc}")

342

343

344# ─── Phase 4 helpers (per-window translation + post-click verify) ────

345

346

def _prepare_window_for_action(window_handle: int, action: dict,
                               if_occluded: str):
    """Refresh the window's rect, decide if it can be acted on, and
    translate the action's window-local 0-1000 coords into screen pixels
    in place.

    Args:
        window_handle: HWND of the target window.
        action: the VLM action dict. When its 'coordinate' holds two
            numbers in the 0-1000 range, it is overwritten with screen
            pixels and '_translated_from' / '_translated_to' are added.
        if_occluded: 'skip' returns an early result for a minimized or
            non-visible window; 'foreground' and 'force' try to bring it
            forward first and then continue.

    Returns:
        ``(window_meta, early_result_or_None)``. When the second element
        is non-None, ``execute_action`` returns it immediately without
        touching pyautogui — the window can't be acted on safely.
    """
    try:
        from integrations.remote_desktop.window_capture import (
            WindowEnumerator, WindowInfo)
    except ImportError as e:
        logger.debug(f"window_capture unavailable: {e}")
        return None, {
            'output': '', 'status': 'window_capture_unavailable',
            'error': f'window_capture import failed: {e}',
        }

    enum = WindowEnumerator()
    fresh = enum.refresh_window_info(WindowInfo(
        hwnd=window_handle, title='', process_name='',
        pid=0, rect=(0, 0, 0, 0)))
    if fresh is None:
        return None, {
            'output': '', 'status': 'window_destroyed',
            'error': f'hwnd={window_handle} no longer exists',
        }
    wx, wy, ww, wh = fresh.rect
    if ww <= 0 or wh <= 0:
        return fresh.to_dict(), {
            'output': '', 'status': 'window_offscreen',
            'error': f'window rect collapsed to {fresh.rect}',
            'window': fresh.to_dict(),
        }
    # Occlusion / minimized handling per policy.
    needs_foreground = fresh.minimized or not fresh.visible
    if needs_foreground:
        if if_occluded == 'skip':
            return fresh.to_dict(), {
                'output': '', 'status': 'window_minimized',
                'error': 'window minimized; pass if_occluded="foreground" '
                         'to bring it forward first',
                'window': fresh.to_dict(),
            }
        if if_occluded in ('foreground', 'force'):
            _bring_foreground(window_handle)
    # Translate window-local 0-1000 normalized coords → screen pixels.
    coord = action.get('coordinate')
    if coord and isinstance(coord, (list, tuple)) and len(coord) >= 2:
        nx, ny = coord[0], coord[1]
        # VLM output can be malformed: only do arithmetic on real numbers
        # (bool is excluded even though it subclasses int). Anything else
        # is left untouched for the executor to reject.
        numeric = all(
            isinstance(v, (int, float)) and not isinstance(v, bool)
            for v in (nx, ny))
        if numeric and 0 <= nx <= 1000 and 0 <= ny <= 1000:
            # The rect is (x, y, width, height), so the last pixel inside
            # the window is at offset width-1 / height-1. Clamp so that
            # norm 1000 lands on that pixel rather than one past the edge.
            sx = wx + min(int(nx * ww / 1000), ww - 1)
            sy = wy + min(int(ny * wh / 1000), wh - 1)
            action['_translated_from'] = (nx, ny)
            action['coordinate'] = [sx, sy]
            action['_translated_to'] = (sx, sy)
        # Otherwise: out-of-range norm coords → caller passed screen
        # pixels; leave alone and let the action execute as-is.
    return fresh.to_dict(), None

410

411

def _bring_foreground(hwnd: int) -> None:
    """Restore and foreground a window so it can receive the click.

    Calls ShowWindow(SW_RESTORE) followed by SetForegroundWindow. This is
    a no-op on anything other than win32, and best-effort on Windows:
    the OS blocks SetForegroundWindow from non-foreground processes in
    many cases, so callers shouldn't assume it always works. Failures are
    logged at debug level and never raised.
    """
    if sys.platform == 'win32':
        try:
            import ctypes
            user32 = ctypes.windll.user32
            handle = int(hwnd)
            user32.ShowWindow(handle, 9)  # 9 == SW_RESTORE
            user32.SetForegroundWindow(handle)
            # SetForegroundWindow is async; pause briefly so the click
            # doesn't arrive before the new foreground window is
            # composited.
            time.sleep(0.10)
        except Exception as exc:
            logger.debug(f"bring-foreground hwnd={hwnd} failed: {exc}")

429

430

# Tunables for _post_click_verify, given names so that reviewers (and
# tests) don't have to guess what 0.005 / 16 / 50 stand for.

#: Changed-pixel fraction below which the screen counts as "unchanged",
#: which triggers the nudge retry. 0.5% absorbs JPEG noise on a static
#: frame and small cursor sprites without false-triggering on real UI
#: updates (button press → dialog → > 5%).
VERIFY_DIFF_THRESHOLD: float = 0.005

#: Per-pixel grayscale delta a pixel must exceed to count as "changed".
#: Chosen to absorb JPEG-quality-70 quantization noise (typically < 8).
VERIFY_PIXEL_NOISE_FLOOR: int = 16

#: Distance, in screen pixels, by which the click is nudged on a
#: no-change retry. Half a typical button width — high enough to escape
#: a missed edge, low enough to stay inside the same UI element.
VERIFY_NUDGE_PX: int = 50

447

448

def _post_click_verify(action: dict, result: dict, pre_b64: str, *,
                       tier: str, window_meta: dict = None) -> dict:
    """Take a post-action screenshot, diff it against the pre-action one,
    and if no visible change occurred retry the action once, nudged
    VERIFY_NUDGE_PX pixels to the right.

    Args:
        action: the action that was just executed.
        result: its result dict; annotated in place and returned.
        pre_b64: base64 screenshot taken before the action.
        tier: 'inprocess' or the HTTP tier; used for both the
            post-screenshot and the retry.
        window_meta: accepted for the caller's convenience; not used here.

    Returns:
        ``result`` with these keys added:
        'verify_diff' — changed-pixel fraction rounded to 3 places, or
            None when the post-screenshot failed;
        'verify_retried' — whether a nudged retry was issued;
        'verify_error' — only when the post-screenshot failed;
        'verify_nudge_to' — the nudged coordinate, only when retried;
        'verify_retry_error' — only when the nudged retry itself failed.
    """
    try:
        time.sleep(0.20)  # let the GUI settle before re-snapshot
        post_b64 = take_screenshot(tier)
    except Exception as e:
        # Surface the failure loudly — verification is a contract,
        # not a courtesy. WARNING (not debug) so users notice when
        # the screenshot path is broken; downstream callers can read
        # verify_error and decide whether to trust the action result.
        logger.warning(
            f"verify post-screenshot failed - cannot detect no-op clicks "
            f"this iteration: {e}")
        result['verify_diff'] = None
        result['verify_error'] = f'post-screenshot failed: {e}'
        result['verify_retried'] = False
        return result

    diff = _quick_image_diff(pre_b64, post_b64)
    result['verify_diff'] = round(diff, 3)
    result['verify_retried'] = False
    if diff >= VERIFY_DIFF_THRESHOLD:
        return result

    # No visible change — try one nudge. Only possible for actions that
    # carry a coordinate.
    coord = action.get('coordinate')
    if not (coord and isinstance(coord, (list, tuple)) and len(coord) >= 2):
        return result

    nudged = [int(coord[0]) + VERIFY_NUDGE_PX, int(coord[1])]
    nudged_action = dict(action, coordinate=nudged)
    logger.info(
        f"verify: no visible change after click @ {coord}; "
        f"retrying with {VERIFY_NUDGE_PX}-px nudge → {nudged}")

    # The executors report failure through an 'error' key rather than by
    # raising, so read the returned dict as well as catching exceptions;
    # otherwise a failed retry would be indistinguishable from a good one.
    retry_error = None
    try:
        executor = _execute_inprocess if tier == 'inprocess' else _execute_http
        retry_result = executor(nudged_action)
        if isinstance(retry_result, dict):
            retry_error = retry_result.get('error')
    except Exception as e:
        logger.debug(f"verify-retry failed: {e}")
        retry_error = str(e)

    result['verify_retried'] = True
    result['verify_nudge_to'] = nudged
    if retry_error:
        result['verify_retry_error'] = retry_error
    return result

497

498

def _quick_image_diff(b64_a: str, b64_b: str) -> float:
    """Fraction of significantly-changed pixels between two base64 images.

    Both images are converted to grayscale and downsized to 64×64 for
    speed (each image → 4096 bytes → 4096 cheap subtractions). A pixel
    counts as changed when its grayscale delta exceeds
    VERIFY_PIXEL_NOISE_FLOOR.

    Returns:
        0.0 (identical) to 1.0 (every pixel changed). When the images
        cannot be decoded or compared, 1.0 is returned: the caller issues
        a second, nudged click whenever the value is below its
        threshold, so an unknown result must read as "changed" to avoid
        a spurious click on the user's screen.
    """
    try:
        from PIL import Image
        import base64 as _b64
        ima = Image.open(io.BytesIO(_b64.b64decode(b64_a))).convert('L').resize((64, 64))
        imb = Image.open(io.BytesIO(_b64.b64decode(b64_b))).convert('L').resize((64, 64))
        ba = ima.tobytes()
        bb = imb.tobytes()
        n = len(ba)
        if n == 0:
            return 0.0
        # Per-pixel noise floor absorbs JPEG-compression noise on
        # unchanged regions.
        changed = sum(1 for a, b in zip(ba, bb)
                      if abs(a - b) > VERIFY_PIXEL_NOISE_FLOOR)
        return changed / n
    except Exception:
        # Conservative: a value below the caller's threshold would
        # trigger a nudge retry, so report "changed" when we can't tell.
        return 1.0

523

524

def _execute_inprocess(action: dict) -> dict:
    """Execute one action locally: pyautogui for GUI actions, plain file
    I/O for file actions, and the shared HARTOS shell tool for 'shell'
    and non-Windows 'open_file_gui'.

    An action that cannot be carried out because a required argument is
    missing (no text for 'type', no coordinate for 'mouse_move', ...)
    returns an 'error' instead of a success message for a no-op. Click
    actions without a coordinate click at the current cursor position.

    Args:
        action: dict with 'action' plus, depending on the action,
            'coordinate', 'text' (or 'value'), 'path', 'content',
            'command', 'duration', 'startCoordinate', 'endCoordinate'
            (or 'coordinate_end'), 'source_path', 'destination_path'.

    Returns:
        dict with 'output' and, on failure, 'error'. Shell-backed actions
        also set 'status' to 'ok' or 'error'; 'screenshot' adds
        'base64_image'. Exceptions raised while executing are caught,
        logged and returned as 'error'.
    """
    act = action.get('action', '')
    coord = action.get('coordinate')
    text = action.get('text', action.get('value', ''))

    # Validate coordinate format (VLM output can be malformed)
    if coord is not None:
        if not isinstance(coord, (list, tuple)) or len(coord) < 2:
            return {'output': '', 'error': f'Invalid coordinate format: {coord}'}

    # File/wait/shell actions don't need pyautogui
    _NO_GUI_ACTIONS = {
        'list_folders_and_files', 'read_file_and_understand', 'write_file',
        'Open_file_and_copy_paste', 'open_file_gui', 'wait', 'shell',
    }

    # action name -> (pyautogui function name, verb used in the output)
    _CLICK_ACTIONS = {
        'left_click': ('click', 'Clicked'),
        'right_click': ('rightClick', 'Right-clicked'),
        'double_click': ('doubleClick', 'Double-clicked'),
        'middle_click': ('middleClick', 'Middle-clicked'),
    }

    if act not in _NO_GUI_ACTIONS and pyautogui is None:
        return {'output': '', 'error': 'pyautogui not installed'}

    try:
        if act in _CLICK_ACTIONS:
            fn_name, verb = _CLICK_ACTIONS[act]
            click = getattr(pyautogui, fn_name)
            if coord:
                click(coord[0], coord[1])
                return {'output': f'{verb} at {coord}'}
            # No coordinate: click wherever the cursor already is,
            # rather than skipping the click and still reporting success.
            click()
            return {'output': f'{verb} at current cursor position'}

        elif act == 'hover' or act == 'mouse_move':
            if not coord:
                return {'output': '', 'error': f'{act} action needs a coordinate'}
            pyautogui.moveTo(coord[0], coord[1])
            return {'output': f'Moved to {coord}'}

        elif act == 'type':
            if not text:
                return {'output': '', 'error': 'type action needs text'}
            # Use clipboard for reliability (same as OmniParser)
            if pyperclip is not None:
                pyperclip.copy(text)
                # Paste is Cmd+V on macOS and Ctrl+V elsewhere.
                paste_modifier = 'command' if sys.platform == 'darwin' else 'ctrl'
                pyautogui.hotkey(paste_modifier, 'v')
            else:
                pyautogui.typewrite(text, interval=0.012)
            return {'output': f'Typed: {text[:50]}...'}

        elif act == 'key':
            if not text:
                return {'output': '', 'error': 'key action needs a key name'}
            pyautogui.press(text)
            return {'output': f'Pressed key: {text}'}

        elif act == 'hotkey':
            if not text:
                return {'output': '', 'error': 'hotkey action needs keys'}
            if isinstance(text, list):
                keys = [str(k).strip() for k in text]
            else:
                keys = [k.strip() for k in str(text).split('+')]
            pyautogui.hotkey(*keys)
            return {'output': f'Hotkey: {text}'}

        elif act == 'left_click_drag':
            start = action.get('startCoordinate', coord)
            end = action.get('endCoordinate', action.get('coordinate_end'))
            if not (start and end):
                return {
                    'output': '',
                    'error': 'left_click_drag needs start and end coordinates',
                }
            pyautogui.moveTo(start[0], start[1])
            pyautogui.drag(end[0] - start[0], end[1] - start[1], duration=0.5)
            return {'output': f'Dragged from {start} to {end}'}

        elif act == 'screenshot':
            return {'output': 'Screenshot taken', 'base64_image': take_screenshot('inprocess')}

        elif act == 'wait':
            wait_time = action.get('duration', 2)
            time.sleep(wait_time)
            return {'output': f'Waited {wait_time}s'}

        elif act == 'cursor_position':
            pos = pyautogui.position()
            return {'output': f'Cursor at ({pos.x}, {pos.y})'}

        elif act == 'list_folders_and_files':
            path = action.get('path', '.')
            try:
                entries = os.listdir(path)
                # Cap the listing at 100 entries.
                return {'output': '\n'.join(entries[:100])}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        elif act == 'read_file_and_understand':
            path = action.get('path', '')
            try:
                with open(path, 'r', encoding='utf-8', errors='replace') as f:
                    # Read at most the first 10000 characters.
                    content = f.read(10000)
                return {'output': content}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        elif act == 'write_file':
            path = action.get('path', '')
            content = action.get('content', text)
            try:
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(content)
                return {'output': f'Written to {path}'}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        elif act == 'open_file_gui':
            # Open a file / app in the OS default handler. On Windows this is
            # os.startfile (uses ShellExecute). On Linux/Mac the equivalent is
            # `xdg-open` / `open`, which aren't available as a Python API —
            # route through the shell handler so the same denylist applies.
            path = action.get('path', '') or text
            if not path:
                return {'output': '', 'error': 'open_file_gui needs a path'}
            if sys.platform == 'win32':
                try:
                    os.startfile(path)  # type: ignore[attr-defined]
                    return {'output': f'Opened {path}'}
                except OSError as e:
                    return {'output': '', 'error': f'open_file_gui failed: {e}'}
            # Non-Windows: delegate to shell so we reuse the denylist.
            # The path comes from VLM output, so quote it: unquoted, a
            # path with spaces breaks and one with shell metacharacters
            # injects extra commands. Expand ~ and $VARS first, because
            # quoting stops the shell from doing so. Paths without
            # special characters produce the same command as before.
            import shlex
            opener = 'open' if sys.platform == 'darwin' else 'xdg-open'
            target = os.path.expandvars(os.path.expanduser(str(path)))
            shell_cmd = f'{opener} {shlex.quote(target)}'
            from core.safe_hartos_attr import safe_hartos_attr
            _handle_shell_command_tool = safe_hartos_attr(
                '_handle_shell_command_tool')
            if _handle_shell_command_tool is None:
                logger.info(
                    "open_file_gui blocked: HARTOS _handle_shell_command_tool "
                    "not yet resolvable (loader still init). Failing closed "
                    "to preserve denylist guarantees.",
                )
                return {
                    'output': '',
                    'error': 'open_file_gui unavailable: HARTOS still loading',
                    'status': 'error',
                }
            result_text = _handle_shell_command_tool(shell_cmd)
            logger.info(
                "open_file_gui dispatched: cmd=%r exit_signature=%r",
                shell_cmd, (result_text or '')[:40],
            )
            ok = isinstance(result_text, str) and result_text.startswith('Exit code: 0')
            return {
                'output': result_text,
                'status': 'ok' if ok else 'error',
            }

        elif act == 'shell':
            # Deterministic command execution inside the VLM loop. The ONLY
            # implementation lives in hart_intelligence_entry._handle_shell_command_tool
            # so the denylist + timeout + truncation + shell-selector parsing all
            # apply identically to Shell_Command and this VLM-emitted action. If
            # that import fails (stripped frozen build / circular import), we
            # fail CLOSED rather than falling back to a bare subprocess.run —
            # a bare fallback would skip the denylist and expose a command
            # injection channel that silently weakens safety posture.
            cmd = action.get('command', text)
            if not cmd:
                return {'output': '', 'error': 'shell action needs command string'}
            from core.safe_hartos_attr import safe_hartos_attr
            _handle_shell_command_tool = safe_hartos_attr(
                '_handle_shell_command_tool')
            if _handle_shell_command_tool is None:
                logger.info(
                    "VLM shell action blocked: HARTOS "
                    "_handle_shell_command_tool not yet resolvable. "
                    "Failing closed (denylist unavailable) — cmd=%r",
                    (cmd or '')[:80],
                )
                return {
                    'output': '',
                    'error': (
                        "shell action unavailable: HARTOS still loading. "
                        "Refusing to run without the shared denylist."
                    ),
                    'status': 'error',
                }
            logger.info(
                "VLM shell action dispatching: cmd=%r",
                (cmd or '')[:80],
            )
            result_text = _handle_shell_command_tool(cmd)
            # _handle_shell_command_tool returns 'Exit code: N\n<body>' on
            # success and 'Shell_Command refused: ...' / 'Shell_Command error: ...'
            # on refusal or failure. Classify anything other than a clean
            # 'Exit code: 0' prefix as a non-success so the VLM loop's
            # consecutive-action-error counter can back off.
            # NOTE(review): a failed command sets status='error' but no
            # 'error' key, while execute_action decides success via
            # result.get('error') — confirm this asymmetry is intended.
            ok = isinstance(result_text, str) and result_text.startswith('Exit code: 0')
            return {
                'output': result_text,
                'status': 'ok' if ok else 'error',
            }

        elif act == 'Open_file_and_copy_paste':
            src = action.get('source_path', '')
            dst = action.get('destination_path', '')
            try:
                with open(src, 'r', encoding='utf-8', errors='replace') as f:
                    content = f.read()
                with open(dst, 'w', encoding='utf-8') as f:
                    f.write(content)
                return {'output': f'Copied {src} → {dst}'}
            except OSError as e:
                return {'output': '', 'error': str(e)}

        else:
            return {'output': '', 'error': f'Unknown action: {act}'}

    except Exception as e:
        logger.error(f"Action execution error ({act}): {e}")
        return {'output': '', 'error': str(e)}

749

750

def _execute_http(action: dict) -> dict:
    """Execute an action by POSTing it as JSON to localhost:5001/execute.

    Args:
        action: the VLM action dict, sent as the request body.

    Returns:
        The server's JSON response when it is a dict. Any failure (network
        error, non-2xx status, invalid JSON, or a JSON payload that is not
        an object) is logged and returned as
        ``{'output': '', 'error': <message>}`` — callers read and annotate
        the result as a dict, so a list or scalar payload must not be
        passed through.
    """
    try:
        resp = pooled_post(
            'http://localhost:5001/execute',
            json=action,
            timeout=30
        )
        resp.raise_for_status()
        payload = resp.json()
    except Exception as e:
        logger.error(f"HTTP action execution error: {e}")
        return {'output': '', 'error': str(e)}

    if not isinstance(payload, dict):
        message = f'Unexpected /execute response type: {type(payload).__name__}'
        logger.error(f"HTTP action execution error: {message}")
        return {'output': '', 'error': message}
    return payload

Coverage for integrations / vlm / local_computer_tool.py: 68.0%

331 statements