Coverage for integrations / remote_desktop / window_capture.py: 35.1%

655 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Window Capture — Per-window enumeration and frame capture for tab-detach streaming. 

3 

4Instead of full-screen capture, this module captures individual OS windows so each 

5remote application (Notepad, CMD, etc.) can be streamed as a separate session. 

6 

7Backends (cross-platform, guarded imports): 

8 Windows: win32gui EnumWindows + GetWindowDC/BitBlt, fallback mss region 

9 Linux: Xlib _NET_CLIENT_LIST or xdotool, fallback mss region 

10 

11Follows FrameCapture contract: capture_frame() → JPEG bytes, capture_loop() → generator. 

12 

13Reuses: 

14 - frame_capture.py: FrameConfig, _CaptureCircuitBreaker, _encode_pil_image pattern 

15 - frame_capture.py:54-63: compute_frame_difference() for skip-unchanged 

16""" 

17 

18import io 

19import logging 

20import os 

21import platform 

22import re 

23import subprocess 

24import threading 

25import time 

26from dataclasses import dataclass, field 

27from typing import Dict, Generator, List, Optional, Tuple 

28 

29logger = logging.getLogger('hevolve.remote_desktop') 

30 

31# ── Optional dependencies (guarded imports) ───────────────────── 

32 

33_mss = None 

34_PIL_Image = None 

35 

36try: 

37 import mss as _mss_module 

38 _mss = _mss_module 

39except ImportError: 

40 pass 

41 

42try: 

43 from PIL import Image as _PIL_Image_module 

44 _PIL_Image = _PIL_Image_module 

45except ImportError: 

46 pass 

47 

48# Windows-specific (win32gui, win32ui, win32con, win32api, win32process) 

49_win32gui = None 

50_win32ui = None 

51_win32con = None 

52_win32api = None 

53_win32process = None 

54 

55try: 

56 import win32gui as _win32gui_mod 

57 import win32ui as _win32ui_mod 

58 import win32con as _win32con_mod 

59 import win32api as _win32api_mod 

60 import win32process as _win32process_mod 

61 _win32gui = _win32gui_mod 

62 _win32ui = _win32ui_mod 

63 _win32con = _win32con_mod 

64 _win32api = _win32api_mod 

65 _win32process = _win32process_mod 

66except ImportError: 

67 pass 

68 

69# Linux-specific (Xlib) 

70_Xlib_display = None 

71try: 

72 from Xlib import display as _Xlib_display_mod 

73 _Xlib_display = _Xlib_display_mod 

74except ImportError: 

75 pass 

76 

77 

78# ── Data Structures ──────────────────────────────────────────── 

79 

@dataclass
class WindowInfo:
    """Snapshot of one top-level OS window.

    The first five fields are required. ``visible`` / ``minimized`` and
    the Phase-1 fields (``z_order`` onwards, added for VLM occlusion and
    multi-monitor support) all carry defaults, so callers that only know
    about the original fields keep working unchanged.
    """
    hwnd: int                          # Native handle: HWND on Windows, XID on Linux
    title: str
    process_name: str
    pid: int
    rect: Tuple[int, int, int, int]    # (x, y, width, height)
    visible: bool = True
    minimized: bool = False
    # ── Phase-1 additions (VLM occlusion + multi-monitor) ──
    z_order: int = 0                   # 0 = topmost; larger values are further back
    is_foreground: bool = False        # True when this is the active window
    is_occluded: bool = False          # True when more than 5% of rect is covered
    occluded_pct: float = 0.0          # 0.0–100.0, share of rect area that is covered
    is_protected: bool = False         # DWM-cloaked (DRM, hidden on another virtual desktop)
    monitor_idx: int = -1              # Index into list_monitors(); -1 = unknown

    def to_dict(self) -> dict:
        """Serialise to a plain dict.

        ``rect`` is emitted as a list and ``occluded_pct`` is rounded to
        one decimal place; every other field is copied through as-is.
        Keys appear in field-declaration order.
        """
        field_names = (
            'hwnd', 'title', 'process_name', 'pid', 'rect',
            'visible', 'minimized', 'z_order', 'is_foreground',
            'is_occluded', 'occluded_pct', 'is_protected', 'monitor_idx',
        )
        payload = {name: getattr(self, name) for name in field_names}
        # Re-assigning an existing key keeps its position in the dict.
        payload['rect'] = list(self.rect)
        payload['occluded_pct'] = round(self.occluded_pct, 1)
        return payload

    @classmethod
    def from_dict(cls, d: dict) -> 'WindowInfo':
        """Build a WindowInfo from a dict such as ``to_dict()`` produces.

        ``hwnd`` and ``title`` are mandatory (``KeyError`` if absent);
        every other key falls back to the value listed below.
        """
        fallbacks = {
            'process_name': '',
            'pid': 0,
            'visible': True,
            'minimized': False,
            'z_order': 0,
            'is_foreground': False,
            'is_occluded': False,
            'occluded_pct': 0.0,
            'is_protected': False,
            'monitor_idx': -1,
        }
        optional = {key: d.get(key, default) for key, default in fallbacks.items()}
        return cls(
            hwnd=d['hwnd'],
            title=d['title'],
            rect=tuple(d.get('rect', (0, 0, 0, 0))),
            **optional,
        )

140 

141 

@dataclass
class WindowCaptureConfig:
    """Tunables for per-window capture (encoding and loop pacing)."""
    # ── Encoding ──
    quality: int = 80                    # JPEG quality (1-100) passed to the encoder
    scale_factor: float = 1.0            # Resize multiplier applied before encoding
    # ── Loop pacing ──
    max_fps: int = 30                    # Base capture interval is 1 / max_fps seconds
    min_change_threshold: float = 0.01   # Frames whose difference is below this are skipped
    keyframe_interval: int = 30          # Every Nth captured frame bypasses the skip check
    adaptive_interval: bool = True       # Grow the interval while frames are unchanged
    max_backoff_seconds: float = 2.0     # Upper bound for the grown interval

152 

153 

154# ── Window Enumerator ────────────────────────────────────────── 

155 

class WindowEnumerator:
    """Cross-platform enumeration of top-level application windows.

    Backends:
        Windows: ``win32gui.EnumWindows`` + ``win32gui.GetWindowText``
                 (only when pywin32 imported successfully).
        Linux:   ``xdotool`` first, python-xlib ``_NET_CLIENT_LIST`` as
                 a fallback.

    On any other platform, or on Windows without pywin32, enumeration
    returns an empty list and refresh returns ``None``.
    """

    def __init__(self):
        # Resolved once; every public method dispatches on it.
        self._system = platform.system()

    def list_windows(self, include_minimized: bool = False) -> List['WindowInfo']:
        """List visible application windows on the host.

        Args:
            include_minimized: Include minimized/iconic windows. Only the
                Windows backend can detect minimized windows; the Linux
                backends always report ``minimized=False``.

        Returns:
            One WindowInfo per window, or ``[]`` when no backend applies.
        """
        if self._system == 'Windows' and _win32gui:
            return self._list_windows_win32(include_minimized)
        elif self._system == 'Linux':
            return self._list_windows_linux(include_minimized)
        return []

    def get_window_by_title(self, title_pattern: str) -> Optional['WindowInfo']:
        """Find the first window whose title matches ``title_pattern``.

        The pattern is tried as a case-insensitive regular expression.
        If it is not a valid regex (for example ``"C++ [draft"``), it is
        matched as a literal, case-insensitive substring instead of
        raising ``re.error``.

        Returns:
            The first match in enumeration order (minimized windows
            included), or ``None``.
        """
        windows = self.list_windows(include_minimized=True)
        try:
            pattern = re.compile(title_pattern, re.IGNORECASE)
        except re.error:
            # Plain titles often contain regex metacharacters; honour the
            # documented "substring" contract rather than failing.
            pattern = re.compile(re.escape(title_pattern), re.IGNORECASE)
        for w in windows:
            if pattern.search(w.title):
                return w
        return None

    def get_window_by_pid(self, pid: int) -> Optional['WindowInfo']:
        """Return the first enumerated window owned by ``pid``, or ``None``."""
        windows = self.list_windows(include_minimized=True)
        for w in windows:
            if w.pid == pid:
                return w
        return None

    def refresh_window_info(self, window: 'WindowInfo') -> Optional['WindowInfo']:
        """Re-read a window's title/position/visibility from the OS.

        Returns:
            A new WindowInfo, or ``None`` when the window can no longer
            be queried or no backend applies.
        """
        if self._system == 'Windows' and _win32gui:
            return self._refresh_win32(window)
        elif self._system == 'Linux':
            return self._refresh_linux(window)
        return None

    # ── Windows backend ────────────────────────────────────────

    def _list_windows_win32(self, include_minimized: bool) -> List['WindowInfo']:
        """Enumerate windows via the Win32 API.

        The EnumWindows callback order is treated as top-to-bottom
        z-order: the arrival index among accepted windows becomes
        ``z_order`` (0 = topmost), and the same ordering feeds
        ``_compute_occlusion``. Windows that are invisible, untitled,
        zero-sized, or (unless requested) minimized are dropped.
        """
        results = []
        try:
            foreground_hwnd = _win32gui.GetForegroundWindow()
        except Exception:
            foreground_hwnd = 0

        # PID → process name cache for this enumeration. Multi-window
        # processes share one PID, so this avoids one OpenProcess +
        # QueryFullProcessImageName round-trip per window.
        process_name_cache: dict = {}

        def enum_callback(hwnd, _):
            if not _win32gui.IsWindowVisible(hwnd):
                return
            title = _win32gui.GetWindowText(hwnd)
            if not title:
                return

            minimized = bool(_win32gui.IsIconic(hwnd))
            if minimized and not include_minimized:
                return

            # Window rect; skip windows we cannot measure or that are empty.
            try:
                left, top, right, bottom = _win32gui.GetWindowRect(hwnd)
                width = right - left
                height = bottom - top
                if width <= 0 or height <= 0:
                    return
            except Exception:
                return

            # Process info is best-effort: a failure leaves pid/process_name
            # at whatever was resolved so far.
            pid = 0
            process_name = ''
            try:
                _, pid = _win32process.GetWindowThreadProcessId(hwnd)
                if pid in process_name_cache:
                    process_name = process_name_cache[pid]
                else:
                    process_name = self._get_process_name_win32(pid)
                    process_name_cache[pid] = process_name
            except Exception:
                pass

            # Phase-1 enrichment: z_order is the arrival index, and
            # is_protected flags DWM-cloaked windows so callers know the
            # capture may not show live content.
            results.append(WindowInfo(
                hwnd=hwnd,
                title=title,
                process_name=process_name,
                pid=pid,
                rect=(left, top, width, height),
                visible=True,
                minimized=minimized,
                z_order=len(results),
                is_foreground=(hwnd == foreground_hwnd),
                is_protected=_is_dwm_cloaked(hwnd),
            ))

        _win32gui.EnumWindows(enum_callback, None)
        # Occlusion and monitor assignment need the complete window list,
        # so they run as a second pass.
        _compute_occlusion(results)
        try:
            _assign_monitors(results, list_monitors())
        except Exception as e:
            logger.debug(f"Monitor assignment skipped: {e}")
        return results

    def _get_process_name_win32(self, pid: int) -> str:
        """Return the executable file name for ``pid`` on Windows.

        Best-effort: returns ``''`` when the process cannot be opened,
        the path query yields nothing, or ctypes/kernel32 is unavailable.
        """
        try:
            import ctypes
            kernel32 = ctypes.windll.kernel32
            PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
            handle = kernel32.OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION,
                                          False, pid)
            if handle:
                try:
                    import ctypes.wintypes
                    buf = ctypes.create_unicode_buffer(260)
                    size = ctypes.wintypes.DWORD(260)
                    kernel32.QueryFullProcessImageNameW(handle, 0,
                                                        buf, ctypes.byref(size))
                    # buf stays empty when the query fails.
                    full_path = buf.value
                    return os.path.basename(full_path) if full_path else ''
                finally:
                    kernel32.CloseHandle(handle)
        except Exception:
            pass
        return ''

    def _refresh_win32(self, window: 'WindowInfo') -> Optional['WindowInfo']:
        """Re-read title, rect, visibility and minimized state for a handle.

        ``process_name``, ``pid`` and the Phase-1 fields (``z_order``,
        ``is_foreground``, ``is_occluded``, ``occluded_pct``,
        ``is_protected``, ``monitor_idx``) are carried over from
        ``window`` rather than reset to defaults; they are NOT
        recomputed here, so they reflect the last full enumeration.

        Returns:
            A new WindowInfo, or ``None`` if the handle is no longer a
            window or any Win32 call fails.
        """
        hwnd = window.hwnd
        try:
            if not _win32gui.IsWindow(hwnd):
                return None
            title = _win32gui.GetWindowText(hwnd)
            visible = bool(_win32gui.IsWindowVisible(hwnd))
            minimized = bool(_win32gui.IsIconic(hwnd))
            left, top, right, bottom = _win32gui.GetWindowRect(hwnd)
            return WindowInfo(
                hwnd=hwnd,
                title=title,
                process_name=window.process_name,
                pid=window.pid,
                rect=(left, top, right - left, bottom - top),
                visible=visible,
                minimized=minimized,
                # getattr keeps duck-typed inputs without these fields
                # from being misreported as "window gone".
                z_order=getattr(window, 'z_order', 0),
                is_foreground=getattr(window, 'is_foreground', False),
                is_occluded=getattr(window, 'is_occluded', False),
                occluded_pct=getattr(window, 'occluded_pct', 0.0),
                is_protected=getattr(window, 'is_protected', False),
                monitor_idx=getattr(window, 'monitor_idx', -1),
            )
        except Exception:
            return None

    # ── Linux backend ──────────────────────────────────────────

    def _list_windows_linux(self, include_minimized: bool) -> List['WindowInfo']:
        """Enumerate windows via xdotool, falling back to python-xlib.

        Any failure of a backend is logged at debug level and the next
        backend is tried; ``[]`` is returned when both are unavailable.
        """
        try:
            return self._list_windows_xdotool(include_minimized)
        except Exception as e:
            logger.debug(f"xdotool window enumeration failed: {e}")

        if _Xlib_display:
            try:
                return self._list_windows_xlib(include_minimized)
            except Exception as e:
                logger.debug(f"Xlib window enumeration failed: {e}")

        return []

    def _list_windows_xdotool(self, include_minimized: bool) -> List['WindowInfo']:
        """Enumerate via ``xdotool search --name .`` plus per-window queries.

        Raises whatever ``subprocess.check_output`` raises when the
        search itself fails (missing binary, non-zero exit, timeout).
        """
        output = subprocess.check_output(
            ['xdotool', 'search', '--name', '.'],
            timeout=5,
            text=True,
        )
        results = []
        for line in output.strip().split('\n'):
            xid_str = line.strip()
            if not xid_str:
                continue
            try:
                xid = int(xid_str)
            except ValueError:
                continue

            info = self._get_xdotool_window_info(xid, include_minimized)
            if info:
                results.append(info)
        return results

    def _get_xdotool_window_info(self, xid: int,
                                 include_minimized: bool) -> Optional['WindowInfo']:
        """Build a WindowInfo for one XID from xdotool/ps output.

        ``include_minimized`` is accepted for signature parity but not
        consulted: this backend does not detect minimized windows and
        always reports ``minimized=False``.

        Returns:
            ``None`` when the name or geometry query fails, the window is
            unnamed, or its width/height is not positive. PID and process
            name are best-effort and default to ``0`` / ``''``.
        """
        try:
            name_out = subprocess.check_output(
                ['xdotool', 'getwindowname', str(xid)],
                timeout=2, text=True,
            ).strip()
        except Exception:
            return None

        if not name_out:
            return None

        try:
            geo_out = subprocess.check_output(
                ['xdotool', 'getwindowgeometry', '--shell', str(xid)],
                timeout=2, text=True,
            )
        except Exception:
            return None

        # Parse KEY=VALUE lines (X, Y, WIDTH, HEIGHT, ...). A value that
        # is not an integer is skipped instead of raising, so one odd
        # line cannot abort the whole enumeration or a refresh.
        geo = {}
        for gline in geo_out.strip().split('\n'):
            key, sep, value = gline.partition('=')
            if not sep:
                continue
            try:
                geo[key.strip()] = int(value.strip())
            except ValueError:
                continue

        x = geo.get('X', 0)
        y = geo.get('Y', 0)
        w = geo.get('WIDTH', 0)
        h = geo.get('HEIGHT', 0)
        if w <= 0 or h <= 0:
            return None

        # Owning PID (best-effort).
        pid = 0
        try:
            pid_out = subprocess.check_output(
                ['xdotool', 'getwindowpid', str(xid)],
                timeout=2, text=True,
            ).strip()
            pid = int(pid_out)
        except Exception:
            pass

        # Process name from PID (best-effort).
        process_name = ''
        if pid:
            try:
                process_name = subprocess.check_output(
                    ['ps', '-p', str(pid), '-o', 'comm='],
                    timeout=2, text=True,
                ).strip()
            except Exception:
                pass

        return WindowInfo(
            hwnd=xid,
            title=name_out,
            process_name=process_name,
            pid=pid,
            rect=(x, y, w, h),
            visible=True,
            minimized=False,
        )

    def _list_windows_xlib(self, include_minimized: bool) -> List['WindowInfo']:
        """Enumerate via python-xlib using the root ``_NET_CLIENT_LIST``.

        The display connection is always closed, including when a call
        raises. Windows without a name, or whose properties cannot be
        read, are skipped. ``process_name`` is left empty and
        ``include_minimized`` is not consulted by this backend.
        """
        disp = _Xlib_display.Display()
        try:
            root = disp.screen().root
            client_list_atom = disp.intern_atom('_NET_CLIENT_LIST')
            # Interned once instead of once per window.
            pid_atom = disp.intern_atom('_NET_WM_PID')

            prop = root.get_full_property(client_list_atom, 0)
            if not prop:
                return []

            results = []
            for xid in prop.value:
                try:
                    win = disp.create_resource_object('window', xid)
                    name = win.get_wm_name() or ''
                    if not name:
                        continue
                    geo = win.get_geometry()
                    pid = 0
                    pid_prop = win.get_full_property(pid_atom, 0)
                    if pid_prop:
                        pid = pid_prop.value[0]

                    results.append(WindowInfo(
                        hwnd=xid,
                        title=name,
                        process_name='',
                        pid=pid,
                        rect=(geo.x, geo.y, geo.width, geo.height),
                        visible=True,
                        minimized=False,
                    ))
                except Exception:
                    continue
            return results
        finally:
            disp.close()

    def _refresh_linux(self, window: 'WindowInfo') -> Optional['WindowInfo']:
        """Refresh window info on Linux by re-querying xdotool for the XID."""
        return self._get_xdotool_window_info(window.hwnd, True)

492 

493 

494# ── Per-Window Frame Capture ────────────────────────────────── 

495 

class WindowCapture:
    """Capture a specific window (not the full screen).

    Follows the FrameCapture contract: ``capture_frame()`` returns
    encoded image bytes and ``capture_loop()`` is a generator of them.

    Windows: tries Win32 GDI (GetWindowDC + BitBlt) first.
    Linux/fallback: mss region capture of the window's last known rect,
    which only shows the window if it is actually visible there.
    """

    def __init__(self, window_info: 'WindowInfo',
                 config: Optional['WindowCaptureConfig'] = None):
        self._window = window_info
        self.config = config or WindowCaptureConfig()
        self._running = False
        self._last_frame: Optional[bytes] = None
        self._frame_count = 0
        self._mss_instance = None          # Created lazily by _capture_mss_region
        self._system = platform.system()

    @property
    def window_info(self) -> 'WindowInfo':
        """The WindowInfo currently held (no OS refresh)."""
        return self._window

    def capture_frame(self) -> Optional[bytes]:
        """Capture a single frame of this window.

        Tries the Win32 backend (Windows with pywin32 only), then the
        mss region backend. Backend exceptions are logged at debug level
        and treated as "no frame".

        Returns:
            Encoded image bytes, or ``None`` when every backend failed.
        """
        if self._system == 'Windows' and _win32gui:
            try:
                frame = self._capture_win32()
                if frame:
                    return frame
            except Exception as e:
                logger.debug(f"Win32 window capture failed: {e}")

        # Fallback: mss region capture (window must be visible).
        if _mss:
            try:
                frame = self._capture_mss_region()
                if frame:
                    return frame
            except Exception as e:
                logger.debug(f"MSS region capture failed: {e}")

        return None

    def capture_loop(self) -> Generator[bytes, None, None]:
        """Yield frames of this window until ``stop()`` is called.

        Pacing is ``1 / config.max_fps`` seconds per iteration. A
        non-positive ``max_fps`` is treated as 1 fps instead of raising
        ``ZeroDivisionError`` (or producing a negative sleep).

        Frames whose difference from the last yielded frame is below
        ``config.min_change_threshold`` are skipped, except every
        ``config.keyframe_interval``-th captured frame, which is always
        yielded. ``keyframe_interval == 0`` disables forced keyframes
        instead of raising. While frames are being skipped and
        ``config.adaptive_interval`` is set, the wait grows by 1.5x per
        skip up to ``config.max_backoff_seconds``.

        Capture resources are released when the generator finishes or
        is closed.
        """
        self._running = True
        max_fps = self.config.max_fps
        interval = 1.0 / (max_fps if max_fps > 0 else 1)
        adaptive_interval = interval
        self._frame_count = 0

        try:
            from integrations.vision.frame_store import compute_frame_difference
        except ImportError:
            # Minimal stand-in: mean absolute byte difference over the
            # first 1000 bytes; different lengths count as fully changed.
            def compute_frame_difference(f1, f2):
                if len(f1) != len(f2):
                    return 1.0
                diff = sum(abs(a - b) for a, b in zip(f1[:1000], f2[:1000]))
                return min(diff / (255 * min(len(f1), 1000)), 1.0)

        try:
            while self._running:
                start = time.monotonic()

                frame = self.capture_frame()
                if frame is None:
                    time.sleep(interval)
                    continue

                self._frame_count += 1

                # Skip unchanged frames (unless this one is a keyframe).
                keyframe_every = self.config.keyframe_interval
                is_keyframe = (keyframe_every != 0 and
                               self._frame_count % keyframe_every == 0)
                if self._last_frame and not is_keyframe:
                    try:
                        diff = compute_frame_difference(
                            self._last_frame[:4096], frame[:4096])
                        if diff < self.config.min_change_threshold:
                            if self.config.adaptive_interval:
                                adaptive_interval = min(
                                    adaptive_interval * 1.5,
                                    self.config.max_backoff_seconds,
                                )
                            elapsed = time.monotonic() - start
                            time.sleep(max(0, adaptive_interval - elapsed))
                            continue
                    except Exception:
                        # A failed comparison must not stall the stream;
                        # fall through and send the frame.
                        pass

                adaptive_interval = interval
                self._last_frame = frame
                yield frame

                elapsed = time.monotonic() - start
                time.sleep(max(0, interval - elapsed))
        finally:
            self._running = False
            self._cleanup()

    def stop(self) -> None:
        """Ask the capture loop to exit after its current iteration."""
        self._running = False

    def is_running(self) -> bool:
        """True while ``capture_loop()`` is active."""
        return self._running

    def get_window_info(self) -> 'WindowInfo':
        """Return window metadata, refreshed from the OS when possible.

        If the refresh fails, the previously held WindowInfo is returned.
        """
        enum = WindowEnumerator()
        refreshed = enum.refresh_window_info(self._window)
        if refreshed:
            self._window = refreshed
        return self._window

    def get_stats(self) -> dict:
        """Return running state, frame counter, window dict and key config."""
        return {
            'running': self._running,
            'frame_count': self._frame_count,
            'window': self._window.to_dict(),
            'config': {
                'max_fps': self.config.max_fps,
                'quality': self.config.quality,
                'scale_factor': self.config.scale_factor,
            },
        }

    # ── Windows capture backend ────────────────────────────────

    @staticmethod
    def _release_quietly(release) -> None:
        """Run one cleanup callable; log (never raise) if it fails."""
        try:
            release()
        except Exception as e:
            logger.debug(f"GDI cleanup step failed: {e}")

    def _capture_win32(self) -> Optional[bytes]:
        """Capture the window via Win32 GDI (GetWindowDC + BitBlt).

        Every GDI object and the window DC are released in a ``finally``
        block, so a failure part-way through no longer leaks handles.

        Returns:
            JPEG bytes, or ``None`` when the handle is not a window, the
            client rect is empty, the blit fails, Pillow is unavailable,
            or the pixel conversion fails.

        NOTE(review): the copy size comes from GetClientRect while the
        source DC comes from GetWindowDC; confirm the intended capture
        area before changing either.
        """
        hwnd = self._window.hwnd
        if not _win32gui.IsWindow(hwnd):
            return None

        # Client-area dimensions define the bitmap size.
        left, top, right, bottom = _win32gui.GetClientRect(hwnd)
        width = right - left
        height = bottom - top
        if width <= 0 or height <= 0:
            return None

        hwnd_dc = _win32gui.GetWindowDC(hwnd)
        mfc_dc = save_dc = bitmap = None
        try:
            mfc_dc = _win32ui.CreateDCFromHandle(hwnd_dc)
            save_dc = mfc_dc.CreateCompatibleDC()

            bitmap = _win32ui.CreateBitmap()
            bitmap.CreateCompatibleBitmap(mfc_dc, width, height)
            save_dc.SelectObject(bitmap)

            # Plain BitBlt copy of the window DC into the bitmap.
            try:
                result = save_dc.BitBlt(
                    (0, 0), (width, height), mfc_dc,
                    (left, top), _win32con.SRCCOPY,
                )
            except Exception:
                result = False

            # A None result is treated as success; only an explicit
            # falsy value (or the exception above) counts as failure.
            if not result and result is not None:
                return None

            bmp_info = bitmap.GetInfo()
            bmp_data = bitmap.GetBitmapBits(True)
        finally:
            # Same release order as before: bitmap, memory DC, window DC
            # wrapper, then the raw window DC.
            if bitmap is not None:
                self._release_quietly(
                    lambda: _win32gui.DeleteObject(bitmap.GetHandle()))
            if save_dc is not None:
                self._release_quietly(save_dc.DeleteDC)
            if mfc_dc is not None:
                self._release_quietly(mfc_dc.DeleteDC)
            self._release_quietly(
                lambda: _win32gui.ReleaseDC(hwnd, hwnd_dc))

        # Convert the raw BGRX buffer to JPEG via PIL.
        if _PIL_Image and bmp_data:
            try:
                img = _PIL_Image.frombuffer(
                    'RGB', (bmp_info['bmWidth'], bmp_info['bmHeight']),
                    bmp_data, 'raw', 'BGRX', 0, 1,
                )
                return self._encode_pil_image(img)
            except Exception as e:
                logger.debug(f"PIL conversion failed: {e}")

        return None

    # ── MSS region capture backend ─────────────────────────────

    def _capture_mss_region(self) -> Optional[bytes]:
        """Capture the screen region under the window's rect via mss.

        Returns:
            JPEG bytes when Pillow is available, otherwise PNG bytes from
            ``mss.tools.to_png``; ``None`` for an empty rect.
        """
        x, y, w, h = self._window.rect
        if w <= 0 or h <= 0:
            # Reject before allocating an mss instance we would not use.
            return None

        if self._mss_instance is None:
            self._mss_instance = _mss.mss()

        monitor = {'left': x, 'top': y, 'width': w, 'height': h}
        sct_img = self._mss_instance.grab(monitor)

        if _PIL_Image:
            img = _PIL_Image.frombytes('RGB', sct_img.size,
                                       sct_img.bgra, 'raw', 'BGRX')
            return self._encode_pil_image(img)
        return _mss.tools.to_png(sct_img.rgb, sct_img.size)

    # ── Encoding (matches FrameCapture._encode_pil_image) ──────

    def _encode_pil_image(self, img) -> bytes:
        """Encode a PIL image to JPEG using the configured quality/scale.

        When ``config.scale_factor`` is not 1.0 the image is resized
        first; each target dimension is clamped to at least 1 pixel so a
        small window with a small scale factor cannot produce an invalid
        zero-sized resize.
        """
        if self.config.scale_factor != 1.0:
            new_size = (
                max(1, int(img.width * self.config.scale_factor)),
                max(1, int(img.height * self.config.scale_factor)),
            )
            img = img.resize(new_size,
                             _PIL_Image.LANCZOS if _PIL_Image else 1)

        buf = io.BytesIO()
        img.save(buf, format='JPEG', quality=self.config.quality, optimize=True)
        return buf.getvalue()

    def _cleanup(self) -> None:
        """Close and drop the lazily created mss instance, if any."""
        if self._mss_instance:
            try:
                self._mss_instance.close()
            except Exception:
                pass
            self._mss_instance = None

735 

736 

737# ════════════════════════════════════════════════════════════════════ 

738# Phase 1 of vlm_best_of_all_worlds_plan.md §1 — module-level helpers 

739# the VLM stack uses for occlusion-tolerant capture and multi-monitor 

740# enumeration. Lives in this file (not a sibling) so we have ONE 

741# canonical home for window enumeration; the VLM stack imports from 

742# here rather than maintaining a parallel implementation (Gate 4). 

743# ════════════════════════════════════════════════════════════════════ 

744 

DWMWA_CLOAKED = 14  # DWM window-attribute index; a non-zero value means "cloaked"


def _is_dwm_cloaked(hwnd) -> bool:
    """Report whether DWM considers the window cloaked.

    Per the module's design notes, cloaked windows include DRM-protected
    content and windows parked on another virtual desktop; in both cases
    a capture will not show live content, so callers are warned via this
    flag and can pick a different fallback.

    Best-effort: returns ``False`` when pywin32 is unavailable, when the
    dwmapi call cannot be made, or when it reports a failure code.
    """
    if not _win32gui:
        return False
    try:
        import ctypes
        cloak_state = ctypes.c_int(0)
        hresult = ctypes.windll.dwmapi.DwmGetWindowAttribute(
            int(hwnd), DWMWA_CLOAKED,
            ctypes.byref(cloak_state), ctypes.sizeof(cloak_state))
    except Exception:
        return False
    # The attribute value only counts when the call itself returned 0.
    if hresult != 0:
        return False
    return cloak_state.value != 0

772 

773 

774# Cap on the inner loop of _compute_occlusion. Without it, the 

775# nominally-O(N²) algorithm scales as N(N-1)/2 = 4950 ops at N=100, 

776# 19900 at N=200. In practice typical desktops have <50 visible 

777# windows; extreme outliers (terminal multiplexers, notification 

778# stacks) rarely exceed 100. Capping at OCCLUSION_INNER_CAP+1 

779# windows-above means even at N=500 we do at most 500 * 100 = 50k 

780# cheap rect-intersection ops. Combined with the in-loop 

781# short-circuit (overlap >= win_area → 100%), this is dominated by 

782# the actual EnumWindows syscall overhead. 

783OCCLUSION_INNER_CAP = 100 

784 

785 

786def _compute_occlusion(windows: List[WindowInfo]) -> None: 

787 """Annotate each window's ``is_occluded`` / ``occluded_pct`` in place. 

788 

789 Assumes windows are sorted top-to-bottom z-order (the EnumWindows 

790 callback order). For each window, compute the union of intersections 

791 with every window above it; cap at the window's own area to avoid 

792 over-counting when multiple windows above overlap each other AND this 

793 window. Threshold for is_occluded = > 5% covered (lets small overlay 

794 bars / tray windows not count as 'occluded'). 

795 

796 Performance: O(N × min(N, OCCLUSION_INNER_CAP)) with an inner-loop 

797 short-circuit when overlap_area saturates win_area. 

798 """ 

799 for i, win in enumerate(windows): 

800 if win.minimized: 

801 continue 

802 wx, wy, ww, wh = win.rect 

803 if ww <= 0 or wh <= 0: 

804 continue 

805 win_area = ww * wh 

806 overlap_area = 0 

807 # Inner loop capped: only the topmost OCCLUSION_INNER_CAP windows 

808 # above can occlude this one. Anything deeper than that is 

809 # almost certainly already 100% covered by closer-to-top windows. 

810 upper_bound = min(i, OCCLUSION_INNER_CAP) 

811 for j in range(upper_bound): 

812 other = windows[j] 

813 if other.minimized: 

814 continue 

815 ox, oy, ow, oh = other.rect 

816 ix1 = max(wx, ox) 

817 iy1 = max(wy, oy) 

818 ix2 = min(wx + ww, ox + ow) 

819 iy2 = min(wy + wh, oy + oh) 

820 if ix1 < ix2 and iy1 < iy2: 

821 overlap_area += (ix2 - ix1) * (iy2 - iy1) 

822 # Short-circuit: once we hit 100% covered, more checks 

823 # can't change the verdict. Saves ~half the inner-loop 

824 # work on heavily-stacked desktops. 

825 if overlap_area >= win_area: 

826 overlap_area = win_area 

827 break 

828 win.occluded_pct = (overlap_area / win_area) * 100.0 

829 win.is_occluded = win.occluded_pct > 5.0 

830 

831 

832def _printwindow_with_fallback(hwnd: int, hdc: int, _printwindow=None) -> int: 

833 """Try ``PrintWindow`` with ``PW_RENDERFULLCONTENT=0x02`` (DWM- 

834 aware, captures Chrome / Edge / UWP correctly), fall back to 

835 plain ``PrintWindow`` (flag=0) if the flag is unsupported on 

836 pre-Win10-1903 systems. 

837 

838 Returns the BOOL result of whichever call succeeded, or 0 if 

839 both failed. 

840 

841 ``_printwindow`` is an injection point for unit tests so the 

842 fallback can be verified without a live HWND / GDI context. 

843 Defaults to ``ctypes.windll.user32.PrintWindow``. 

844 """ 

845 if _printwindow is None: 

846 try: 

847 import ctypes 

848 _printwindow = ctypes.windll.user32.PrintWindow 

849 except Exception: 

850 return 0 

851 PW_RENDERFULLCONTENT = 0x02 

852 result = _printwindow(hwnd, hdc, PW_RENDERFULLCONTENT) 

853 if not result: 

854 # Older Win — flag unsupported. Retry without it. Worst case: 

855 # captured frame is missing DWM-rendered content for layered 

856 # windows, but it's still better than nothing. 

857 result = _printwindow(hwnd, hdc, 0) 

858 return result 

859 

860 

861def _assign_monitors(windows: List[WindowInfo], 

862 monitors: List[dict]) -> None: 

863 """Set each window's ``monitor_idx`` based on which monitor its 

864 rect's center point falls on. Monitors that fully contain the 

865 window's center beat partial-overlap monitors (avoids ambiguity 

866 for windows straddling two monitors).""" 

867 for win in windows: 

868 wx, wy, ww, wh = win.rect 

869 cx, cy = wx + ww // 2, wy + wh // 2 

870 win.monitor_idx = -1 

871 for m in monitors: 

872 mx, my, mw, mh = m['rect'] 

873 if mx <= cx < mx + mw and my <= cy < my + mh: 

874 win.monitor_idx = m['idx'] 

875 break 

876 

877 

878# DPI awareness has a single canonical home in core/dpi_awareness.py. 

879# This module imports from there instead of duplicating the 

880# SetProcessDpiAwareness ctypes call. 

881from core.dpi_awareness import ensure_dpi_aware as _ensure_dpi_aware_for_enum 

882 

883 

def list_monitors() -> List[dict]:
    """Enumerate physical displays.

    Dispatches on ``platform.system()``: macOS and Linux are delegated
    to ``_list_monitors_macos`` / ``_list_monitors_linux``; Windows is
    handled inline via ``EnumDisplayMonitors``; anything else yields
    ``[]``.

    Returns:
        List of dicts. On Windows each dict has ``idx``, ``rect``
        ``(x, y, w, h)``, ``work_rect`` ``(x, y, w, h)``,
        ``scale_factor``, ``is_primary`` and ``name``. ``rect`` is
        intended to be in **physical** pixel coords (the Windows path
        calls :func:`_ensure_dpi_aware_for_enum` before enumerating for
        that purpose). Negative values are valid for monitors left/above
        the primary. ``scale_factor`` is the DPI scale (1.0 = 96 DPI;
        1.5 = 144 DPI / 150% scaling) and stays 1.0 when the per-monitor
        DPI query is unavailable. Empty list when no backend is
        available for the host OS or ctypes cannot be imported.
    """
    sysname = platform.system()
    if sysname == 'Darwin':
        return _list_monitors_macos()
    if sysname == 'Linux':
        return _list_monitors_linux()
    if sysname != 'Windows':
        return []
    try:
        import ctypes
        from ctypes import wintypes
    except ImportError:
        return []

    # Must run before enumeration so the reported rects are not
    # DPI-virtualised (see core/dpi_awareness.py).
    _ensure_dpi_aware_for_enum()

    MONITORINFOF_PRIMARY = 0x00000001

    # ctypes mirror of the structure filled in by GetMonitorInfoW;
    # cbSize must be set by the caller before the call.
    class MONITORINFOEX(ctypes.Structure):
        _fields_ = [
            ('cbSize', wintypes.DWORD),
            ('rcMonitor', wintypes.RECT),
            ('rcWork', wintypes.RECT),
            ('dwFlags', wintypes.DWORD),
            ('szDevice', ctypes.c_wchar * 32),
        ]

    # Filled by the callback below, one entry per monitor, in callback order.
    monitors: List[dict] = []

    @ctypes.WINFUNCTYPE(
        ctypes.c_int,
        wintypes.HMONITOR,
        wintypes.HDC,
        ctypes.POINTER(wintypes.RECT),
        wintypes.LPARAM,
    )
    def _enum_proc(hmon, _hdc, _lprect, _lparam):
        # Always returns 1 so enumeration continues with the next monitor.
        info = MONITORINFOEX()
        info.cbSize = ctypes.sizeof(MONITORINFOEX)
        try:
            ctypes.windll.user32.GetMonitorInfoW(hmon, ctypes.byref(info))
        except Exception:
            # Skip this monitor but keep enumerating.
            return 1
        rect = info.rcMonitor
        scale_factor = 1.0
        try:
            # MDT_EFFECTIVE_DPI = 0 (Win 8.1+); falls back to 96 if unavailable
            dpi_x = ctypes.c_uint(96)
            dpi_y = ctypes.c_uint(96)
            ctypes.windll.shcore.GetDpiForMonitor(
                hmon, 0, ctypes.byref(dpi_x), ctypes.byref(dpi_y))
            scale_factor = dpi_x.value / 96.0
        except (AttributeError, OSError):
            # shcore / GetDpiForMonitor missing: keep scale_factor at 1.0.
            pass
        monitors.append({
            'idx': len(monitors),
            # RECT edges converted to (x, y, width, height).
            'rect': (
                rect.left, rect.top,
                rect.right - rect.left, rect.bottom - rect.top,
            ),
            'work_rect': (
                info.rcWork.left, info.rcWork.top,
                info.rcWork.right - info.rcWork.left,
                info.rcWork.bottom - info.rcWork.top,
            ),
            'scale_factor': scale_factor,
            'is_primary': bool(info.dwFlags & MONITORINFOF_PRIMARY),
            'name': info.szDevice,
        })
        return 1

    try:
        ctypes.windll.user32.EnumDisplayMonitors(0, 0, _enum_proc, 0)
    except Exception as e:
        # Best-effort: return whatever was collected before the failure.
        logger.debug(f"EnumDisplayMonitors failed: {e}")
    return monitors

972 

973 

def _list_monitors_macos() -> List[dict]:
    """Enumerate macOS displays through AppKit's ``NSScreen``.

    Phase 2 of the VLM plan §1. Needs pyobjc (already shipped in the
    macOS Nunba bundle). When AppKit cannot be imported the pure-Quartz
    enumerator is used instead, which itself returns ``[]`` if Quartz
    is missing.

    Returns:
        One dict per screen with the keys ``idx``, ``rect``,
        ``work_rect``, ``scale_factor``, ``is_primary`` and ``name``.
        ``rect`` and ``work_rect`` are both ``(x, y, width, height)``
        built from ``NSScreen.frame()``.
    """
    try:
        from AppKit import NSScreen
    except ImportError:
        # The Quartz enumerator guards its own imports and yields []
        # when pyobjc-Quartz is unavailable, so no extra probe is needed.
        return _list_monitors_macos_quartz()

    monitors: List[dict] = []
    # NOTE(review): NSScreen frames live in AppKit's bottom-left-origin
    # coordinate space, whereas the Quartz fallback reports
    # CGDisplayBounds — confirm callers tolerate the difference.
    for idx, screen in enumerate(NSScreen.screens() or []):
        frame = screen.frame()
        bounds = (
            int(frame.origin.x), int(frame.origin.y),
            int(frame.size.width), int(frame.size.height),
        )
        if hasattr(screen, 'backingScaleFactor'):
            scale = screen.backingScaleFactor()
        else:
            scale = 1.0
        if hasattr(screen, 'deviceDescription'):
            sid = screen.deviceDescription().get('NSScreenNumber')
        else:
            sid = None
        monitors.append({
            'idx': idx,
            'rect': bounds,
            'work_rect': bounds,
            'scale_factor': float(scale),
            # The first entry of NSScreen.screens() is the primary
            # (menu-bar) display. NSScreen.mainScreen() is only the
            # screen that currently holds keyboard focus, so it must not
            # be used to decide which monitor is primary.
            'is_primary': idx == 0,
            'name': str(sid) if sid is not None else f'Display{idx}',
        })
    return monitors

1012 

1013 

def _list_monitors_macos_quartz() -> List[dict]:
    """Enumerate macOS displays using only Quartz (no AppKit).

    Used when AppKit isn't importable (rare).

    Returns:
        One dict per active display with the keys ``idx``, ``rect``,
        ``work_rect``, ``scale_factor``, ``is_primary`` and ``name``;
        ``[]`` when Quartz cannot be imported or
        ``CGGetActiveDisplayList`` reports a non-zero error code.
    """
    try:
        from Quartz import (
            CGDisplayBounds, CGGetActiveDisplayList, CGMainDisplayID,
            CGDisplayPixelsWide,
        )
    except ImportError:
        return []
    max_displays = 16
    # PyObjC calling convention: out-parameters are passed as None and
    # come back in the result tuple as (error, display_ids, count).
    # Handing it ctypes buffers and comparing the result to 0 never
    # works because the return value is a tuple.
    err, active, count = CGGetActiveDisplayList(max_displays, None, None)
    if err != 0:
        return []
    main_id = CGMainDisplayID()
    monitors: List[dict] = []
    for i, did in enumerate(tuple(active or ())[:count]):
        bounds = CGDisplayBounds(did)
        # Scale: physical pixels / logical points
        try:
            pw = CGDisplayPixelsWide(did)
            scale = pw / bounds.size.width if bounds.size.width else 1.0
        except Exception:
            scale = 1.0
        area = (
            int(bounds.origin.x), int(bounds.origin.y),
            int(bounds.size.width), int(bounds.size.height),
        )
        monitors.append({
            'idx': i,
            'rect': area,
            'work_rect': area,
            'scale_factor': float(scale),
            'is_primary': did == main_id,
            'name': f'Display{did}',
        })
    return monitors

1056 

1057 

def _list_monitors_linux() -> List[dict]:
    """Enumerate monitors on Linux, trying each backend in turn.

    Probe order:
      1. On a Wayland session (``XDG_SESSION_TYPE=wayland``) the
         desktop-portal enumerator; used only if it returns anything.
      2. The ``xrandr`` CLI enumerator.
      3. The python-xlib enumerator, when python-xlib was importable.

    Returns the first non-empty result, or ``[]`` when every backend
    comes back empty or the xlib backend raises.
    """
    session_type = os.environ.get('XDG_SESSION_TYPE', '').lower()
    if session_type == 'wayland':
        portal_monitors = _list_monitors_wayland_portal()
        if portal_monitors:
            return portal_monitors
        # Nothing from the portal: keep going with xrandr, which works
        # on XWayland and many Wayland compositors that proxy X11
        # enum requests.

    xrandr_monitors = _list_monitors_xrandr()
    if xrandr_monitors:
        return xrandr_monitors

    if _Xlib_display is None:
        return []
    try:
        return _list_monitors_xlib()
    except Exception as e:
        logger.debug(f"xlib monitor enum failed: {e}")
        return []

1077 

1078 

def _list_monitors_xrandr() -> List[dict]:
    """Enumerate monitors by parsing ``xrandr --listmonitors`` output.

    Shelling out to the CLI avoids the python-xlib dependency; xrandr
    is ubiquitous on X11 and present on many Wayland setups.

    Returns:
        One dict per parsed monitor line (``work_rect`` mirrors
        ``rect`` and ``scale_factor`` is always ``1.0``), or ``[]``
        when the command cannot be run, fails, or times out (3 s).
    """
    try:
        listing = subprocess.check_output(
            ['xrandr', '--listmonitors'], timeout=3, text=True)
    except Exception:
        return []

    # Example line: " 0: +*HDMI-1 1920/598x1080/336+0+0  HDMI-1"
    line_pattern = re.compile(
        r'(\d+):\s*\+?\*?(\S+)\s+(\d+)/\d+x(\d+)/\d+\+(-?\d+)\+(-?\d+)')
    found: List[dict] = []
    for raw_line in listing.splitlines():
        entry = raw_line.strip()
        parsed = line_pattern.match(entry)
        if parsed is None:
            # Header ("Monitors: N") or anything else unrecognised.
            continue
        idx_text, name, w_text, h_text, x_text, y_text = parsed.groups()
        # The first token after "N:" carries the "+"/"*" flags; "*"
        # marks the primary monitor.
        flags_token = entry.split(':', 1)[1].split()[0]
        geometry = (int(x_text), int(y_text), int(w_text), int(h_text))
        found.append({
            'idx': int(idx_text),
            'rect': geometry,
            'work_rect': geometry,
            'scale_factor': 1.0,
            'is_primary': '*' in flags_token,
            'name': name,
        })
    return found

1106 

1107 

def _list_monitors_xlib():
    """Enumerate Xinerama screens using python-xlib only.

    Returns ``[]`` when python-xlib is unavailable or the Xinerama
    version query comes back falsy. Otherwise returns one dict per
    Xinerama screen; the screen at index 0 is reported as primary.
    The display connection opened here is always closed before
    returning.
    """
    if _Xlib_display is None:
        return []
    connection = _Xlib_display.Display()
    try:
        from Xlib.ext import xinerama
        if not xinerama.query_version(connection):
            return []
        primary_idx = 0
        result = []
        for i, s in enumerate(xinerama.query_screens(connection).screens):
            area = (s.x, s.y, s.width, s.height)
            result.append({
                'idx': i,
                'rect': area,
                'work_rect': area,
                'scale_factor': 1.0,
                'is_primary': i == primary_idx,
                'name': f'Xinerama{i}',
            })
        return result
    finally:
        connection.close()

1128 

1129 

def _list_monitors_wayland_portal() -> List[dict]:
    """Probe xdg-desktop-portal over D-Bus for monitor information.

    Phase 7 of the VLM plan §1. Stub-quality: the function currently
    ALWAYS returns ``[]``. It only checks that ``dbus-python`` is
    importable (it isn't a hard dep; Wayland users install it with
    ``pip install dbus-python``) and that the portal object can be
    obtained, logging at debug level when either step fails.
    """
    try:
        import dbus  # type: ignore
    except ImportError:
        logger.debug(
            'wayland: dbus-python missing; install for portal monitor '
            'enum, or rely on xrandr/XWayland fallback')
    else:
        try:
            session_bus = dbus.SessionBus()
            desktop_portal = session_bus.get_object(
                'org.freedesktop.portal.Desktop',
                '/org/freedesktop/portal/desktop')
            # The OutputInfo interface isn't standardized yet across
            # compositors, so this is a best-effort probe only; callers
            # fall back to xrandr on most setups.
            _ = desktop_portal  # placeholder for future probe
        except Exception as e:
            logger.debug(f'wayland portal probe failed: {e}')
    return []

1155 

1156 

def _capture_window_macos(wid: int, *, fmt: str = 'jpeg',
                          quality: int = 70) -> Optional[bytes]:
    """Grab a single macOS window as encoded image bytes.

    The image is produced by ``CGWindowListCreateImage`` limited to the
    one window ``wid`` — the CGWindowID returned by
    CGWindowListCopyWindowInfo, NOT a generic process handle. With
    ``kCGWindowImageBoundsIgnoreFraming`` the window is captured even
    when occluded or off-screen.

    Screen Recording permission is required on macOS 10.15+: the first
    call surfaces the system prompt and later calls succeed once it is
    granted.

    Args:
        wid: CGWindowID of the window to capture.
        fmt: ``'png'`` (case-insensitive) for PNG; anything else
            produces JPEG.
        quality: JPEG quality; unused for PNG.

    Returns:
        Encoded image bytes, or None when pyobjc-Quartz or PIL is
        missing, no image is returned (e.g. permission denied), or any
        step of the conversion raises.
    """
    try:
        from Quartz import (
            CGWindowListCreateImage, CGRectNull,
            kCGWindowListOptionIncludingWindow,
            kCGWindowImageBoundsIgnoreFraming,
            kCGWindowImageDefault,
        )
        from Quartz.CoreGraphics import (
            CGImageGetWidth, CGImageGetHeight,
            CGImageGetBytesPerRow, CGImageGetDataProvider,
            CGDataProviderCopyData,
        )
    except ImportError:
        logger.debug('macOS capture: pyobjc-Quartz not installed')
        return None
    if _PIL_Image is None:
        return None

    try:
        cg_image = CGWindowListCreateImage(
            CGRectNull,
            kCGWindowListOptionIncludingWindow,
            int(wid),
            kCGWindowImageBoundsIgnoreFraming | kCGWindowImageDefault,
        )
        if cg_image is None:
            return None
        size = (CGImageGetWidth(cg_image), CGImageGetHeight(cg_image))
        row_stride = CGImageGetBytesPerRow(cg_image)
        # CFData → bytes
        pixels = bytes(
            CGDataProviderCopyData(CGImageGetDataProvider(cg_image)))
        # Quartz returns BGRA on little-endian Macs; the row stride is
        # passed through because rows may be padded.
        rgba = _PIL_Image.frombuffer(
            'RGBA', size, pixels, 'raw', 'BGRA', row_stride, 1)
        rgb = rgba.convert('RGB')
        sink = io.BytesIO()
        if fmt.lower() == 'png':
            save_options = {'format': 'PNG', 'optimize': True}
        else:
            save_options = {
                'format': 'JPEG', 'quality': quality, 'optimize': True}
        rgb.save(sink, **save_options)
        return sink.getvalue()
    except Exception as e:
        logger.debug(f'macOS capture failed for wid={wid}: {e}')
        return None

1213 

1214 

def _capture_window_linux(xid: int, *, fmt: str = 'jpeg',
                          quality: int = 70) -> Optional[bytes]:
    """Grab a single Linux window as encoded image bytes.

    Strategy:
      * Wayland session (``XDG_SESSION_TYPE=wayland``): delegate
        entirely to the desktop-portal capture and return its result.
      * Otherwise try the X11 XComposite path first, which can capture
        occluded windows.
      * If that yields nothing, refresh the window's rect through
        ``WindowEnumerator`` and grab that screen region with mss
        (visible windows only).

    Args:
        xid: X11 window id.
        fmt: ``'png'`` (case-insensitive) for PNG; anything else
            produces JPEG.
        quality: JPEG quality; unused for PNG.

    Returns:
        Encoded image bytes, or None when no backend could capture the
        window.
    """
    session_type = os.environ.get('XDG_SESSION_TYPE', '').lower()
    if session_type == 'wayland':
        return _capture_window_wayland_portal(xid, fmt=fmt, quality=quality)

    # Preferred path: XComposite-aware capture.
    via_composite = _capture_window_xcomposite(
        xid, fmt=fmt, quality=quality)
    if via_composite is not None:
        return via_composite

    # Fallback: look up the window's current rect, then grab that
    # region of the screen with mss.
    enumerator = WindowEnumerator()
    placeholder = WindowInfo(
        hwnd=xid, title='', process_name='', pid=0, rect=(0, 0, 0, 0))
    current = enumerator._refresh_linux(placeholder)
    if current is None:
        return None
    if current.rect[2] <= 0 or current.rect[3] <= 0:
        return None
    if _mss is None or _PIL_Image is None:
        return None

    try:
        with _mss.mss() as grabber:
            left, top, width, height = current.rect
            region = {
                'left': left, 'top': top, 'width': width, 'height': height}
            shot = grabber.grab(region)
            picture = _PIL_Image.frombytes(
                'RGB', shot.size, shot.bgra, 'raw', 'BGRX')
        sink = io.BytesIO()
        if fmt.lower() == 'png':
            save_options = {'format': 'PNG', 'optimize': True}
        else:
            save_options = {
                'format': 'JPEG', 'quality': quality, 'optimize': True}
        picture.save(sink, **save_options)
        return sink.getvalue()
    except Exception as e:
        logger.debug(f'mss region capture failed for xid={xid}: {e}')
        return None

1257 

1258 

def _capture_window_xcomposite(xid: int, *, fmt: str, quality: int) -> Optional[bytes]:
    """Capture an X11 window through the XComposite extension.

    Reads the off-screen pixmap the compositor maintains for each
    redirected window, so occluded windows can be captured too.
    Requires python-xlib and a running compositor (kwin / mutter /
    picom).

    Args:
        xid: X11 window id.
        fmt: ``'png'`` (case-insensitive) for PNG; anything else
            produces JPEG.
        quality: JPEG quality; unused for PNG.

    Returns:
        Encoded image bytes, or None when python-xlib, its composite
        extension module or PIL is missing, or when any X11 / encoding
        step raises.
    """
    if _Xlib_display is None:
        return None
    try:
        from Xlib.ext import composite
    except ImportError:
        return None
    if _PIL_Image is None:
        return None

    try:
        connection = _Xlib_display.Display()
        try:
            composite.query_version(connection)
            target = connection.create_resource_object('window', int(xid))
            # The window has to be redirected before its backing
            # pixmap can be named.
            composite.redirect_window(target, composite.RedirectAutomatic)
            backing = composite.name_window_pixmap(target)
            geometry = target.get_geometry()
            dims = (geometry.width, geometry.height)
            # presumably 2 = ZPixmap format and 0xffffffff = all planes
            # — confirm against the X11 GetImage request.
            snapshot = backing.get_image(
                0, 0, dims[0], dims[1], 2, 0xffffffff)
            picture = _PIL_Image.frombytes(
                'RGB', dims, snapshot.data, 'raw', 'BGRX')
            sink = io.BytesIO()
            if fmt.lower() == 'png':
                save_options = {'format': 'PNG', 'optimize': True}
            else:
                save_options = {
                    'format': 'JPEG', 'quality': quality, 'optimize': True}
            picture.save(sink, **save_options)
            return sink.getvalue()
        finally:
            # Always drop the display connection opened above.
            connection.close()
    except Exception as e:
        logger.debug(f'XComposite capture failed for xid={xid}: {e}')
        return None

1297 

1298 

def _capture_window_wayland_portal(wid: int, *, fmt: str, quality: int) -> Optional[bytes]:
    """Placeholder for xdg-desktop-portal ``Screenshot.PickWindow`` capture.

    Phase 7 of the VLM plan §1. The real flow is interactive (the user
    must approve via the portal UI) and needs a GLib mainloop to
    receive the portal Response signal, which the synchronous VLM
    action path does not provide. The function therefore ALWAYS
    returns None: it logs at debug level when dbus-python is missing,
    and at info level otherwise. ``fmt`` and ``quality`` are accepted
    for signature parity with the other capture backends and are not
    used.
    """
    try:
        import dbus  # type: ignore
    except ImportError:
        logger.debug(
            'wayland capture: dbus-python missing; install for portal '
            'screencast or limit to in-app screenshots only')
    else:
        # Portal flow is out of scope for the Phase 7 stub; just say so.
        logger.info(
            f'wayland capture for wid={wid}: portal flow not yet wired; '
            f'returning None. Cross-app capture on Wayland needs an '
            f'event-loop integration.')
    return None

1319 

1320 

def capture_window_one_shot(hwnd: int, *, fmt: str = 'jpeg',
                            quality: int = 70) -> Optional[bytes]:
    """Capture a single window's pixels even when it's occluded /
    not the foreground.

    On Windows this uses ``user32.PrintWindow`` with
    ``PW_RENDERFULLCONTENT = 0x02``, which captures DWM-rendered
    content correctly for windows that don't respond to ``WM_PRINT``
    (most modern Win10+ apps including Chrome / Edge / UWP). Falls back
    to plain ``PrintWindow`` (flag = 0) for older Windows where the
    flag is unsupported. macOS and Linux are dispatched to their own
    capture backends.

    Args:
        hwnd: Window handle from :func:`list_windows`.
        fmt: 'jpeg' (default) or 'png'.
        quality: JPEG quality 1–100 (ignored for png).

    Returns:
        Image bytes, or None if the window is gone / dimensions zero /
        capture failed / no backend is available for the host OS.

    Failure modes (callers should handle):
      * DRM-protected / cloaked windows return all-black pixels. Check
        ``WindowInfo.is_protected`` before relying on the capture.
      * Pre-Win 10 1903 lacks PW_RENDERFULLCONTENT. This function
        downgrades to flag=0 with a debug log.
      * Window minimized: returns last-saved DWM thumbnail (may be stale).
      * macOS: uses CGWindowListCreateImage with
        kCGWindowListOptionIncludingWindow +
        kCGWindowImageBoundsIgnoreFraming so off-screen / occluded
        windows still capture.
      * Linux X11: uses XCompositeNameWindowPixmap when the COMPOSITE
        extension is available (most modern desktops); else falls
        back to mss region capture which only works when the
        window is visible.
      * Linux Wayland: cross-app capture is portal-gated and
        per-app-permission; returns None when the portal denies.
    """
    sysname = platform.system()
    if sysname == 'Darwin':
        return _capture_window_macos(hwnd, fmt=fmt, quality=quality)
    if sysname == 'Linux':
        return _capture_window_linux(hwnd, fmt=fmt, quality=quality)
    # win32ui is needed for the DC / bitmap objects below, so require it
    # alongside win32gui and PIL.
    if (sysname != 'Windows' or not _win32gui or not _win32ui
            or not _PIL_Image):
        return None
    try:
        # Not used directly here; presumably required by
        # _printwindow_with_fallback — kept as an availability probe.
        import ctypes  # noqa: F401
    except ImportError:
        return None

    # The window can disappear at any moment, so every query that
    # touches it is guarded and maps to the documented None result.
    try:
        if not _win32gui.IsWindow(hwnd):
            return None
        left, top, right, bottom = _win32gui.GetClientRect(hwnd)
    except Exception:
        return None
    width = right - left
    height = bottom - top
    if width <= 0 or height <= 0:
        return None

    try:
        hwnd_dc = _win32gui.GetWindowDC(hwnd)
    except Exception as e:
        logger.debug(f"capture_window_one_shot failed for hwnd={hwnd}: {e}")
        return None
    if not hwnd_dc:
        return None

    def _release(step, label):
        """Run one cleanup step; a failure must not skip the others."""
        try:
            step()
        except Exception as e:
            logger.debug(
                f"capture_window_one_shot cleanup ({label}) failed "
                f"for hwnd={hwnd}: {e}")

    mfc_dc = None
    save_dc = None
    bitmap = None
    try:
        mfc_dc = _win32ui.CreateDCFromHandle(hwnd_dc)
        save_dc = mfc_dc.CreateCompatibleDC()
        bitmap = _win32ui.CreateBitmap()
        bitmap.CreateCompatibleBitmap(mfc_dc, width, height)
        save_dc.SelectObject(bitmap)
        result = _printwindow_with_fallback(hwnd, save_dc.GetSafeHdc())
        if not result:
            return None
        bmp_info = bitmap.GetInfo()
        bmp_data = bitmap.GetBitmapBits(True)
        img = _PIL_Image.frombuffer(
            'RGB',
            (bmp_info['bmWidth'], bmp_info['bmHeight']),
            bmp_data, 'raw', 'BGRX', 0, 1,
        )
        buf = io.BytesIO()
        if fmt.lower() == 'png':
            img.save(buf, format='PNG', optimize=True)
        else:
            img.save(buf, format='JPEG', quality=quality, optimize=True)
        return buf.getvalue()
    except Exception as e:
        logger.debug(f"capture_window_one_shot failed for hwnd={hwnd}: {e}")
        return None
    finally:
        # Each GDI resource is released on its own: previously a single
        # failing step (e.g. GetHandle on a bitmap that was never
        # initialised) skipped DeleteDC / ReleaseDC and leaked handles.
        if bitmap is not None:
            _release(
                lambda: _win32gui.DeleteObject(bitmap.GetHandle()),
                'bitmap')
        if save_dc is not None:
            _release(save_dc.DeleteDC, 'save_dc')
        if mfc_dc is not None:
            _release(mfc_dc.DeleteDC, 'mfc_dc')
        _release(lambda: _win32gui.ReleaseDC(hwnd, hwnd_dc), 'window dc')

1421 

1422 

def list_windows(*, include_minimized: bool = False) -> List[dict]:
    """Return the current windows as plain dicts for the VLM grounding prompt.

    Thin wrapper over :class:`WindowEnumerator`, so the whole codebase
    shares one canonical enumerator implementation. Each ``WindowInfo``
    it yields is converted with ``to_dict()``.

    Args:
        include_minimized: Forwarded unchanged to
            ``WindowEnumerator.list_windows``.
    """
    enumerator = WindowEnumerator()
    as_dicts = []
    for window in enumerator.list_windows(
            include_minimized=include_minimized):
        as_dicts.append(window.to_dict())
    return as_dicts