Coverage for integrations / remote_desktop / window_capture.py: 35.1%
655 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Window Capture — Per-window enumeration and frame capture for tab-detach streaming.
4Instead of full-screen capture, this module captures individual OS windows so each
5remote application (Notepad, CMD, etc.) can be streamed as a separate session.
7Backends (cross-platform, guarded imports):
8 Windows: win32gui EnumWindows + GetWindowDC/BitBlt, fallback mss region
9 Linux: Xlib _NET_CLIENT_LIST or xdotool, fallback mss region
11Follows FrameCapture contract: capture_frame() → JPEG bytes, capture_loop() → generator.
13Reuses:
14 - frame_capture.py: FrameConfig, _CaptureCircuitBreaker, _encode_pil_image pattern
15 - frame_capture.py:54-63: compute_frame_difference() for skip-unchanged
16"""
18import io
19import logging
20import os
21import platform
22import re
23import subprocess
24import threading
25import time
26from dataclasses import dataclass, field
27from typing import Dict, Generator, List, Optional, Tuple
29logger = logging.getLogger('hevolve.remote_desktop')
31# ── Optional dependencies (guarded imports) ─────────────────────
33_mss = None
34_PIL_Image = None
36try:
37 import mss as _mss_module
38 _mss = _mss_module
39except ImportError:
40 pass
42try:
43 from PIL import Image as _PIL_Image_module
44 _PIL_Image = _PIL_Image_module
45except ImportError:
46 pass
48# Windows-specific (win32gui, win32ui, win32con, win32api, win32process)
49_win32gui = None
50_win32ui = None
51_win32con = None
52_win32api = None
53_win32process = None
55try:
56 import win32gui as _win32gui_mod
57 import win32ui as _win32ui_mod
58 import win32con as _win32con_mod
59 import win32api as _win32api_mod
60 import win32process as _win32process_mod
61 _win32gui = _win32gui_mod
62 _win32ui = _win32ui_mod
63 _win32con = _win32con_mod
64 _win32api = _win32api_mod
65 _win32process = _win32process_mod
66except ImportError:
67 pass
69# Linux-specific (Xlib)
70_Xlib_display = None
71try:
72 from Xlib import display as _Xlib_display_mod
73 _Xlib_display = _Xlib_display_mod
74except ImportError:
75 pass
78# ── Data Structures ────────────────────────────────────────────
@dataclass
class WindowInfo:
    """Snapshot of one top-level OS window.

    The first five fields are required. Everything from ``visible``
    onwards has a default, so callers that only know the basic identity
    of a window can still construct an instance. The trailing
    ``z_order`` … ``monitor_idx`` group is filled in by the enumeration
    helpers in this module (occlusion and monitor assignment).
    """
    hwnd: int                          # Window handle (HWND on Windows, XID on Linux)
    title: str
    process_name: str
    pid: int
    rect: Tuple[int, int, int, int]    # (x, y, width, height)
    visible: bool = True
    minimized: bool = False
    # ── Occlusion / multi-monitor enrichment ──
    z_order: int = 0                   # 0 = topmost; larger = further back
    is_foreground: bool = False        # True for the active window
    is_occluded: bool = False          # True when more than 5% of rect is covered
    occluded_pct: float = 0.0          # 0.0–100.0; share of rect area covered
    is_protected: bool = False         # DWM-cloaked window
    monitor_idx: int = -1              # Index into list_monitors() (-1 = unknown)

    def to_dict(self) -> dict:
        """Serialise to a plain dict (``rect`` as a list, ``occluded_pct``
        rounded to one decimal place)."""
        names = (
            'hwnd', 'title', 'process_name', 'pid', 'rect', 'visible',
            'minimized', 'z_order', 'is_foreground', 'is_occluded',
            'occluded_pct', 'is_protected', 'monitor_idx',
        )
        payload = {name: getattr(self, name) for name in names}
        # Overwriting existing keys keeps the insertion order above.
        payload['rect'] = list(self.rect)
        payload['occluded_pct'] = round(self.occluded_pct, 1)
        return payload

    @classmethod
    def from_dict(cls, d: dict) -> 'WindowInfo':
        """Build an instance from a dict such as :meth:`to_dict` produces.

        ``hwnd`` and ``title`` are mandatory (``KeyError`` if absent);
        every other key falls back to the dataclass default.
        """
        fallbacks = {
            'process_name': '',
            'pid': 0,
            'visible': True,
            'minimized': False,
            'z_order': 0,
            'is_foreground': False,
            'is_occluded': False,
            'occluded_pct': 0.0,
            'is_protected': False,
            'monitor_idx': -1,
        }
        handle = d['hwnd']
        caption = d['title']
        geometry = tuple(d.get('rect', (0, 0, 0, 0)))
        optional = {key: d.get(key, default)
                    for key, default in fallbacks.items()}
        return cls(hwnd=handle, title=caption, rect=geometry, **optional)
@dataclass
class WindowCaptureConfig:
    """Tunable knobs for capturing a single window.

    All fields have defaults, so ``WindowCaptureConfig()`` is a usable
    configuration on its own.
    """
    # JPEG encoder quality passed to PIL (1-100).
    quality: int = 80
    # Multiplier applied to frame width/height before encoding (1.0 = no resize).
    scale_factor: float = 1.0
    # Upper bound on frames per second produced by the capture loop.
    max_fps: int = 30
    # Frames whose difference from the previous one is below this are skipped.
    min_change_threshold: float = 0.01
    # Every Nth captured frame is emitted regardless of the difference check.
    keyframe_interval: int = 30
    # When True, the loop sleeps progressively longer while frames are unchanged.
    adaptive_interval: bool = True
    # Ceiling (seconds) for that progressive sleep.
    max_backoff_seconds: float = 2.0
154# ── Window Enumerator ──────────────────────────────────────────
class WindowEnumerator:
    """Cross-platform window enumeration.

    Windows: win32gui.EnumWindows + win32gui.GetWindowText
    Linux: xdotool first, python-xlib ``_NET_CLIENT_LIST`` as fallback

    On any other platform (or on Windows without pywin32) the listing
    methods return an empty list and the refresh method returns ``None``.
    """

    def __init__(self):
        self._system = platform.system()

    def list_windows(self, include_minimized: bool = False) -> List[WindowInfo]:
        """List all visible application windows on the host.

        Args:
            include_minimized: Include minimized/iconic windows. Only the
                Windows backend honours this; the Linux backends do not
                detect minimized state and report every window as
                not minimized.

        Returns:
            List of WindowInfo for each visible window.
        """
        if self._system == 'Windows' and _win32gui:
            return self._list_windows_win32(include_minimized)
        elif self._system == 'Linux':
            return self._list_windows_linux(include_minimized)
        return []

    def get_window_by_title(self, title_pattern: str) -> Optional[WindowInfo]:
        """Find the first window whose title matches ``title_pattern``.

        The pattern is tried as a case-insensitive regular expression.
        If it is not a valid regex (e.g. a literal title fragment such as
        ``"Notepad++ ("``), it is matched as a case-insensitive literal
        substring instead of raising ``re.error``.

        Returns:
            The first matching WindowInfo (minimized windows included),
            or None when nothing matches.
        """
        try:
            pattern = re.compile(title_pattern, re.IGNORECASE)
        except re.error:
            # Not a regex — honour the documented "substring" contract.
            pattern = re.compile(re.escape(title_pattern), re.IGNORECASE)
        for w in self.list_windows(include_minimized=True):
            if pattern.search(w.title):
                return w
        return None

    def get_window_by_pid(self, pid: int) -> Optional[WindowInfo]:
        """Return the first enumerated window owned by ``pid``, or None."""
        for w in self.list_windows(include_minimized=True):
            if w.pid == pid:
                return w
        return None

    def refresh_window_info(self, window: WindowInfo) -> Optional[WindowInfo]:
        """Re-read a window's title/position/visibility for its handle.

        Returns:
            A fresh WindowInfo, or None if the window is gone or no
            backend is available on this platform.
        """
        if self._system == 'Windows' and _win32gui:
            return self._refresh_win32(window)
        elif self._system == 'Linux':
            return self._refresh_linux(window)
        return None

    # ── Windows backend ────────────────────────────────────────

    def _list_windows_win32(self, include_minimized: bool) -> List[WindowInfo]:
        """Enumerate windows via Win32 API.

        The EnumWindows callback order is used as the z-order
        (first callback = ``z_order`` 0) and is what
        :func:`_compute_occlusion` relies on.
        """
        results = []
        try:
            foreground_hwnd = _win32gui.GetForegroundWindow()
        except Exception:
            foreground_hwnd = 0

        # Cache PID → process name for this enumeration so that many
        # windows owned by one process cost a single name lookup.
        process_name_cache: dict = {}

        def enum_callback(hwnd, _):
            if not _win32gui.IsWindowVisible(hwnd):
                return
            title = _win32gui.GetWindowText(hwnd)
            if not title:
                return

            minimized = bool(_win32gui.IsIconic(hwnd))
            if minimized and not include_minimized:
                return

            # Window rect; skip windows with an empty or unreadable rect.
            try:
                left, top, right, bottom = _win32gui.GetWindowRect(hwnd)
                width = right - left
                height = bottom - top
                if width <= 0 or height <= 0:
                    return
            except Exception:
                return

            # Process info is best-effort: on failure pid/name stay empty.
            pid = 0
            process_name = ''
            try:
                _, pid = _win32process.GetWindowThreadProcessId(hwnd)
                if pid in process_name_cache:
                    process_name = process_name_cache[pid]
                else:
                    process_name = self._get_process_name_win32(pid)
                    process_name_cache[pid] = process_name
            except Exception:
                pass

            # z_order is the arrival index among the windows we keep.
            results.append(WindowInfo(
                hwnd=hwnd,
                title=title,
                process_name=process_name,
                pid=pid,
                rect=(left, top, width, height),
                visible=True,
                minimized=minimized,
                z_order=len(results),
                is_foreground=(hwnd == foreground_hwnd),
                is_protected=_is_dwm_cloaked(hwnd),
            ))

        _win32gui.EnumWindows(enum_callback, None)
        # Occlusion and monitor assignment need the complete list, so
        # they run as a second pass.
        _compute_occlusion(results)
        try:
            _assign_monitors(results, list_monitors())
        except Exception as e:
            logger.debug(f"Monitor assignment skipped: {e}")
        return results

    def _get_process_name_win32(self, pid: int) -> str:
        """Return the executable base name for ``pid`` on Windows.

        Returns an empty string if the process cannot be opened or
        queried.
        """
        try:
            import ctypes
            import ctypes.wintypes
            kernel32 = ctypes.windll.kernel32
            PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
            handle = kernel32.OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION,
                                          False, pid)
            if handle:
                try:
                    buf = ctypes.create_unicode_buffer(260)
                    size = ctypes.wintypes.DWORD(260)
                    kernel32.QueryFullProcessImageNameW(handle, 0,
                                                        buf, ctypes.byref(size))
                    full_path = buf.value
                    return os.path.basename(full_path) if full_path else ''
                finally:
                    kernel32.CloseHandle(handle)
        except Exception:
            pass
        return ''

    def _refresh_win32(self, window: WindowInfo) -> Optional[WindowInfo]:
        """Refresh window info for an existing handle.

        Process name and pid are carried over from ``window``. The
        enrichment fields (z-order, occlusion, monitor, …) are not
        recomputed and come back at their defaults.
        """
        hwnd = window.hwnd
        try:
            if not _win32gui.IsWindow(hwnd):
                return None
            title = _win32gui.GetWindowText(hwnd)
            visible = bool(_win32gui.IsWindowVisible(hwnd))
            minimized = bool(_win32gui.IsIconic(hwnd))
            left, top, right, bottom = _win32gui.GetWindowRect(hwnd)
            return WindowInfo(
                hwnd=hwnd,
                title=title,
                process_name=window.process_name,
                pid=window.pid,
                rect=(left, top, right - left, bottom - top),
                visible=visible,
                minimized=minimized,
            )
        except Exception:
            return None

    # ── Linux backend ──────────────────────────────────────────

    def _list_windows_linux(self, include_minimized: bool) -> List[WindowInfo]:
        """Enumerate windows via xdotool, falling back to python-xlib.

        Backend failures are logged at debug level rather than raised;
        an empty list is returned when neither backend works.
        """
        try:
            return self._list_windows_xdotool(include_minimized)
        except Exception as e:
            logger.debug(f"xdotool window enumeration failed: {e}")

        if _Xlib_display:
            try:
                return self._list_windows_xlib(include_minimized)
            except Exception as e:
                logger.debug(f"Xlib window enumeration failed: {e}")

        return []

    def _list_windows_xdotool(self, include_minimized: bool) -> List[WindowInfo]:
        """Enumerate via ``xdotool search`` + per-window queries.

        Raises whatever ``subprocess.check_output`` raises when the
        initial search fails (xdotool missing, timeout, non-zero exit).
        """
        output = subprocess.check_output(
            ['xdotool', 'search', '--name', '.'],
            timeout=5,
            text=True,
        )
        results = []
        for line in output.strip().split('\n'):
            xid_str = line.strip()
            if not xid_str:
                continue
            try:
                xid = int(xid_str)
            except ValueError:
                continue

            info = self._get_xdotool_window_info(xid, include_minimized)
            if info:
                results.append(info)
        return results

    def _get_xdotool_window_info(self, xid: int,
                                 include_minimized: bool) -> Optional[WindowInfo]:
        """Get window info for a single XID via xdotool.

        Args:
            xid: X11 window id.
            include_minimized: Accepted for signature parity with the
                Windows backend; currently unused.

        Returns:
            WindowInfo, or None if the window has no name, its geometry
            cannot be read, or it has a non-positive width/height.
        """
        try:
            name_out = subprocess.check_output(
                ['xdotool', 'getwindowname', str(xid)],
                timeout=2, text=True,
            ).strip()
        except Exception:
            return None

        if not name_out:
            return None

        try:
            geo_out = subprocess.check_output(
                ['xdotool', 'getwindowgeometry', '--shell', str(xid)],
                timeout=2, text=True,
            )
        except Exception:
            return None

        # Parse KEY=VALUE lines (X, Y, WIDTH, HEIGHT, ...). A line whose
        # value is not an integer is ignored so that one odd line does
        # not abort the enumeration of every other window.
        geo = {}
        for gline in geo_out.strip().split('\n'):
            key, sep, value = gline.partition('=')
            if not sep:
                continue
            try:
                geo[key.strip()] = int(value.strip())
            except ValueError:
                continue

        x = geo.get('X', 0)
        y = geo.get('Y', 0)
        w = geo.get('WIDTH', 0)
        h = geo.get('HEIGHT', 0)
        if w <= 0 or h <= 0:
            return None

        # PID is best-effort; not every window exposes one.
        pid = 0
        try:
            pid_out = subprocess.check_output(
                ['xdotool', 'getwindowpid', str(xid)],
                timeout=2, text=True,
            ).strip()
            pid = int(pid_out)
        except Exception:
            pass

        # Process name from PID, also best-effort.
        process_name = ''
        if pid:
            try:
                process_name = subprocess.check_output(
                    ['ps', '-p', str(pid), '-o', 'comm='],
                    timeout=2, text=True,
                ).strip()
            except Exception:
                pass

        return WindowInfo(
            hwnd=xid,
            title=name_out,
            process_name=process_name,
            pid=pid,
            rect=(x, y, w, h),
            visible=True,
            minimized=False,
        )

    def _list_windows_xlib(self, include_minimized: bool) -> List[WindowInfo]:
        """Enumerate via python-xlib ``_NET_CLIENT_LIST``.

        The display connection is always closed, including when a
        property lookup raises. Windows that fail individually are
        skipped. ``include_minimized`` is currently unused.
        """
        disp = _Xlib_display.Display()
        try:
            root = disp.screen().root

            client_list_atom = disp.intern_atom('_NET_CLIENT_LIST')
            prop = root.get_full_property(client_list_atom, 0)
            if not prop:
                return []

            # Interned once; the atom is the same for every window.
            pid_atom = disp.intern_atom('_NET_WM_PID')

            results = []
            for xid in prop.value:
                try:
                    win = disp.create_resource_object('window', xid)
                    name = win.get_wm_name() or ''
                    if not name:
                        continue
                    geo = win.get_geometry()
                    pid = 0
                    pid_prop = win.get_full_property(pid_atom, 0)
                    if pid_prop:
                        pid = pid_prop.value[0]

                    results.append(WindowInfo(
                        hwnd=xid,
                        title=name,
                        process_name='',
                        pid=pid,
                        rect=(geo.x, geo.y, geo.width, geo.height),
                        visible=True,
                        minimized=False,
                    ))
                except Exception:
                    continue
            return results
        finally:
            disp.close()

    def _refresh_linux(self, window: WindowInfo) -> Optional[WindowInfo]:
        """Refresh window info on Linux by re-querying xdotool for its XID."""
        return self._get_xdotool_window_info(window.hwnd, True)
494# ── Per-Window Frame Capture ──────────────────────────────────
class WindowCapture:
    """Capture a specific window (not full screen).

    Follows FrameCapture contract: capture_frame() → encoded image bytes,
    capture_loop() → generator of frames.

    Windows: tries Win32 GDI (GetWindowDC + BitBlt) first.
    Linux/fallback: mss region capture of the window rect (the window
    must be visible on screen for this to show its content).
    """

    def __init__(self, window_info: WindowInfo,
                 config: Optional[WindowCaptureConfig] = None):
        self._window = window_info
        self.config = config or WindowCaptureConfig()
        self._running = False
        self._last_frame: Optional[bytes] = None
        self._frame_count = 0
        self._mss_instance = None
        self._system = platform.system()

    @property
    def window_info(self) -> WindowInfo:
        return self._window

    def capture_frame(self) -> Optional[bytes]:
        """Capture a single frame of this window.

        Returns:
            Encoded image bytes (JPEG when PIL is available), or None if
            every available backend failed. Backend exceptions are
            logged at debug level, never raised.
        """
        if self._system == 'Windows' and _win32gui:
            try:
                frame = self._capture_win32()
                if frame:
                    return frame
            except Exception as e:
                logger.debug(f"Win32 window capture failed: {e}")

        # Fallback: mss region capture (window must be visible)
        if _mss:
            try:
                frame = self._capture_mss_region()
                if frame:
                    return frame
            except Exception as e:
                logger.debug(f"MSS region capture failed: {e}")

        return None

    def capture_loop(self) -> Generator[bytes, None, None]:
        """Yield frames of this window until :meth:`stop` is called.

        Frames that differ from the previous emitted frame by less than
        ``config.min_change_threshold`` are skipped, except every
        ``config.keyframe_interval``-th captured frame. With
        ``config.adaptive_interval`` the sleep between unchanged frames
        grows by 1.5× up to ``config.max_backoff_seconds``.

        A non-positive ``max_fps`` is treated as 1 fps and a non-positive
        ``keyframe_interval`` disables forced keyframes, instead of
        raising ``ZeroDivisionError``.
        """
        self._running = True
        fps = self.config.max_fps
        interval = 1.0 / fps if fps > 0 else 1.0
        keyframe_every = self.config.keyframe_interval
        adaptive_interval = interval
        self._frame_count = 0

        try:
            from integrations.vision.frame_store import compute_frame_difference
        except ImportError:
            def compute_frame_difference(f1, f2):
                # Cheap byte-wise fallback over the first 1000 bytes.
                if len(f1) != len(f2):
                    return 1.0
                diff = sum(abs(a - b) for a, b in zip(f1[:1000], f2[:1000]))
                return min(diff / (255 * min(len(f1), 1000)), 1.0)

        try:
            while self._running:
                start = time.monotonic()

                frame = self.capture_frame()
                if frame is None:
                    time.sleep(interval)
                    continue

                self._frame_count += 1

                # Skip unchanged frames (unless keyframe)
                is_keyframe = (keyframe_every > 0 and
                               self._frame_count % keyframe_every == 0)
                if self._last_frame and not is_keyframe:
                    try:
                        diff = compute_frame_difference(
                            self._last_frame[:4096], frame[:4096])
                        if diff < self.config.min_change_threshold:
                            if self.config.adaptive_interval:
                                adaptive_interval = min(
                                    adaptive_interval * 1.5,
                                    self.config.max_backoff_seconds,
                                )
                            elapsed = time.monotonic() - start
                            time.sleep(max(0, adaptive_interval - elapsed))
                            continue
                    except Exception:
                        # A failing comparison must not stall the stream:
                        # fall through and emit the frame.
                        pass

                adaptive_interval = interval
                self._last_frame = frame
                yield frame

                elapsed = time.monotonic() - start
                time.sleep(max(0, interval - elapsed))
        finally:
            self._running = False
            self._cleanup()

    def stop(self) -> None:
        """Stop the capture loop."""
        self._running = False

    def is_running(self) -> bool:
        return self._running

    def get_window_info(self) -> WindowInfo:
        """Return current window metadata (position may have changed).

        If the refresh fails, the previously known metadata is returned.
        """
        enum = WindowEnumerator()
        refreshed = enum.refresh_window_info(self._window)
        if refreshed:
            self._window = refreshed
        return self._window

    def get_stats(self) -> dict:
        """Return running state, frame counter, window dict and key config."""
        return {
            'running': self._running,
            'frame_count': self._frame_count,
            'window': self._window.to_dict(),
            'config': {
                'max_fps': self.config.max_fps,
                'quality': self.config.quality,
                'scale_factor': self.config.scale_factor,
            },
        }

    # ── Windows capture backend ────────────────────────────────

    def _capture_win32(self) -> Optional[bytes]:
        """Capture the window via Win32 GDI (GetWindowDC + BitBlt).

        Returns:
            JPEG bytes, or None if the handle is invalid, the client
            area is empty, the blit fails, or PIL is unavailable.

        All GDI objects and the window DC are released in a ``finally``
        block, so a failure part-way through does not leak handles.
        """
        hwnd = self._window.hwnd
        if not _win32gui.IsWindow(hwnd):
            return None

        # NOTE(review): dimensions come from GetClientRect while the DC
        # comes from GetWindowDC (whole window incl. frame) — confirm
        # this pairing is intended.
        left, top, right, bottom = _win32gui.GetClientRect(hwnd)
        width = right - left
        height = bottom - top
        if width <= 0 or height <= 0:
            return None

        hwnd_dc = _win32gui.GetWindowDC(hwnd)
        mfc_dc = save_dc = bitmap = None
        try:
            mfc_dc = _win32ui.CreateDCFromHandle(hwnd_dc)
            save_dc = mfc_dc.CreateCompatibleDC()

            bitmap = _win32ui.CreateBitmap()
            bitmap.CreateCompatibleBitmap(mfc_dc, width, height)
            save_dc.SelectObject(bitmap)

            # Copy the window content into the bitmap.
            try:
                result = save_dc.BitBlt(
                    (0, 0), (width, height), mfc_dc,
                    (left, top), _win32con.SRCCOPY,
                )
            except Exception:
                result = False

            # A None result is accepted; only an explicit falsy
            # non-None value (or an exception above) counts as failure.
            if not result and result is not None:
                return None

            bmp_info = bitmap.GetInfo()
            bmp_data = bitmap.GetBitmapBits(True)
        finally:
            self._release_gdi(hwnd, hwnd_dc, mfc_dc, save_dc, bitmap)

        # Convert to JPEG via PIL
        if _PIL_Image and bmp_data:
            try:
                img = _PIL_Image.frombuffer(
                    'RGB', (bmp_info['bmWidth'], bmp_info['bmHeight']),
                    bmp_data, 'raw', 'BGRX', 0, 1,
                )
                return self._encode_pil_image(img)
            except Exception as e:
                logger.debug(f"PIL conversion failed: {e}")

        return None

    @staticmethod
    def _release_gdi(hwnd, hwnd_dc, mfc_dc, save_dc, bitmap) -> None:
        """Release GDI resources; each step is independent and best-effort."""
        releasers = []
        if bitmap is not None:
            releasers.append(
                lambda: _win32gui.DeleteObject(bitmap.GetHandle()))
        if save_dc is not None:
            releasers.append(save_dc.DeleteDC)
        if mfc_dc is not None:
            releasers.append(mfc_dc.DeleteDC)
        releasers.append(lambda: _win32gui.ReleaseDC(hwnd, hwnd_dc))
        for release in releasers:
            try:
                release()
            except Exception as e:
                logger.debug(f"GDI cleanup step failed: {e}")

    # ── MSS region capture backend ─────────────────────────────

    def _capture_mss_region(self) -> Optional[bytes]:
        """Capture the window's rect via mss.

        Returns:
            JPEG bytes when PIL is available; otherwise PNG bytes from
            ``mss.tools.to_png``. None when the rect is empty.
        """
        if self._mss_instance is None:
            self._mss_instance = _mss.mss()

        x, y, w, h = self._window.rect
        if w <= 0 or h <= 0:
            return None

        monitor = {'left': x, 'top': y, 'width': w, 'height': h}
        sct_img = self._mss_instance.grab(monitor)

        if _PIL_Image:
            img = _PIL_Image.frombytes('RGB', sct_img.size,
                                       sct_img.bgra, 'raw', 'BGRX')
            return self._encode_pil_image(img)
        return _mss.tools.to_png(sct_img.rgb, sct_img.size)

    # ── Encoding (matches FrameCapture._encode_pil_image) ──────

    def _encode_pil_image(self, img) -> bytes:
        """Encode a PIL Image to JPEG with configured quality and scale.

        When scaling, each dimension is clamped to at least 1 pixel so a
        small ``scale_factor`` cannot request a zero-sized resize.
        """
        if self.config.scale_factor != 1.0:
            new_size = (
                max(1, int(img.width * self.config.scale_factor)),
                max(1, int(img.height * self.config.scale_factor)),
            )
            img = img.resize(new_size,
                             _PIL_Image.LANCZOS if _PIL_Image else 1)

        buf = io.BytesIO()
        img.save(buf, format='JPEG', quality=self.config.quality, optimize=True)
        return buf.getvalue()

    def _cleanup(self) -> None:
        """Release capture resources (closes the cached mss instance)."""
        if self._mss_instance:
            try:
                self._mss_instance.close()
            except Exception:
                pass
            self._mss_instance = None
737# ════════════════════════════════════════════════════════════════════
738# Phase 1 of vlm_best_of_all_worlds_plan.md §1 — module-level helpers
739# the VLM stack uses for occlusion-tolerant capture and multi-monitor
740# enumeration. Lives in this file (not a sibling) so we have ONE
741# canonical home for window enumeration; the VLM stack imports from
742# here rather than maintaining a parallel implementation (Gate 4).
743# ════════════════════════════════════════════════════════════════════
DWMWA_CLOAKED = 14  # DWM window-attribute index — non-zero = cloaked


def _is_dwm_cloaked(hwnd) -> bool:
    """Report whether DWM flags ``hwnd`` as cloaked.

    Queries ``DwmGetWindowAttribute`` for ``DWMWA_CLOAKED`` and returns
    True only when the call succeeds (result 0) and the attribute value
    is non-zero.

    Callers use the flag as a warning that a capture of this window may
    not reflect live content (the enumeration code treats cloaked
    windows as "protected").

    Best-effort: returns False when pywin32 is not loaded or when the
    ctypes call raises for any reason (e.g. dwmapi not available).
    """
    if _win32gui:
        try:
            import ctypes
            flag = ctypes.c_int(0)
            hresult = ctypes.windll.dwmapi.DwmGetWindowAttribute(
                int(hwnd), DWMWA_CLOAKED,
                ctypes.byref(flag), ctypes.sizeof(flag))
        except Exception:
            return False
        return hresult == 0 and flag.value != 0
    return False
774# Cap on the inner loop of _compute_occlusion. Without it, the
775# nominally-O(N²) algorithm scales as N(N-1)/2 = 4950 ops at N=100,
776# 19900 at N=200. In practice typical desktops have <50 visible
777# windows; extreme outliers (terminal multiplexers, notification
778# stacks) rarely exceed 100. Capping at OCCLUSION_INNER_CAP+1
779# windows-above means even at N=500 we do at most 500 * 100 = 50k
780# cheap rect-intersection ops. Combined with the in-loop
781# short-circuit (overlap >= win_area → 100%), this is dominated by
782# the actual EnumWindows syscall overhead.
783OCCLUSION_INNER_CAP = 100
786def _compute_occlusion(windows: List[WindowInfo]) -> None:
787 """Annotate each window's ``is_occluded`` / ``occluded_pct`` in place.
789 Assumes windows are sorted top-to-bottom z-order (the EnumWindows
790 callback order). For each window, compute the union of intersections
791 with every window above it; cap at the window's own area to avoid
792 over-counting when multiple windows above overlap each other AND this
793 window. Threshold for is_occluded = > 5% covered (lets small overlay
794 bars / tray windows not count as 'occluded').
796 Performance: O(N × min(N, OCCLUSION_INNER_CAP)) with an inner-loop
797 short-circuit when overlap_area saturates win_area.
798 """
799 for i, win in enumerate(windows):
800 if win.minimized:
801 continue
802 wx, wy, ww, wh = win.rect
803 if ww <= 0 or wh <= 0:
804 continue
805 win_area = ww * wh
806 overlap_area = 0
807 # Inner loop capped: only the topmost OCCLUSION_INNER_CAP windows
808 # above can occlude this one. Anything deeper than that is
809 # almost certainly already 100% covered by closer-to-top windows.
810 upper_bound = min(i, OCCLUSION_INNER_CAP)
811 for j in range(upper_bound):
812 other = windows[j]
813 if other.minimized:
814 continue
815 ox, oy, ow, oh = other.rect
816 ix1 = max(wx, ox)
817 iy1 = max(wy, oy)
818 ix2 = min(wx + ww, ox + ow)
819 iy2 = min(wy + wh, oy + oh)
820 if ix1 < ix2 and iy1 < iy2:
821 overlap_area += (ix2 - ix1) * (iy2 - iy1)
822 # Short-circuit: once we hit 100% covered, more checks
823 # can't change the verdict. Saves ~half the inner-loop
824 # work on heavily-stacked desktops.
825 if overlap_area >= win_area:
826 overlap_area = win_area
827 break
828 win.occluded_pct = (overlap_area / win_area) * 100.0
829 win.is_occluded = win.occluded_pct > 5.0
832def _printwindow_with_fallback(hwnd: int, hdc: int, _printwindow=None) -> int:
833 """Try ``PrintWindow`` with ``PW_RENDERFULLCONTENT=0x02`` (DWM-
834 aware, captures Chrome / Edge / UWP correctly), fall back to
835 plain ``PrintWindow`` (flag=0) if the flag is unsupported on
836 pre-Win10-1903 systems.
838 Returns the BOOL result of whichever call succeeded, or 0 if
839 both failed.
841 ``_printwindow`` is an injection point for unit tests so the
842 fallback can be verified without a live HWND / GDI context.
843 Defaults to ``ctypes.windll.user32.PrintWindow``.
844 """
845 if _printwindow is None:
846 try:
847 import ctypes
848 _printwindow = ctypes.windll.user32.PrintWindow
849 except Exception:
850 return 0
851 PW_RENDERFULLCONTENT = 0x02
852 result = _printwindow(hwnd, hdc, PW_RENDERFULLCONTENT)
853 if not result:
854 # Older Win — flag unsupported. Retry without it. Worst case:
855 # captured frame is missing DWM-rendered content for layered
856 # windows, but it's still better than nothing.
857 result = _printwindow(hwnd, hdc, 0)
858 return result
861def _assign_monitors(windows: List[WindowInfo],
862 monitors: List[dict]) -> None:
863 """Set each window's ``monitor_idx`` based on which monitor its
864 rect's center point falls on. Monitors that fully contain the
865 window's center beat partial-overlap monitors (avoids ambiguity
866 for windows straddling two monitors)."""
867 for win in windows:
868 wx, wy, ww, wh = win.rect
869 cx, cy = wx + ww // 2, wy + wh // 2
870 win.monitor_idx = -1
871 for m in monitors:
872 mx, my, mw, mh = m['rect']
873 if mx <= cx < mx + mw and my <= cy < my + mh:
874 win.monitor_idx = m['idx']
875 break
878# DPI awareness has a single canonical home in core/dpi_awareness.py.
879# This module imports from there instead of duplicating the
880# SetProcessDpiAwareness ctypes call.
881from core.dpi_awareness import ensure_dpi_aware as _ensure_dpi_aware_for_enum
def list_monitors() -> List[dict]:
    """Enumerate physical displays.

    Dispatches to the macOS / Linux helpers on those platforms; the
    Windows implementation (EnumDisplayMonitors) lives in this function.

    Returns:
        List of dicts: ``[{idx, rect: (x,y,w,h), work_rect: (x,y,w,h),
        scale_factor, is_primary, name}]``. On Windows,
        :func:`_ensure_dpi_aware_for_enum` is called before enumerating
        so that ``rect`` is reported in physical pixels. Negative
        coordinates are valid for monitors left/above the primary.
        ``scale_factor`` is the DPI scale (1.0 = 96 DPI; 1.5 = 144 DPI).
        Empty list when no backend is available for the host OS.

    A monitor whose ``GetMonitorInfoW`` query fails is skipped rather
    than reported with an all-zero rect.
    """
    sysname = platform.system()
    if sysname == 'Darwin':
        return _list_monitors_macos()
    if sysname == 'Linux':
        return _list_monitors_linux()
    if sysname != 'Windows':
        return []
    try:
        import ctypes
        from ctypes import wintypes
    except ImportError:
        return []

    _ensure_dpi_aware_for_enum()

    MONITORINFOF_PRIMARY = 0x00000001

    class MONITORINFOEX(ctypes.Structure):
        _fields_ = [
            ('cbSize', wintypes.DWORD),
            ('rcMonitor', wintypes.RECT),
            ('rcWork', wintypes.RECT),
            ('dwFlags', wintypes.DWORD),
            ('szDevice', ctypes.c_wchar * 32),
        ]

    monitors: List[dict] = []

    @ctypes.WINFUNCTYPE(
        ctypes.c_int,
        wintypes.HMONITOR,
        wintypes.HDC,
        ctypes.POINTER(wintypes.RECT),
        wintypes.LPARAM,
    )
    def _enum_proc(hmon, _hdc, _lprect, _lparam):
        # Re-wrap the handle: without argtypes ctypes would marshal a
        # bare Python int as a 32-bit C int, which can truncate a
        # pointer-sized handle on 64-bit Windows.
        handle = wintypes.HMONITOR(hmon)

        info = MONITORINFOEX()
        info.cbSize = ctypes.sizeof(MONITORINFOEX)
        try:
            ok = ctypes.windll.user32.GetMonitorInfoW(
                handle, ctypes.byref(info))
        except Exception:
            return 1
        if not ok:
            # Query failed; `info` is still zeroed. Skip, keep enumerating.
            return 1

        rect = info.rcMonitor
        scale_factor = 1.0
        try:
            # MDT_EFFECTIVE_DPI = 0; scale stays 1.0 if shcore is missing
            # or the call reports failure.
            dpi_x = ctypes.c_uint(96)
            dpi_y = ctypes.c_uint(96)
            hr = ctypes.windll.shcore.GetDpiForMonitor(
                handle, 0, ctypes.byref(dpi_x), ctypes.byref(dpi_y))
            if hr == 0 and dpi_x.value:
                scale_factor = dpi_x.value / 96.0
        except (AttributeError, OSError):
            pass

        monitors.append({
            'idx': len(monitors),
            'rect': (
                rect.left, rect.top,
                rect.right - rect.left, rect.bottom - rect.top,
            ),
            'work_rect': (
                info.rcWork.left, info.rcWork.top,
                info.rcWork.right - info.rcWork.left,
                info.rcWork.bottom - info.rcWork.top,
            ),
            'scale_factor': scale_factor,
            'is_primary': bool(info.dwFlags & MONITORINFOF_PRIMARY),
            'name': info.szDevice,
        })
        return 1  # non-zero = continue enumeration

    try:
        ctypes.windll.user32.EnumDisplayMonitors(0, 0, _enum_proc, 0)
    except Exception as e:
        logger.debug(f"EnumDisplayMonitors failed: {e}")
    return monitors
974def _list_monitors_macos() -> List[dict]:
975 """macOS list_monitors via Quartz NSScreen. Phase 2 of the VLM
976 plan §1. Requires pyobjc-Quartz (already shipped in the macOS
977 Nunba bundle); returns ``[]`` if not importable."""
978 try:
979 from AppKit import NSScreen
980 except ImportError:
981 try:
982 from Quartz import CGDisplayBounds, CGGetActiveDisplayList
983 except ImportError:
984 return []
985 # Fallback Quartz-only path.
986 return _list_monitors_macos_quartz()
987 monitors: List[dict] = []
988 screens = NSScreen.screens()
989 main_screen = NSScreen.mainScreen()
990 main_id = main_screen.deviceDescription()['NSScreenNumber'] if main_screen else None
991 for idx, screen in enumerate(screens):
992 frame = screen.frame()
993 scale = screen.backingScaleFactor() if hasattr(
994 screen, 'backingScaleFactor') else 1.0
995 sid = screen.deviceDescription().get('NSScreenNumber') \
996 if hasattr(screen, 'deviceDescription') else None
997 monitors.append({
998 'idx': idx,
999 'rect': (
1000 int(frame.origin.x), int(frame.origin.y),
1001 int(frame.size.width), int(frame.size.height),
1002 ),
1003 'work_rect': (
1004 int(frame.origin.x), int(frame.origin.y),
1005 int(frame.size.width), int(frame.size.height),
1006 ),
1007 'scale_factor': float(scale),
1008 'is_primary': (sid == main_id) if sid is not None else (idx == 0),
1009 'name': str(sid) if sid is not None else f'Display{idx}',
1010 })
1011 return monitors
1014def _list_monitors_macos_quartz() -> List[dict]:
1015 """Pure-Quartz path used when AppKit isn't importable (rare)."""
1016 try:
1017 from Quartz import (
1018 CGDisplayBounds, CGGetActiveDisplayList, CGMainDisplayID,
1019 CGDisplayPixelsWide, CGDisplayPixelsHigh,
1020 )
1021 except ImportError:
1022 return []
1023 import ctypes as _ct
1024 max_displays = 16
1025 active = (_ct.c_uint32 * max_displays)()
1026 count = _ct.c_uint32(0)
1027 err = CGGetActiveDisplayList(max_displays, active, _ct.byref(count))
1028 if err != 0:
1029 return []
1030 main_id = CGMainDisplayID()
1031 monitors: List[dict] = []
1032 for i in range(count.value):
1033 did = active[i]
1034 bounds = CGDisplayBounds(did)
1035 # Scale: physical pixels / logical points
1036 try:
1037 pw = CGDisplayPixelsWide(did)
1038 scale = pw / bounds.size.width if bounds.size.width else 1.0
1039 except Exception:
1040 scale = 1.0
1041 monitors.append({
1042 'idx': i,
1043 'rect': (
1044 int(bounds.origin.x), int(bounds.origin.y),
1045 int(bounds.size.width), int(bounds.size.height),
1046 ),
1047 'work_rect': (
1048 int(bounds.origin.x), int(bounds.origin.y),
1049 int(bounds.size.width), int(bounds.size.height),
1050 ),
1051 'scale_factor': float(scale),
1052 'is_primary': did == main_id,
1053 'name': f'Display{did}',
1054 })
1055 return monitors
def _list_monitors_linux() -> List[dict]:
    """Enumerate monitors on Linux, trying each backend in turn.

    Order of attempts:
      1. :func:`_list_monitors_wayland_portal`, only when
         XDG_SESSION_TYPE is ``wayland``.
      2. :func:`_list_monitors_xrandr` (also reached from a Wayland
         session when the portal yields nothing — works on XWayland
         and many Wayland compositors that proxy X11 enum requests).
      3. :func:`_list_monitors_xlib`, when python-xlib was importable.

    Returns the first non-empty result, or ``[]`` when every backend
    comes back empty or the Xlib path raises.
    """
    session_type = os.environ.get('XDG_SESSION_TYPE', '')
    found = (_list_monitors_wayland_portal()
             if session_type.lower() == 'wayland' else None)
    if not found:
        found = _list_monitors_xrandr()
    if found:
        return found
    if _Xlib_display is None:
        return []
    try:
        return _list_monitors_xlib()
    except Exception as e:
        logger.debug(f"xlib monitor enum failed: {e}")
        return []
1079def _list_monitors_xrandr() -> List[dict]:
1080 """xrandr CLI shellout — ubiquitous on X11 + many Wayland setups
1081 and avoids the python-xlib dependency."""
1082 try:
1083 out = subprocess.check_output(
1084 ['xrandr', '--listmonitors'], timeout=3, text=True)
1085 except Exception:
1086 return []
1087 monitors: List[dict] = []
1088 for line in out.splitlines():
1089 # Format: " 0: +*HDMI-1 1920/598x1080/336+0+0 HDMI-1"
1090 line = line.strip()
1091 m = re.match(
1092 r'(\d+):\s*\+?\*?(\S+)\s+(\d+)/\d+x(\d+)/\d+\+(-?\d+)\+(-?\d+)',
1093 line)
1094 if not m:
1095 continue
1096 idx = int(m.group(1))
1097 name = m.group(2)
1098 is_primary = '*' in line.split(':', 1)[1].split()[0]
1099 w, h, x, y = (int(m.group(3)), int(m.group(4)),
1100 int(m.group(5)), int(m.group(6)))
1101 monitors.append({
1102 'idx': idx, 'rect': (x, y, w, h), 'work_rect': (x, y, w, h),
1103 'scale_factor': 1.0, 'is_primary': is_primary, 'name': name,
1104 })
1105 return monitors
def _list_monitors_xlib():
    """Enumerate Xinerama screens through python-xlib.

    Returns ``[]`` when python-xlib is unavailable or the Xinerama
    version query comes back falsy. The first screen (index 0) is
    always reported as primary. The display connection opened here is
    closed before returning, including on error.
    """
    if _Xlib_display is None:
        return []
    disp = _Xlib_display.Display()
    try:
        from Xlib.ext import xinerama
        if not xinerama.query_version(disp):
            return []
        monitors = []
        for i, s in enumerate(xinerama.query_screens(disp).screens):
            geometry = (s.x, s.y, s.width, s.height)
            monitors.append({
                'idx': i,
                'rect': geometry,
                'work_rect': geometry,
                'scale_factor': 1.0,
                'is_primary': i == 0,
                'name': f'Xinerama{i}',
            })
        return monitors
    finally:
        disp.close()
def _list_monitors_wayland_portal() -> List[dict]:
    """Probe xdg-desktop-portal over D-Bus for monitor information.

    Phase 7 of the VLM plan §1. Stub-quality: the function currently
    only checks that ``dbus-python`` is importable and that the portal
    object can be looked up on the session bus; it always returns
    ``[]``, so callers fall back to xrandr. ``dbus-python`` is not a
    hard dependency — Wayland users install it themselves
    (``pip install dbus-python``).
    """
    try:
        import dbus  # type: ignore
    except ImportError:
        logger.debug(
            'wayland: dbus-python missing; install for portal monitor '
            'enum, or rely on xrandr/XWayland fallback')
        return []
    try:
        # The OutputInfo interface isn't standardized yet across
        # compositors, so only the portal object lookup is attempted;
        # the handle is a placeholder for a future probe.
        session_bus = dbus.SessionBus()
        session_bus.get_object(
            'org.freedesktop.portal.Desktop',
            '/org/freedesktop/portal/desktop')
    except Exception as e:
        logger.debug(f'wayland portal probe failed: {e}')
    return []
def _capture_window_macos(wid: int, *, fmt: str = 'jpeg',
                          quality: int = 70) -> Optional[bytes]:
    """Grab a single macOS window as encoded image bytes via Quartz.

    ``wid`` must be the CGWindowID returned by
    CGWindowListCopyWindowInfo — NOT a generic process handle. The
    image is requested with kCGWindowImageBoundsIgnoreFraming so the
    window is captured even when occluded or off-screen.

    Screen Recording permission is required on macOS 10.15+: the first
    call surfaces the system prompt and later calls succeed once it is
    granted.

    Args:
        wid: CGWindowID of the window to capture.
        fmt: 'png' (case-insensitive) for PNG; anything else gives JPEG.
        quality: JPEG quality; unused for PNG.

    Returns:
        Encoded image bytes, or None when pyobjc-Quartz or PIL is
        missing, Quartz yields no image (e.g. permission denied), or
        any capture/encode step raises.
    """
    try:
        from Quartz import (
            CGWindowListCreateImage, CGRectNull,
            kCGWindowListOptionIncludingWindow,
            kCGWindowImageBoundsIgnoreFraming,
            kCGWindowImageDefault,
        )
        from Quartz.CoreGraphics import (
            CGImageGetWidth, CGImageGetHeight,
            CGImageGetBytesPerRow, CGImageGetDataProvider,
            CGDataProviderCopyData,
        )
    except ImportError:
        logger.debug('macOS capture: pyobjc-Quartz not installed')
        return None
    if _PIL_Image is None:
        return None
    try:
        image_options = (
            kCGWindowImageBoundsIgnoreFraming | kCGWindowImageDefault)
        cg_image = CGWindowListCreateImage(
            CGRectNull, kCGWindowListOptionIncludingWindow,
            int(wid), image_options)
        if cg_image is None:
            return None
        size = (CGImageGetWidth(cg_image), CGImageGetHeight(cg_image))
        stride = CGImageGetBytesPerRow(cg_image)
        # CFData → bytes
        pixels = bytes(CGDataProviderCopyData(
            CGImageGetDataProvider(cg_image)))
        # Quartz returns BGRA on little-endian Macs; the row stride is
        # passed through because rows may be padded.
        picture = _PIL_Image.frombuffer(
            'RGBA', size, pixels, 'raw', 'BGRA', stride, 1).convert('RGB')
        out = io.BytesIO()
        if fmt.lower() == 'png':
            save_args = {'format': 'PNG', 'optimize': True}
        else:
            save_args = {'format': 'JPEG', 'quality': quality,
                         'optimize': True}
        picture.save(out, **save_args)
        return out.getvalue()
    except Exception as e:
        logger.debug(f'macOS capture failed for wid={wid}: {e}')
        return None
def _capture_window_linux(xid: int, *, fmt: str = 'jpeg',
                          quality: int = 70) -> Optional[bytes]:
    """Grab a single Linux window as encoded image bytes.

    Strategy:
      * Wayland session (XDG_SESSION_TYPE=wayland): delegate entirely
        to :func:`_capture_window_wayland_portal`.
      * Otherwise try :func:`_capture_window_xcomposite` first, which
        can capture occluded windows.
      * Fall back to an mss screen-region grab at the window's current
        rect (visible windows only).

    Args:
        xid: X11 window id.
        fmt: 'png' (case-insensitive) for PNG; anything else gives JPEG.
        quality: JPEG quality; unused for PNG.

    Returns:
        Encoded image bytes, or None when no path produced an image.
    """
    session_type = os.environ.get('XDG_SESSION_TYPE', '')
    if session_type.lower() == 'wayland':
        return _capture_window_wayland_portal(xid, fmt=fmt, quality=quality)

    # Preferred: XComposite-aware capture.
    via_composite = _capture_window_xcomposite(
        xid, fmt=fmt, quality=quality)
    if via_composite is not None:
        return via_composite

    # Fallback: look up the window's rect, then grab that screen region.
    probe = WindowInfo(
        hwnd=xid, title='', process_name='', pid=0, rect=(0, 0, 0, 0))
    located = WindowEnumerator()._refresh_linux(probe)
    if located is None:
        return None
    if located.rect[2] <= 0 or located.rect[3] <= 0:
        return None
    if _mss is None or _PIL_Image is None:
        return None
    try:
        left, top, width, height = located.rect
        region = {'left': left, 'top': top,
                  'width': width, 'height': height}
        with _mss.mss() as grabber:
            shot = grabber.grab(region)
            picture = _PIL_Image.frombytes(
                'RGB', shot.size, shot.bgra, 'raw', 'BGRX')
            out = io.BytesIO()
            if fmt.lower() == 'png':
                picture.save(out, format='PNG', optimize=True)
            else:
                picture.save(out, format='JPEG', quality=quality,
                             optimize=True)
            return out.getvalue()
    except Exception as e:
        logger.debug(f'mss region capture failed for xid={xid}: {e}')
        return None
def _capture_window_xcomposite(xid: int, *, fmt: str, quality: int) -> Optional[bytes]:
    """Capture a window through the X11 XComposite extension.

    Reads the off-screen pixmap the compositor maintains for each
    redirected window, so occluded windows can be captured. Requires
    python-xlib and a running compositor (kwin / mutter / picom).

    Args:
        xid: X11 window id.
        fmt: 'png' (case-insensitive) for PNG; anything else gives JPEG.
        quality: JPEG quality; unused for PNG.

    Returns:
        Encoded image bytes, or None when python-xlib, its composite
        extension module or PIL is unavailable, or any X11/encode step
        raises.
    """
    if _Xlib_display is None:
        return None
    try:
        from Xlib.ext import composite
    except ImportError:
        return None
    if _PIL_Image is None:
        return None
    try:
        disp = _Xlib_display.Display()
        try:
            composite.query_version(disp)
            target = disp.create_resource_object('window', int(xid))
            composite.redirect_window(target, composite.RedirectAutomatic)
            backing = composite.name_window_pixmap(target)
            geometry = target.get_geometry()
            size = (geometry.width, geometry.height)
            # NOTE(review): format 2 is presumably X.ZPixmap and the
            # mask selects all planes — confirm against python-xlib.
            grabbed = backing.get_image(
                0, 0, size[0], size[1], 2, 0xffffffff)
            picture = _PIL_Image.frombytes(
                'RGB', size, grabbed.data, 'raw', 'BGRX')
            out = io.BytesIO()
            if fmt.lower() == 'png':
                save_args = {'format': 'PNG', 'optimize': True}
            else:
                save_args = {'format': 'JPEG', 'quality': quality,
                             'optimize': True}
            picture.save(out, **save_args)
            return out.getvalue()
        finally:
            # Always drop the per-call display connection.
            disp.close()
    except Exception as e:
        logger.debug(f'XComposite capture failed for xid={xid}: {e}')
        return None
def _capture_window_wayland_portal(wid: int, *, fmt: str, quality: int) -> Optional[bytes]:
    """Placeholder for xdg-desktop-portal Screenshot.PickWindow capture.

    Phase 7 of the VLM plan §1. The real flow is interactive (the user
    must approve via the portal UI) and needs a GLib mainloop to
    receive the portal Response signal, which the synchronous VLM
    action path doesn't provide. This stub therefore always returns
    None: with a debug log when dbus-python is missing, otherwise with
    an info log noting the flow is not wired. ``fmt`` and ``quality``
    are accepted for signature parity and are unused.
    """
    try:
        import dbus  # type: ignore
    except ImportError:
        logger.debug(
            'wayland capture: dbus-python missing; install for portal '
            'screencast or limit to in-app screenshots only')
        return None
    # dbus is importable, but the portal round-trip itself is out of
    # scope for the Phase 7 stub.
    not_wired_msg = (
        f'wayland capture for wid={wid}: portal flow not yet wired; '
        'returning None. Cross-app capture on Wayland needs an '
        'event-loop integration.')
    logger.info(not_wired_msg)
    return None
def capture_window_one_shot(hwnd: int, *, fmt: str = 'jpeg',
                            quality: int = 70) -> Optional[bytes]:
    """Capture a single window's pixels even when it's occluded /
    not the foreground.

    On Windows this uses ``user32.PrintWindow`` with
    ``PW_RENDERFULLCONTENT = 0x02`` which captures DWM-rendered content
    correctly for windows that don't respond to ``WM_PRINT`` (most
    modern Win10+ apps including Chrome / Edge / UWP). Falls back to
    plain ``PrintWindow`` (flag = 0) for older Windows where the flag
    is unsupported. macOS and Linux are dispatched to
    :func:`_capture_window_macos` / :func:`_capture_window_linux`.

    Args:
        hwnd: Window handle from :func:`list_windows`.
        fmt: 'jpeg' (default) or 'png'.
        quality: JPEG quality 1–100 (ignored for png).

    Returns:
        Image bytes, or None if the window is gone / dimensions zero /
        capture failed / the platform backend is unavailable.

    Failure modes (callers should handle):
      * DRM-protected / cloaked windows return all-black pixels. Check
        ``WindowInfo.is_protected`` before relying on the capture.
      * Pre-Win 10 1903 lacks PW_RENDERFULLCONTENT. This function
        downgrades to flag=0 with a debug log.
      * Window minimized: returns last-saved DWM thumbnail (may be stale).

    Platform notes:
      * macOS: uses CGWindowListCreateImage with
        kCGWindowListOptionIncludingWindow +
        kCGWindowImageBoundsIgnoreFraming so off-screen / occluded
        windows still capture.
      * Linux X11: uses XCompositeNameWindowPixmap when the COMPOSITE
        extension is available (most modern desktops); else falls
        back to mss region capture which only works when the
        window is visible.
      * Linux Wayland: cross-app capture is portal-gated and
        per-app-permission; returns None when the portal denies.
    """
    sysname = platform.system()
    if sysname == 'Darwin':
        return _capture_window_macos(hwnd, fmt=fmt, quality=quality)
    if sysname == 'Linux':
        return _capture_window_linux(hwnd, fmt=fmt, quality=quality)
    if sysname != 'Windows' or not _win32gui or not _PIL_Image:
        return None
    try:
        # Availability guard only; presumably the PrintWindow helper
        # relies on ctypes — confirm against _printwindow_with_fallback.
        import ctypes  # noqa: F401
    except ImportError:
        return None
    try:
        if not _win32gui.IsWindow(hwnd):
            return None
        left, top, right, bottom = _win32gui.GetClientRect(hwnd)
    except Exception:
        return None
    width = right - left
    height = bottom - top
    if width <= 0 or height <= 0:
        return None

    try:
        hwnd_dc = _win32gui.GetWindowDC(hwnd)
    except Exception as e:
        # Honour the "None on failure" contract instead of raising.
        logger.debug(f"capture_window_one_shot failed for hwnd={hwnd}: {e}")
        return None
    if not hwnd_dc:
        return None
    mfc_dc = None
    save_dc = None
    bitmap = None
    try:
        mfc_dc = _win32ui.CreateDCFromHandle(hwnd_dc)
        save_dc = mfc_dc.CreateCompatibleDC()
        bitmap = _win32ui.CreateBitmap()
        bitmap.CreateCompatibleBitmap(mfc_dc, width, height)
        save_dc.SelectObject(bitmap)
        result = _printwindow_with_fallback(hwnd, save_dc.GetSafeHdc())
        if not result:
            return None
        bmp_info = bitmap.GetInfo()
        bmp_data = bitmap.GetBitmapBits(True)
        img = _PIL_Image.frombuffer(
            'RGB',
            (bmp_info['bmWidth'], bmp_info['bmHeight']),
            bmp_data, 'raw', 'BGRX', 0, 1,
        )
        buf = io.BytesIO()
        if fmt.lower() == 'png':
            img.save(buf, format='PNG', optimize=True)
        else:
            img.save(buf, format='JPEG', quality=quality, optimize=True)
        return buf.getvalue()
    except Exception as e:
        logger.debug(f"capture_window_one_shot failed for hwnd={hwnd}: {e}")
        return None
    finally:
        # Release every GDI resource independently: a failure freeing
        # one handle must not skip the remaining releases (in particular
        # ReleaseDC), otherwise repeated captures leak GDI handles.
        releases = []
        if bitmap is not None:
            releases.append(
                lambda: _win32gui.DeleteObject(bitmap.GetHandle()))
        if save_dc is not None:
            releases.append(save_dc.DeleteDC)
        if mfc_dc is not None:
            releases.append(mfc_dc.DeleteDC)
        releases.append(lambda: _win32gui.ReleaseDC(hwnd, hwnd_dc))
        for release in releases:
            try:
                release()
            except Exception as e:
                logger.debug(
                    f"capture_window_one_shot cleanup failed for "
                    f"hwnd={hwnd}: {e}")
def list_windows(*, include_minimized: bool = False) -> List[dict]:
    """Return the current windows as plain dicts for the VLM grounding
    prompt.

    Thin wrapper over :class:`WindowEnumerator` (the one canonical
    enumerator for the codebase): each enumerated window object is
    converted with its ``to_dict()`` method.

    Args:
        include_minimized: Forwarded unchanged to
            ``WindowEnumerator.list_windows``.
    """
    enumerator = WindowEnumerator()
    found = enumerator.list_windows(include_minimized=include_minimized)
    return [info.to_dict() for info in found]