Coverage for integrations/vision/lightweight_backend.py: 55.9%

395 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Lightweight Vision Backend — CPU-only alternatives to MiniCPM for embedded devices. 

3 

Provides a unified interface for vision models across different hardware tiers:
 - qwen08b:   Qwen3.5-0.8B via llama-server, fast continuous captioning (CPU or GPU)
 - qwen3vl:   Qwen3-VL, reuses the Computer Use endpoint (GPU)
 - minicpm:   Full MiniCPM-V-2 (GPU, 4GB+ VRAM) — original default
 - mobilevlm: MobileVLM-1.7B via ONNX Runtime (~300MB RAM, CPU)
 - clip:      CLIP ViT-B/16, classification only (~400MB RAM, CPU)
 - none:      FrameStore only — no descriptions, zero overhead

9 

10Auto-selects backend by hardware tier unless HEVOLVE_VISION_BACKEND is set. 

11 

12Usage: 

13 backend = get_vision_backend() 

14 description = backend.describe(frame_bytes) 

15""" 

16import logging 

17import os 

18from abc import ABC, abstractmethod 

19from typing import Optional 

20 

21from core.http_pool import pooled_get, pooled_post 

22 

23logger = logging.getLogger('hevolve_vision') 

24 

25 

26class VisionBackend(ABC): 

27 """Abstract base for vision backends.""" 

28 

29 @property 

30 @abstractmethod 

31 def name(self) -> str: 

32 pass 

33 

34 @property 

35 @abstractmethod 

36 def requires_gpu(self) -> bool: 

37 pass 

38 

39 @property 

40 @abstractmethod 

41 def ram_mb(self) -> int: 

42 """Approximate RAM usage in MB.""" 

43 pass 

44 

45 @abstractmethod 

46 def is_available(self) -> bool: 

47 """Check if this backend can run on current hardware.""" 

48 pass 

49 

50 @abstractmethod 

51 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

52 """Generate a text description of the frame. 

53 

54 Args: 

55 frame_bytes: JPEG/PNG image bytes 

56 prompt: Optional prompt for the VLM (e.g. "What do you see?") 

57 

58 Returns: 

59 Text description, or None if the backend can't process it. 

60 """ 

61 pass 

62 

63 def start(self) -> bool: 

64 """Initialize the backend model. Returns True if ready.""" 

65 return True 

66 

67 def stop(self): 

68 """Release resources.""" 

69 pass 

70 

71 
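For orientation, the interface above is everything a backend needs to plug into the registry and the auto-selection logic. Below is a minimal, purely illustrative subclass (not part of the module) showing the required members.

# Illustrative only: the smallest possible VisionBackend implementation.
# A real backend would run a model in describe(); this one just echoes
# frame metadata so the required surface area is easy to see.
class _EchoBackend(VisionBackend):

    @property
    def name(self) -> str:
        return 'echo'

    @property
    def requires_gpu(self) -> bool:
        return False

    @property
    def ram_mb(self) -> int:
        return 1

    def is_available(self) -> bool:
        return True  # no external dependencies

    def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]:
        return f"frame of {len(frame_bytes)} bytes"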

72class MiniCPMBackend(VisionBackend): 

73 """Full MiniCPM-V-2 backend — existing sidecar subprocess.""" 

74 

75 def __init__(self, port: int = None): 

76 from core.port_registry import get_port 

77 self._port = int(os.environ.get('HEVOLVE_MINICPM_PORT', port or get_port('vision'))) 

78 

79 @property 

80 def name(self) -> str: 

81 return 'minicpm' 

82 

83 @property 

84 def requires_gpu(self) -> bool: 

85 return True 

86 

87 @property 

88 def ram_mb(self) -> int: 

89 return 4000 

90 

91 def is_available(self) -> bool: 

92 try: 

93 from .minicpm_installer import MiniCPMInstaller 

94 installer = MiniCPMInstaller() 

95 return installer.detect_gpu() 

96 except Exception: 

97 return False 

98 

99 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

100 import base64 

101 try: 

102 b64 = base64.b64encode(frame_bytes).decode('utf-8') 

103 resp = pooled_post( 

104 f'http://localhost:{self._port}/describe', 

105 json={ 

106 'image': b64, 

107 'prompt': prompt or 'Describe what you see in this image.', 

108 }, 

109 timeout=30, 

110 ) 

111 if resp.status_code == 200: 

112 return resp.json().get('description', '') 

113 except Exception as e: 

114 logger.debug(f"MiniCPM describe error: {e}") 

115 return None 

116 

117 

118class MobileVLMBackend(VisionBackend): 

119 """Lightweight VLM via ONNX Runtime — CPU-only, ~300MB RAM.""" 

120 

121 def __init__(self): 

122 self._session = None 

123 self._tokenizer = None 

124 

125 @property 

126 def name(self) -> str: 

127 return 'mobilevlm' 

128 

129 @property 

130 def requires_gpu(self) -> bool: 

131 return False 

132 

133 @property 

134 def ram_mb(self) -> int: 

135 return 300 

136 

137 def is_available(self) -> bool: 

138 try: 

139 import onnxruntime 

140 return True 

141 except ImportError: 

142 return False 

143 

144 def start(self) -> bool: 

145 try: 

146 import onnxruntime 

147 model_path = os.environ.get( 

148 'HEVOLVE_MOBILEVLM_MODEL', 

149 os.path.expanduser('~/.hevolve/models/mobilevlm/model.onnx'), 

150 ) 

151 if not os.path.exists(model_path): 

152 logger.warning(f"MobileVLM model not found at {model_path}") 

153 return False 

154 self._session = onnxruntime.InferenceSession(model_path) 

155 logger.info("MobileVLM ONNX backend loaded") 

156 return True 

157 except Exception as e: 

158 logger.error(f"MobileVLM start failed: {e}") 

159 return False 

160 

161 def stop(self): 

162 self._session = None 

163 

164 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

165 if not self._session: 

166 return None 

167 try: 

168 from PIL import Image 

169 import io 

170 import numpy as np 

171 

172 img = Image.open(io.BytesIO(frame_bytes)).convert('RGB').resize((224, 224))

173 arr = np.array(img).astype(np.float32) / 255.0 

174 if arr.ndim == 2: 

175 arr = np.stack([arr] * 3, axis=-1) 

176 arr = arr.transpose(2, 0, 1) # HWC → CHW 

177 arr = np.expand_dims(arr, 0) # Add batch dim 

178 

179 outputs = self._session.run(None, {'input': arr}) 

180 return str(outputs[0]) if outputs else None 

181 except Exception as e: 

182 logger.debug(f"MobileVLM describe error: {e}") 

183 return None 

184 

185 
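Note that describe() above feeds the session a 1x3x224x224 float array under the input name 'input'; whether that matches the downloaded model depends entirely on how the ONNX graph was exported. A quick sanity check against the same default path used by start() (a sketch, not part of the module):

# Sketch: inspect an ONNX model's input signature before trusting the
# hardcoded 'input' key and 224x224 shape assumed by MobileVLMBackend.
import os
import onnxruntime

model_path = os.path.expanduser('~/.hevolve/models/mobilevlm/model.onnx')
if os.path.exists(model_path):
    session = onnxruntime.InferenceSession(model_path)
    for node in session.get_inputs():
        print(node.name, node.shape, node.type)   # e.g. input [1, 3, 224, 224] tensor(float)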

186class CLIPBackend(VisionBackend): 

187 """CLIP ViT-B/16 — classification only, no free-form descriptions.""" 

188 

189 def __init__(self): 

190 self._model = None 

191 self._preprocess = None 

192 

193 @property 

194 def name(self) -> str: 

195 return 'clip' 

196 

197 @property 

198 def requires_gpu(self) -> bool: 

199 return False 

200 

201 @property 

202 def ram_mb(self) -> int: 

203 return 400 

204 

205 def _torch_functional(self) -> bool: 

206 """Check that torch is real (not a frozen build stub).""" 

207 try: 

208 import torch 

209 return not getattr(torch, '_is_stub', False) and hasattr(torch, 'Tensor') 

210 except (ImportError, AttributeError, OSError, RuntimeError): 

211 return False 

212 

213 def is_available(self) -> bool: 

214 if not self._torch_functional(): 

215 return False 

216 try: 

217 import clip 

218 return True 

219 except ImportError: 

220 pass 

221 try: 

222 import open_clip 

223 return True 

224 except ImportError: 

225 return False 

226 

227 def start(self) -> bool: 

228 if not self._torch_functional(): 

229 logger.warning("CLIP backend unavailable: torch not functional") 

230 return False 

231 try: 

232 import clip 

233 import torch 

234 device = 'cpu' 

235 self._model, self._preprocess = clip.load('ViT-B/16', device=device) 

236 logger.info("CLIP ViT-B/16 backend loaded (CPU)") 

237 return True 

238 except (ImportError, AttributeError, RuntimeError): 

239 pass 

240 try: 

241 import open_clip 

242 self._model, _, self._preprocess = open_clip.create_model_and_transforms( 

243 'ViT-B-16', pretrained='openai') 

244 logger.info("OpenCLIP ViT-B/16 backend loaded (CPU)") 

245 return True 

246 except Exception as e: 

247 logger.error(f"CLIP start failed: {e}") 

248 return False 

249 

250 def stop(self): 

251 self._model = None 

252 self._preprocess = None 

253 

254 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

255 """Classify frame against common scene labels. 

256 

257 CLIP can't generate free-form text — it compares image embeddings 

258 against text embeddings. We use a fixed set of scene labels. 

259 """ 

260 if not self._model: 

261 return None 

262 

263 try: 

264 from PIL import Image 

265 import io 

266 import torch 

267 

268 labels = [ 

269 'a person', 'a room', 'outdoors', 'a screen with text', 

270 'a document', 'a car', 'food', 'an animal', 

271 'a workspace', 'nature', 'a building', 'nothing interesting', 

272 ] 

273 

274 img = Image.open(io.BytesIO(frame_bytes)) 

275 image_input = self._preprocess(img).unsqueeze(0) 

            # Tokenize the candidate labels; the OpenAI CLIP package and
            # open_clip ship compatible tokenizers for the ViT-B/16 weights.
            try:
                import clip as _clip
                text_tokens = _clip.tokenize(labels)
            except ImportError:
                import open_clip
                text_tokens = open_clip.tokenize(labels)

            # Zero-shot classification: pick the label whose text embedding
            # is closest to the image embedding.
            with torch.no_grad():
                image_features = self._model.encode_image(image_input)
                text_features = self._model.encode_text(text_tokens)
                image_features /= image_features.norm(dim=-1, keepdim=True)
                text_features /= text_features.norm(dim=-1, keepdim=True)
                similarity = (image_features @ text_features.T).softmax(dim=-1)
                best = int(similarity.argmax(dim=-1).item())
            return f"Scene appears to contain: {labels[best]}"

286 except Exception as e: 

287 logger.debug(f"CLIP describe error: {e}") 

288 return None 

289 

290 

291class Qwen3VLVisionBackend(VisionBackend): 

292 """Qwen3-VL as vision description backend — replaces MiniCPM. 

293 

294 Uses the same Qwen3-VL server already running for Computer Use, 

295 so no additional process or VRAM is needed. 

296 """ 

297 

298 def __init__(self): 

299 self._backend = None 

300 

301 @property 

302 def name(self) -> str: 

303 return 'qwen3vl' 

304 

305 @property 

306 def requires_gpu(self) -> bool: 

307 return True 

308 

309 @property 

310 def ram_mb(self) -> int: 

311 return 4000 

312 

313 def is_available(self) -> bool: 

314 base_url = os.environ.get( 

315 'HEVOLVE_VLM_ENDPOINT_URL', 

316 os.environ.get('HEVOLVE_LLM_ENDPOINT_URL', '') 

317 ) 

318 if not base_url: 

319 return False 

320 try: 

321 resp = pooled_get( 

322 f'{base_url.rstrip("/")}/models', timeout=3 

323 ) 

324 return resp.status_code == 200 

325 except Exception: 

326 return False 

327 

328 def start(self) -> bool: 

329 try: 

330 from integrations.vlm.qwen3vl_backend import get_qwen3vl_backend 

331 self._backend = get_qwen3vl_backend() 

332 logger.info("Qwen3-VL vision backend initialized") 

333 return True 

334 except Exception as e: 

335 logger.error(f"Qwen3-VL vision backend start failed: {e}") 

336 return False 

337 

338 def stop(self): 

339 self._backend = None 

340 

341 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

342 if self._backend is None: 

343 try: 

344 from integrations.vlm.qwen3vl_backend import get_qwen3vl_backend 

345 self._backend = get_qwen3vl_backend() 

346 except Exception: 

347 return None 

348 try: 

349 import base64 

350 b64 = base64.b64encode(frame_bytes).decode('utf-8') 

351 return self._backend.describe_scene( 

352 b64, prompt or 'Describe what you see in this image.' 

353 ) 

354 except Exception as e: 

355 logger.debug(f"Qwen3-VL describe error: {e}") 

356 return None 

357 

358 
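is_available() above treats any OpenAI-compatible server that answers GET {base}/models as a usable Qwen3-VL endpoint. A minimal sketch of pointing the backend at such a server; the URL and frame path are illustrative values, not project defaults.

# Sketch: reuse an already-running OpenAI-compatible Qwen3-VL server for
# scene descriptions. URL and file path below are examples only.
import os

os.environ['HEVOLVE_VLM_ENDPOINT_URL'] = 'http://127.0.0.1:8080/v1'

backend = Qwen3VLVisionBackend()
if backend.is_available() and backend.start():
    with open('frame.jpg', 'rb') as f:
        print(backend.describe(f.read(), 'What is on screen?'))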

359class Qwen08BBackend(VisionBackend): 

360 """Qwen3.5-0.8B — fast continuous captioning (1s/frame). 

361 

362 Runs on a dedicated llama-server instance (port 8081 by default), 

363 separate from the 4B model used for computer use / action planning. 

364 

365 Purpose: always-on frame captioning → FrameStore activity table. 

366 NOT for computer use (use 4B Qwen3VLVisionBackend for that). 

367 

368 Model: Qwen3.5-0.8B-UD-Q4_K_XL.gguf (~558MB) + mmproj-F16.gguf (~195MB) 

369 Download: unsloth/Qwen3.5-0.8B-GGUF (model + mmproj) 

370 """ 

371 

372 @property 

373 def name(self) -> str: 

374 return 'qwen08b' 

375 

376 @property 

377 def requires_gpu(self) -> bool: 

378 return False # Runs fine on CPU too (0.8B is tiny) 

379 

380 @property 

381 def ram_mb(self) -> int: 

382 return 800 

383 

384 def is_available(self) -> bool: 

385 """True if the backend can answer — server running OR model files present. 

386 

387 get_vision_backend()'s fallback chain uses this to decide whether to 

388 select qwen08b (the preferred captioner) or skip down to the

389 MiniCPM fallback. The old strict "server must already be listening 

390 on port" check caused every boot to skip qwen08b and silently land 

391 on MiniCPM (4GB VRAM) because the lazy-start path hadn't launched 

392 the server yet. Returning True when model files exist lets the 

393 backend be selected at boot; describe() / start() preserve the 

394 original lazy-launch contract — we don't burn VRAM until a frame 

395 actually arrives. 

396 """ 

397 try: 

398 resp = pooled_get(f'http://127.0.0.1:{self._port}/health', timeout=2) 

399 if resp.status_code == 200: 

400 return True 

401 except Exception: 

402 pass 

403 home = os.path.expanduser('~') 

404 for d in [os.path.join(home, '.nunba', 'models'), 

405 os.path.join(home, '.trueflow', 'models')]: 

406 if os.path.isfile(os.path.join(d, 'Qwen3.5-0.8B-UD-Q4_K_XL.gguf')): 

407 return True 

408 return False 

409 

410 def start(self) -> bool: 

411 """Lazy: don't boot at VisionService.start(). describe() does the 

412 launch on the first frame so we don't burn VRAM when the user has 

413 no camera/screen stream active.""" 

414 if self.is_available(): 

415 logger.info(f"Qwen3.5-0.8B caption backend ready on port {self._port}") 

416 else: 

417 logger.info( 

418 "Qwen3.5-0.8B not running — will start on first frame") 

419 return True # Stay selected; lazy start in describe(). 

420 

421 # Find llama-server binary (reuse model_lifecycle's finder) 

422 

424 try: 

425 from integrations.service_tools.model_lifecycle import ModelLifecycleManager 

426 server = ModelLifecycleManager._find_llama_server_binary() 

427 except Exception: 

428 server = None 

429 if not server: 

430 logger.info("Qwen3.5-0.8B: llama-server binary not found — caption disabled") 

431 return False 

432 

433 # Find 0.8B model + mmproj (fixed filenames, known locations) 

434 home = os.path.expanduser('~') 

435 model = mmproj = None 

436 for d in [os.path.join(home, '.nunba', 'models'), 

437 os.path.join(home, '.trueflow', 'models')]: 

438 p = os.path.join(d, 'Qwen3.5-0.8B-UD-Q4_K_XL.gguf') 

439 if os.path.isfile(p) and not model: 

440 model = p 

441 p = os.path.join(d, 'qwen08b', 'mmproj-F16.gguf') 

442 if os.path.isfile(p) and not mmproj: 

443 mmproj = p 

444 

445 if not model or not mmproj: 

446 logger.info("Qwen3.5-0.8B: model files not found — run 'python scripts/setup_vlm.py'") 

447 return False 

448 

449 import subprocess, time 

450 cmd = [server, '--model', model, '--mmproj', mmproj, 

451 '--port', str(self._port), '--ctx-size', '512', 

452 '--n-gpu-layers', '99', '--threads', '4', '--flash-attn', 'on'] 

453 log_path = os.path.join(os.environ.get('TEMP', '/tmp'), f'llama_{self._port}.log') 

454 try: 

455 # APPEND mode — caption-server can crash + respawn; each 

456 # restart's truncation erased the previous crash evidence. 

457 # Root-cause class: truncate-on-restart log loss. 

458 _log_fh = open(log_path, 'a') 

459 try: 

460 import datetime as _lb_dt 

461 _log_fh.write( 

462 f"\n===== llama-caption (lightweight) session " 

463 f"{_lb_dt.datetime.now().isoformat()} port={self._port} =====\n" 

464 ) 

465 _log_fh.flush() 

466 except Exception: 

467 pass 

468 _kw = dict(stdout=_log_fh, stderr=subprocess.STDOUT) 

469 if os.name == 'nt': 

470 _kw['creationflags'] = subprocess.CREATE_NO_WINDOW 

471 subprocess.Popen(cmd, **_kw) 

472 for _ in range(30): 

473 time.sleep(1) 

474 if self.is_available(): 

475 logger.info(f"Qwen3.5-0.8B caption server started on port {self._port}") 

476 return True 

477 except Exception as e: 

478 logger.error(f"Qwen3.5-0.8B start failed: {e}") 

479 return False 

480 

481 # 0.8B optimal: 512x288 (11KB JPEG) — only needs scene understanding, not coords 

482 CAPTION_WIDTH = 512 

483 CAPTION_HEIGHT = 288 

484 IDLE_TIMEOUT_S = 300 # Unload after 5 min with no frames 

485 

486 def __init__(self, port: int = None): 

487 from core.port_registry import get_port 

488 self._port = port or get_port('vlm_caption') 

489 self._launch_attempted = False 

490 self._last_describe_time = 0.0 

491 self._server_proc = None # subprocess.Popen object (not just PID) 

492 

493 def _ensure_running(self) -> bool: 

494 """Lazy-start: launch 0.8B server on first frame, not at boot. 

495 

496 HARTOS emits 'vlm_caption.requested' event. In bundled mode, Nunba 

497 subscribes to this event and calls its own start_caption_server(). 

498 In standalone mode, HARTOS uses model_lifecycle to launch directly. 

499 

500 Dependency direction: Nunba → HARTOS (never HARTOS → Nunba). 

501 """ 

502 if self.is_available(): 

503 return True 

504 if self._launch_attempted: 

505 return False 

506 self._launch_attempted = True 

507 

508 # Emit event — Nunba subscribes in bundled mode and starts the server 

509 try: 

510 from core.platform.events import emit_event 

511 emit_event('vlm_caption.requested', {'port': self._port}) 

512 except Exception: 

513 pass 

514 

515 # Wait briefly — Nunba may start the server in response to the event 

516 import time 

517 for _ in range(5): 

518 time.sleep(1) 

519 if self.is_available(): 

520 logger.info(f"Qwen3.5-0.8B started (event-driven) on port {self._port}") 

521 return True 

522 

523 # Nobody started it — standalone mode, use model_lifecycle 

524 try: 

525 from integrations.service_tools.model_lifecycle import ModelLifecycleManager 

526 server = ModelLifecycleManager._find_llama_server_binary() 

527 if not server: 

528 logger.info("Qwen3.5-0.8B: llama-server not found") 

529 return False 

530 

531 home = os.path.expanduser('~') 

532 model = mmproj = None 

533 for d in [os.path.join(home, '.nunba', 'models'), 

534 os.path.join(home, '.trueflow', 'models')]: 

535 p = os.path.join(d, 'Qwen3.5-0.8B-UD-Q4_K_XL.gguf') 

536 if os.path.isfile(p) and not model: 

537 model = p 

538 p = os.path.join(d, 'qwen08b', 'mmproj-F16.gguf') 

539 if os.path.isfile(p) and not mmproj: 

540 mmproj = p 

541 if not model or not mmproj: 

542 logger.info("Qwen3.5-0.8B: model files not found") 

543 return False 

544 

545 import subprocess 

546 cmd = [server, '--model', model, '--mmproj', mmproj, 

547 '--port', str(self._port), '--ctx-size', '512', 

548 '--n-gpu-layers', '99', '--threads', '4', '--flash-attn', 'on'] 

549 log_path = os.path.join(os.environ.get('TEMP', '/tmp'), f'llama_{self._port}.log') 

550 # APPEND mode — same root-cause class as the caption-server 

551 # launch above. Preserves prior run's log across restarts. 

552 log_fh = open(log_path, 'a') 

553 try: 

554 import datetime as _lb_dt 

555 log_fh.write( 

556 f"\n===== llama-caption (standalone) session " 

557 f"{_lb_dt.datetime.now().isoformat()} port={self._port} =====\n" 

558 ) 

559 log_fh.flush() 

560 except Exception: 

561 pass 

562 _kw = dict(stdout=log_fh, stderr=subprocess.STDOUT) 

563 if os.name == 'nt': 

564 _kw['creationflags'] = subprocess.CREATE_NO_WINDOW 

565 self._server_proc = subprocess.Popen(cmd, **_kw) 

566 self._log_fh = log_fh 

567 logger.info(f"Qwen3.5-0.8B launching PID={self._server_proc.pid} port={self._port}") 

568 for _ in range(30): 

569 time.sleep(1) 

570 if self.is_available(): 

571 logger.info(f"Qwen3.5-0.8B ready on port {self._port}") 

572 return True 

573 except Exception as e: 

574 logger.error(f"Qwen3.5-0.8B standalone start failed: {e}") 

575 return False 

576 

577 def stop(self): 

578 """Stop the 0.8B server to free GPU memory. 

579 

580 Emits 'vlm_caption.stop' — Nunba subscribes and stops in bundled mode. 

581 Standalone: kills our own subprocess. 

582 """ 

583 try: 

584 from core.platform.events import emit_event 

585 emit_event('vlm_caption.stop', {'port': self._port}) 

586 except Exception: 

587 pass 

588 

589 # Standalone mode: we own the process 

590 if self._server_proc: 

591 try: 

592 self._server_proc.terminate() 

593 self._server_proc.wait(timeout=5) 

594 logger.info(f"Qwen3.5-0.8B stopped (PID={self._server_proc.pid})") 

595 except Exception: 

596 try: 

597 self._server_proc.kill() 

598 except Exception: 

599 pass 

600 self._server_proc = None 

601 if hasattr(self, '_log_fh') and self._log_fh: 

602 try: 

603 self._log_fh.close() 

604 except Exception: 

605 pass 

606 self._log_fh = None 

607 self._launch_attempted = False 

608 

609 def check_idle(self): 

610 """Called by VisionService's description_loop. Unloads if no frames for IDLE_TIMEOUT_S.""" 

611 import time 

612 if self._server_proc and self._last_describe_time > 0: 

613 idle = time.time() - self._last_describe_time 

614 if idle > self.IDLE_TIMEOUT_S: 

615 logger.info(f"Qwen3.5-0.8B idle for {idle:.0f}s — unloading to free GPU") 

616 self.stop() 

617 

618 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

619 import base64, time 

620 # Lazy-start on first frame 

621 if not self._ensure_running(): 

622 return None 

623 self._last_describe_time = time.time() 

624 try: 

625 # Resize to 512x288 for fast captioning (0.8B doesn't need full res) 

626 from PIL import Image 

627 import io 

628 img = Image.open(io.BytesIO(frame_bytes)) 

629 if img.width > self.CAPTION_WIDTH or img.height > self.CAPTION_HEIGHT: 

630 img = img.resize((self.CAPTION_WIDTH, self.CAPTION_HEIGHT), Image.LANCZOS) 

631 buf = io.BytesIO() 

632 img.save(buf, 'JPEG', quality=40) 

633 b64 = base64.b64encode(buf.getvalue()).decode('utf-8') 

634 

635 resp = pooled_post( 

636 f'http://127.0.0.1:{self._port}/v1/chat/completions', 

637 json={ 

638 'model': 'local', 

639 'max_tokens': 100, 

640 'temperature': 0.1, 

641 'messages': [{ 

642 'role': 'user', 

643 'content': [ 

644 {'type': 'text', 'text': prompt or 'Describe what you see in this screenshot in 2 sentences.'}, 

645 {'type': 'image_url', 'image_url': { 

646 'url': f'data:image/jpeg;base64,{b64}' 

647 }}, 

648 ] 

649 }] 

650 }, 

651 timeout=15, 

652 ) 

653 if resp.status_code == 200: 

654 return resp.json()['choices'][0]['message']['content'] 

655 except Exception as e: 

656 logger.debug(f"Qwen08B describe error: {e}") 

657 return None 

658 

659 

660class NoneBackend(VisionBackend): 

661 """No-op backend — FrameStore only, zero overhead.""" 

662 

663 @property 

664 def name(self) -> str: 

665 return 'none' 

666 

667 @property 

668 def requires_gpu(self) -> bool: 

669 return False 

670 

671 @property 

672 def ram_mb(self) -> int: 

673 return 0 

674 

675 def is_available(self) -> bool: 

676 return True 

677 

678 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

679 return None 

680 

681 

682# ─── Backend Registry ─── 

683 

684_BACKENDS = { 

685 'qwen08b': Qwen08BBackend, 

686 'qwen3vl': Qwen3VLVisionBackend, 

687 'minicpm': MiniCPMBackend, 

688 'mobilevlm': MobileVLMBackend, 

689 'clip': CLIPBackend, 

690 'none': NoneBackend, 

691} 

692 

693 

694def get_vision_backend(name: str = '') -> VisionBackend: 

695 """Get or auto-select a vision backend. 

696 

697 Priority (when name not specified): 

698 1. HEVOLVE_VISION_BACKEND env var 

699 2. ModelCatalog.select_best('vlm') — catalog is single source of truth 

700 for VRAM thresholds and tier gates 

701 3. Fallback: direct VRAM query (catalog unavailable) 

702 - 4GB+ VRAM → minicpm 

703 - 2GB+ RAM, no GPU → mobilevlm (if ONNX Runtime available) 

704 - 1GB+ RAM → clip (if clip/open_clip available) 

705 - <1GB → none 

706 """ 

707 backend_name = name or os.environ.get('HEVOLVE_VISION_BACKEND', '') 

708 

709 if backend_name: 

710 cls = _BACKENDS.get(backend_name, NoneBackend) 

711 return cls() 

712 

713 # Auto-detect — prefer Qwen3.5-0.8B for captioning (1s/frame, dedicated port) 

714 # This is separate from the 4B model used for computer use / action planning. 

715 qwen08b = Qwen08BBackend() 

716 if qwen08b.is_available(): 

717 return qwen08b 

718 

719 # Fallback: Qwen3-VL 4B (shares port with computer use agent) 

720 qwen3vl = Qwen3VLVisionBackend() 

721 if qwen3vl.is_available(): 

722 return qwen3vl 

723 

724 # ── Catalog-aware selection (single source of truth for VRAM thresholds) ─ 

725 try: 

726 from integrations.service_tools.model_orchestrator import get_orchestrator 

727 entry = get_orchestrator().select_best('vlm') 

728 if entry: 

729 # Map catalog ID → backend name → backend class 

730 _CATALOG_TO_BACKEND = { 

731 'vlm-qwen08b': 'qwen08b', 

732 'vlm-qwen3vl': 'qwen3vl', 

733 'vlm-minicpm-v2': 'minicpm', 

734 'vlm-mobilevlm': 'mobilevlm', 

735 'vlm-clip': 'clip', 

736 } 

737 backend_key = _CATALOG_TO_BACKEND.get(entry.id) 

738 if backend_key: 

739 cls = _BACKENDS.get(backend_key, NoneBackend) 

740 candidate = cls() 

741 if candidate.is_available(): 

742 return candidate 

743 except Exception: 

744 pass 

745 

746 # ── Fallback: direct VRAM / RAM query ──────────────────────────────────── 

747 try: 

748 from security.system_requirements import get_capabilities 

749 caps = get_capabilities() 

750 if caps: 

751 hw = caps.hardware 

752 if hw.gpu_vram_gb >= 4: 

753 return MiniCPMBackend() 

754 if hw.ram_gb >= 2: 

755 backend = MobileVLMBackend() 

756 if backend.is_available(): 

757 return backend 

758 backend = CLIPBackend() 

759 if backend.is_available(): 

760 return backend 

761 if hw.ram_gb >= 1: 

762 backend = CLIPBackend() 

763 if backend.is_available(): 

764 return backend 

765 except Exception: 

766 pass 

767 

768 # Last resort: try minicpm (original behavior) 

769 minicpm = MiniCPMBackend() 

770 if minicpm.is_available(): 

771 return minicpm 

772 

773 return NoneBackend() 

774 

775 
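Typical call sites only touch the factory: auto-selection follows the priority above, and a specific backend can be forced either with the name argument or the env var. A small usage sketch; the frame path is illustrative.

# Sketch: auto-select, then force a specific backend via the env var.
import os

backend = get_vision_backend()                   # auto-select per priority above
print(backend.name, backend.requires_gpu, backend.ram_mb)

os.environ['HEVOLVE_VISION_BACKEND'] = 'clip'    # force; 'none' disables descriptions
forced = get_vision_backend()
assert forced.name == 'clip'

if backend.start():
    with open('frame.jpg', 'rb') as f:           # illustrative path
        print(backend.describe(f.read()))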

776def list_available_backends(): 

777 """Return list of (name, available, ram_mb) for all backends.""" 

778 results = [] 

779 for name, cls in _BACKENDS.items(): 

780 backend = cls() 

781 results.append({ 

782 'name': name, 

783 'available': backend.is_available(), 

784 'requires_gpu': backend.requires_gpu, 

785 'ram_mb': backend.ram_mb, 

786 }) 

787 return results 

788 

789 
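For diagnostics, the helper above can drive a small availability report; actual values depend on the host (installed packages, model files, running servers). A sketch:

# Sketch: print one line per registered backend.
for info in list_available_backends():
    device = 'GPU' if info['requires_gpu'] else 'CPU'
    status = 'available' if info['available'] else 'unavailable'
    print(f"{info['name']:<10} {status:<12} {device} ~{info['ram_mb']} MB")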

790def populate_vlm_catalog(catalog) -> int: 

791 """Register all VLM backend variants into the ModelCatalog. 

792 

793 This is the single source of truth for VLM model names, VRAM thresholds, 

794 and capability tier gates — replacing hardcoded values in get_vision_backend(). 

795 

796 Called by ModelCatalog._populate_vlm_models() so the catalog stays 

797 consistent with what lightweight_backend actually supports. 

798 

799 Returns number of new entries added. 

800 """ 

801 from integrations.service_tools.model_catalog import ModelEntry, ModelType 

802 

803 vlm_models = [ 

804 # (id, name, vram_gb, ram_gb, disk_gb, quality, speed, min_tier, backend, 

805 # supports_gpu, supports_cpu, caps, tags) 

806 ( 

807 'vlm-qwen08b', 'Qwen3.5-0.8B (caption)', 

808 0.5, 0.8, 0.75, 0.70, 0.98, 'lite', 

809 'api', True, True, 

810 {'image_input': True, 'video_input': False, 'description_loop': True, 

811 'computer_use': False, 'continuous_captioning': True}, 

812 ['local', 'vision', 'caption', 'fast', 'cpu-friendly'], 

813 ), 

814 ( 

815 'vlm-qwen3vl', 'Qwen3-VL', 

816 4.0, 4.0, 8.0, 0.90, 0.70, 'full', 

817 'api', True, False, 

818 {'image_input': True, 'video_input': True, 'description_loop': True, 

819 'computer_use': True}, 

820 ['local', 'vision', 'qwen3vl'], 

821 ), 

822 ( 

823 'vlm-minicpm-v2', 'MiniCPM-V-2', 

824 4.0, 4.0, 4.0, 0.80, 0.70, 'full', 

825 'sidecar', True, False, 

826 {'image_input': True, 'video_input': False, 'description_loop': True, 

827 'computer_use': False}, 

828 ['local', 'vision'], 

829 ), 

830 ( 

831 'vlm-mobilevlm', 'MobileVLM-1.7B (ONNX)', 

832 0.0, 0.4, 0.5, 0.55, 0.92, 'lite', 

833 'onnx', False, True, 

834 {'image_input': True, 'video_input': False, 'description_loop': True, 

835 'computer_use': False}, 

836 ['local', 'vision', 'cpu-friendly', 'onnx'], 

837 ), 

838 ( 

839 'vlm-clip', 'CLIP ViT-B/16 (classification)', 

840 0.0, 0.5, 0.6, 0.45, 0.96, 'lite', 

841 'torch', False, True, 

842 {'image_input': True, 'video_input': False, 'description_loop': False, 

843 'classification_only': True, 'computer_use': False}, 

844 ['local', 'vision', 'cpu-friendly', 'classification'], 

845 ), 

846 ] 

847 

848 added = 0 

849 for (mid, name, vram, ram, disk, quality, speed, min_tier, 

850 backend, sup_gpu, sup_cpu, caps, tags) in vlm_models: 

851 if catalog.get(mid) is not None: 

852 continue 

853 entry = ModelEntry( 

854 id=mid, name=name, model_type=ModelType.VLM, 

855 source='huggingface', 

856 vram_gb=vram, ram_gb=ram, disk_gb=disk, 

857 min_capability_tier=min_tier, 

858 backend=backend, 

859 supports_gpu=sup_gpu, supports_cpu=sup_cpu, 

860 supports_cpu_offload=False, 

861 idle_timeout_s=900, 

862 capabilities=caps, 

863 quality_score=quality, speed_score=speed, 

864 tags=tags, 

865 ) 

866 catalog.register(entry, persist=False) 

867 added += 1 

868 return added
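populate_vlm_catalog() is normally invoked from ModelCatalog._populate_vlm_models(), but it can also be run directly against a catalog instance. A minimal sketch; the no-argument ModelCatalog() constructor is an assumption, the real signature may differ.

# Sketch: register the VLM entries into a catalog directly.
# ASSUMPTION: ModelCatalog() is constructible without arguments.
from integrations.service_tools.model_catalog import ModelCatalog

catalog = ModelCatalog()
added = populate_vlm_catalog(catalog)
print(f"{added} VLM entries registered")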