Coverage for integrations/vision/lightweight_backend.py: 55.9%

395 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Lightweight Vision Backend — CPU-only alternatives to MiniCPM for embedded devices. 

3 

Provides a unified interface for vision models across different hardware tiers:
 - qwen08b:   Qwen3.5-0.8B via llama-server, fast continuous captioning (CPU or GPU)
 - qwen3vl:   Qwen3-VL, reuses the Computer Use endpoint (GPU)
 - minicpm:   Full MiniCPM-V-2 (GPU, 4GB+ VRAM) — original default
 - mobilevlm: MobileVLM-1.7B via ONNX Runtime (~300MB RAM, CPU)
 - clip:      CLIP ViT-B/16, classification only (~400MB RAM, CPU)
 - none:      FrameStore only — no descriptions, zero overhead

9 

10Auto-selects backend by hardware tier unless HEVOLVE_VISION_BACKEND is set. 

11 

12Usage: 

13 backend = get_vision_backend() 

14 description = backend.describe(frame_bytes) 

15""" 

16import logging 

17import os 

18from abc import ABC, abstractmethod 

19from typing import Optional 

20 

21from core.http_pool import pooled_get, pooled_post 

22 

23logger = logging.getLogger('hevolve_vision') 

24 

25 

26class VisionBackend(ABC): 

27 """Abstract base for vision backends.""" 

28 

29 @property 

30 @abstractmethod 

31 def name(self) -> str: 

32 pass 

33 

34 @property 

35 @abstractmethod 

36 def requires_gpu(self) -> bool: 

37 pass 

38 

39 @property 

40 @abstractmethod 

41 def ram_mb(self) -> int: 

42 """Approximate RAM usage in MB.""" 

43 pass 

44 

45 @abstractmethod 

46 def is_available(self) -> bool: 

47 """Check if this backend can run on current hardware.""" 

48 pass 

49 

50 @abstractmethod 

51 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

52 """Generate a text description of the frame. 

53 

54 Args: 

55 frame_bytes: JPEG/PNG image bytes 

56 prompt: Optional prompt for the VLM (e.g. "What do you see?") 

57 

58 Returns: 

59 Text description, or None if the backend can't process it. 

60 """ 

61 pass 

62 

63 def start(self) -> bool: 

64 """Initialize the backend model. Returns True if ready.""" 

65 return True 

66 

67 def stop(self): 

68 """Release resources.""" 

69 pass 

70 

71 
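For orientation, the interface above is everything a backend needs to plug into the registry and the auto-selection logic. Below is a minimal, purely illustrative subclass (not part of the module) showing the required members.

# Illustrative only: the smallest possible VisionBackend implementation.
# A real backend would run a model in describe(); this one just echoes
# frame metadata so the required surface area is easy to see.
class _EchoBackend(VisionBackend):

    @property
    def name(self) -> str:
        return 'echo'

    @property
    def requires_gpu(self) -> bool:
        return False

    @property
    def ram_mb(self) -> int:
        return 1

    def is_available(self) -> bool:
        return True  # no external dependencies

    def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]:
        return f"frame of {len(frame_bytes)} bytes"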

72class MiniCPMBackend(VisionBackend): 

73 """Full MiniCPM-V-2 backend — existing sidecar subprocess.""" 

74 

75 def __init__(self, port: int = None): 

76 from core.port_registry import get_port 

77 self._port = int(os.environ.get('HEVOLVE_MINICPM_PORT', port or get_port('vision'))) 

78 

79 @property 

80 def name(self) -> str: 

81 return 'minicpm' 

82 

83 @property 

84 def requires_gpu(self) -> bool: 

85 return True 

86 

87 @property 

88 def ram_mb(self) -> int: 

89 return 4000 

90 

91 def is_available(self) -> bool: 

92 try: 

93 from .minicpm_installer import MiniCPMInstaller 

94 installer = MiniCPMInstaller() 

95 return installer.detect_gpu() 

96 except Exception: 

97 return False 

98 

99 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

100 import base64 

101 try: 

102 b64 = base64.b64encode(frame_bytes).decode('utf-8') 

103 resp = pooled_post( 

104 f'http://localhost:{self._port}/describe', 

105 json={ 

106 'image': b64, 

107 'prompt': prompt or 'Describe what you see in this image.', 

108 }, 

109 timeout=30, 

110 ) 

111 if resp.status_code == 200: 

112 return resp.json().get('description', '') 

113 except Exception as e: 

114 logger.debug(f"MiniCPM describe error: {e}") 

115 return None 

116 

117 

118class MobileVLMBackend(VisionBackend): 

119 """Lightweight VLM via ONNX Runtime — CPU-only, ~300MB RAM.""" 

120 

121 def __init__(self): 

122 self._session = None 

123 self._tokenizer = None 

124 

125 @property 

126 def name(self) -> str: 

127 return 'mobilevlm' 

128 

129 @property 

130 def requires_gpu(self) -> bool: 

131 return False 

132 

133 @property 

134 def ram_mb(self) -> int: 

135 return 300 

136 

137 def is_available(self) -> bool: 

138 try: 

139 import onnxruntime 

140 return True 

141 except ImportError: 

142 return False 

143 

144 def start(self) -> bool: 

145 try: 

146 import onnxruntime 

147 model_path = os.environ.get( 

148 'HEVOLVE_MOBILEVLM_MODEL', 

149 os.path.expanduser('~/.hevolve/models/mobilevlm/model.onnx'), 

150 ) 

151 if not os.path.exists(model_path): 

152 logger.warning(f"MobileVLM model not found at {model_path}") 

153 return False 

154 self._session = onnxruntime.InferenceSession(model_path) 

155 logger.info("MobileVLM ONNX backend loaded") 

156 return True 

157 except Exception as e: 

158 logger.error(f"MobileVLM start failed: {e}") 

159 return False 

160 

161 def stop(self): 

162 self._session = None 

163 

164 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

165 if not self._session: 

166 return None 

167 try: 

168 from PIL import Image 

169 import io 

170 import numpy as np 

171 

172 img = Image.open(io.BytesIO(frame_bytes)).convert('RGB').resize((224, 224))

173 arr = np.array(img).astype(np.float32) / 255.0 

174 if arr.ndim == 2: 

175 arr = np.stack([arr] * 3, axis=-1) 

176 arr = arr.transpose(2, 0, 1) # HWC → CHW 

177 arr = np.expand_dims(arr, 0) # Add batch dim 

178 

179 outputs = self._session.run(None, {'input': arr}) 

180 return str(outputs[0]) if outputs else None 

181 except Exception as e: 

182 logger.debug(f"MobileVLM describe error: {e}") 

183 return None 

184 

185 
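Note that describe() above feeds the session a 1x3x224x224 float array under the input name 'input'; whether that matches the downloaded model depends entirely on how the ONNX graph was exported. A quick sanity check against the same default path used by start() (a sketch, not part of the module):

# Sketch: inspect an ONNX model's input signature before trusting the
# hardcoded 'input' key and 224x224 shape assumed by MobileVLMBackend.
import os
import onnxruntime

model_path = os.path.expanduser('~/.hevolve/models/mobilevlm/model.onnx')
if os.path.exists(model_path):
    session = onnxruntime.InferenceSession(model_path)
    for node in session.get_inputs():
        print(node.name, node.shape, node.type)   # e.g. input [1, 3, 224, 224] tensor(float)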

186class CLIPBackend(VisionBackend): 

187 """CLIP ViT-B/16 — classification only, no free-form descriptions.""" 

188 

189 def __init__(self): 

190 self._model = None 

191 self._preprocess = None 

192 

193 @property 

194 def name(self) -> str: 

195 return 'clip' 

196 

197 @property 

198 def requires_gpu(self) -> bool: 

199 return False 

200 

201 @property 

202 def ram_mb(self) -> int: 

203 return 400 

204 

205 def _torch_functional(self) -> bool: 

206 """Check that torch is real (not a frozen build stub).""" 

207 try: 

208 import torch 

209 return not getattr(torch, '_is_stub', False) and hasattr(torch, 'Tensor') 

210 except (ImportError, AttributeError, OSError, RuntimeError): 

211 return False 

212 

213 def is_available(self) -> bool: 

214 if not self._torch_functional(): 

215 return False 

216 try: 

217 import clip 

218 return True 

219 except ImportError: 

220 pass 

221 try: 

222 import open_clip 

223 return True 

224 except ImportError: 

225 return False 

226 

227 def start(self) -> bool: 

228 if not self._torch_functional(): 

229 logger.warning("CLIP backend unavailable: torch not functional") 

230 return False 

231 try: 

232 import clip 

233 import torch 

234 device = 'cpu' 

235 self._model, self._preprocess = clip.load('ViT-B/16', device=device) 

236 logger.info("CLIP ViT-B/16 backend loaded (CPU)") 

237 return True 

238 except (ImportError, AttributeError, RuntimeError): 

239 pass 

240 try: 

241 import open_clip 

242 self._model, _, self._preprocess = open_clip.create_model_and_transforms( 

243 'ViT-B-16', pretrained='openai') 

244 logger.info("OpenCLIP ViT-B/16 backend loaded (CPU)") 

245 return True 

246 except Exception as e: 

247 logger.error(f"CLIP start failed: {e}") 

248 return False 

249 

250 def stop(self): 

251 self._model = None 

252 self._preprocess = None 

253 

254 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

255 """Classify frame against common scene labels. 

256 

257 CLIP can't generate free-form text — it compares image embeddings 

258 against text embeddings. We use a fixed set of scene labels. 

259 """ 

260 if not self._model: 

261 return None 

262 

263 try: 

264 from PIL import Image 

265 import io 

266 import torch 

267 

268 labels = [ 

269 'a person', 'a room', 'outdoors', 'a screen with text', 

270 'a document', 'a car', 'food', 'an animal', 

271 'a workspace', 'nature', 'a building', 'nothing interesting', 

272 ] 

273 

274 img = Image.open(io.BytesIO(frame_bytes)) 

275 image_input = self._preprocess(img).unsqueeze(0) 

            # Tokenize the candidate labels; the OpenAI CLIP package and
            # open_clip ship compatible tokenizers for the ViT-B/16 weights.
            try:
                import clip as _clip
                text_tokens = _clip.tokenize(labels)
            except ImportError:
                import open_clip
                text_tokens = open_clip.tokenize(labels)

            # Zero-shot classification: pick the label whose text embedding
            # is closest to the image embedding.
            with torch.no_grad():
                image_features = self._model.encode_image(image_input)
                text_features = self._model.encode_text(text_tokens)
                image_features /= image_features.norm(dim=-1, keepdim=True)
                text_features /= text_features.norm(dim=-1, keepdim=True)
                similarity = (image_features @ text_features.T).softmax(dim=-1)
                best = int(similarity.argmax(dim=-1).item())
            return f"Scene appears to contain: {labels[best]}"

286 except Exception as e: 

287 logger.debug(f"CLIP describe error: {e}") 

288 return None 

289 

290 

291class Qwen3VLVisionBackend(VisionBackend): 

292 """Qwen3-VL as vision description backend — replaces MiniCPM. 

293 

294 Uses the same Qwen3-VL server already running for Computer Use, 

295 so no additional process or VRAM is needed. 

296 """ 

297 

298 def __init__(self): 

299 self._backend = None 

300 

301 @property 

302 def name(self) -> str: 

303 return 'qwen3vl' 

304 

305 @property 

306 def requires_gpu(self) -> bool: 

307 return True 

308 

309 @property 

310 def ram_mb(self) -> int: 

311 return 4000 

312 

313 def is_available(self) -> bool: 

314 base_url = os.environ.get( 

315 'HEVOLVE_VLM_ENDPOINT_URL', 

316 os.environ.get('HEVOLVE_LLM_ENDPOINT_URL', '') 

317 ) 

318 if not base_url: 

319 return False 

320 try: 

321 resp = pooled_get( 

322 f'{base_url.rstrip("/")}/models', timeout=3 

323 ) 

324 return resp.status_code == 200 

325 except Exception: 

326 return False 

327 

328 def start(self) -> bool: 

329 try: 

330 from integrations.vlm.qwen3vl_backend import get_qwen3vl_backend 

331 self._backend = get_qwen3vl_backend() 

332 logger.info("Qwen3-VL vision backend initialized") 

333 return True 

334 except Exception as e: 

335 logger.error(f"Qwen3-VL vision backend start failed: {e}") 

336 return False 

337 

338 def stop(self): 

339 self._backend = None 

340 

341 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

342 if self._backend is None: 

343 try: 

344 from integrations.vlm.qwen3vl_backend import get_qwen3vl_backend 

345 self._backend = get_qwen3vl_backend() 

346 except Exception: 

347 return None 

348 try: 

349 import base64 

350 b64 = base64.b64encode(frame_bytes).decode('utf-8') 

351 return self._backend.describe_scene( 

352 b64, prompt or 'Describe what you see in this image.' 

353 ) 

354 except Exception as e: 

355 logger.debug(f"Qwen3-VL describe error: {e}") 

356 return None 

357 

358 
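is_available() above treats any OpenAI-compatible server that answers GET {base}/models as a usable Qwen3-VL endpoint. A minimal sketch of pointing the backend at such a server; the URL and frame path are illustrative values, not project defaults.

# Sketch: reuse an already-running OpenAI-compatible Qwen3-VL server for
# scene descriptions. URL and file path below are examples only.
import os

os.environ['HEVOLVE_VLM_ENDPOINT_URL'] = 'http://127.0.0.1:8080/v1'

backend = Qwen3VLVisionBackend()
if backend.is_available() and backend.start():
    with open('frame.jpg', 'rb') as f:
        print(backend.describe(f.read(), 'What is on screen?'))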

359class Qwen08BBackend(VisionBackend): 

360 """Qwen3.5-0.8B — fast continuous captioning (1s/frame). 

361 

362 Runs on a dedicated llama-server instance (port 8081 by default), 

363 separate from the 4B model used for computer use / action planning. 

364 

365 Purpose: always-on frame captioning → FrameStore activity table. 

366 NOT for computer use (use 4B Qwen3VLVisionBackend for that). 

367 

368 Model: Qwen3.5-0.8B-UD-Q4_K_XL.gguf (~558MB) + mmproj-F16.gguf (~195MB) 

369 Download: unsloth/Qwen3.5-0.8B-GGUF (model + mmproj) 

370 """ 

371 

372 @property 

373 def name(self) -> str: 

374 return 'qwen08b' 

375 

376 @property 

377 def requires_gpu(self) -> bool: 

378 return False # Runs fine on CPU too (0.8B is tiny) 

379 

380 @property 

381 def ram_mb(self) -> int: 

382 return 800 

383 

384 def is_available(self) -> bool: 

385 """True if the backend can answer — server running OR model files present. 

386 

387 get_vision_backend()'s fallback chain uses this to decide whether to 

388 select qwen08b (the preferred captioner) or skip down to the

389 MiniCPM fallback. The old strict "server must already be listening 

390 on port" check caused every boot to skip qwen08b and silently land 

391 on MiniCPM (4GB VRAM) because the lazy-start path hadn't launched 

392 the server yet. Returning True when model files exist lets the 

393 backend be selected at boot; describe() / start() preserve the 

394 original lazy-launch contract — we don't burn VRAM until a frame 

395 actually arrives. 

396 """ 

397 try: 

398 resp = pooled_get(f'http://127.0.0.1:{self._port}/health', timeout=2) 

399 if resp.status_code == 200: 

400 return True 

401 except Exception: 

402 pass 

403 home = os.path.expanduser('~') 

404 for d in [os.path.join(home, '.nunba', 'models'), 

405 os.path.join(home, '.trueflow', 'models')]: 

406 if os.path.isfile(os.path.join(d, 'Qwen3.5-0.8B-UD-Q4_K_XL.gguf')): 

407 return True 

408 return False 

409 

410 def start(self) -> bool: 

411 """Lazy: don't boot at VisionService.start(). describe() does the 

412 launch on the first frame so we don't burn VRAM when the user has 

413 no camera/screen stream active.""" 

414 if self.is_available(): 

415 logger.info(f"Qwen3.5-0.8B caption backend ready on port {self._port}") 

416 else: 

417 logger.info( 

418 "Qwen3.5-0.8B not running — will start on first frame") 

419 return True # Stay selected; lazy start in describe(). 

420 

421 # Find llama-server binary (reuse model_lifecycle's finder) 

422 

424 try: 

425 from integrations.service_tools.model_lifecycle import ModelLifecycleManager 

426 server = ModelLifecycleManager._find_llama_server_binary() 

427 except Exception: 

428 server = None 

429 if not server: 

430 logger.info("Qwen3.5-0.8B: llama-server binary not found — caption disabled") 

431 return False 

432 

433 # Find 0.8B model + mmproj (fixed filenames, known locations) 

434 home = os.path.expanduser('~') 

435 model = mmproj = None 

436 for d in [os.path.join(home, '.nunba', 'models'), 

437 os.path.join(home, '.trueflow', 'models')]: 

438 p = os.path.join(d, 'Qwen3.5-0.8B-UD-Q4_K_XL.gguf') 

439 if os.path.isfile(p) and not model: 

440 model = p 

441 p = os.path.join(d, 'qwen08b', 'mmproj-F16.gguf') 

442 if os.path.isfile(p) and not mmproj: 

443 mmproj = p 

444 

445 if not model or not mmproj: 

446 logger.info("Qwen3.5-0.8B: model files not found — run 'python scripts/setup_vlm.py'") 

447 return False 

448 

449 import subprocess, time 

450 cmd = [server, '--model', model, '--mmproj', mmproj, 

451 '--port', str(self._port), '--ctx-size', '512', 

452 '--n-gpu-layers', '99', '--threads', '4', '--flash-attn', 'on'] 

453 log_path = os.path.join(os.environ.get('TEMP', '/tmp'), f'llama_{self._port}.log') 

454 try: 

455 # APPEND mode — caption-server can crash + respawn; each 

456 # restart's truncation erased the previous crash evidence. 

457 # Root-cause class: truncate-on-restart log loss. 

458 _log_fh = open(log_path, 'a') 

459 try: 

460 import datetime as _lb_dt 

461 _log_fh.write( 

462 f"\n===== llama-caption (lightweight) session " 

463 f"{_lb_dt.datetime.now().isoformat()} port={self._port} =====\n" 

464 ) 

465 _log_fh.flush() 

466 except Exception: 

467 pass 

468 _kw = dict(stdout=_log_fh, stderr=subprocess.STDOUT) 

469 if os.name == 'nt': 

470 _kw['creationflags'] = subprocess.CREATE_NO_WINDOW 

471 subprocess.Popen(cmd, **_kw) 

472 for _ in range(30): 

473 time.sleep(1) 

474 if self.is_available(): 

475 logger.info(f"Qwen3.5-0.8B caption server started on port {self._port}") 

476 return True 

477 except Exception as e: 

478 logger.error(f"Qwen3.5-0.8B start failed: {e}") 

479 return False 

480 

481 # 0.8B optimal: 512x288 (11KB JPEG) — only needs scene understanding, not coords 

482 CAPTION_WIDTH = 512 

483 CAPTION_HEIGHT = 288 

484 IDLE_TIMEOUT_S = 300 # Unload after 5 min with no frames 

485 

486 def __init__(self, port: int = None): 

487 from core.port_registry import get_port 

488 self._port = port or get_port('vlm_caption') 

489 self._launch_attempted = False 

490 self._last_describe_time = 0.0 

491 self._server_proc = None # subprocess.Popen object (not just PID) 

492 

493 def _ensure_running(self) -> bool: 

494 """Lazy-start: launch 0.8B server on first frame, not at boot. 

495 

496 HARTOS emits 'vlm_caption.requested' event. In bundled mode, Nunba 

497 subscribes to this event and calls its own start_caption_server(). 

498 In standalone mode, HARTOS uses model_lifecycle to launch directly. 

499 

500 Dependency direction: Nunba → HARTOS (never HARTOS → Nunba). 

501 """ 

502 if self.is_available(): 

503 return True 

504 if self._launch_attempted: 

505 return False 

506 self._launch_attempted = True 

507 

508 # Emit event — Nunba subscribes in bundled mode and starts the server 

509 try: 

510 from core.platform.events import emit_event 

511 emit_event('vlm_caption.requested', {'port': self._port}) 

512 except Exception: 

513 pass 

514 

515 # Wait briefly — Nunba may start the server in response to the event 

516 import time 

517 for _ in range(5): 

518 time.sleep(1) 

519 if self.is_available(): 

520 logger.info(f"Qwen3.5-0.8B started (event-driven) on port {self._port}") 

521 return True 

522 

523 # Nobody started it — standalone mode, use model_lifecycle 

524 try: 

525 from integrations.service_tools.model_lifecycle import ModelLifecycleManager 

526 server = ModelLifecycleManager._find_llama_server_binary() 

527 if not server: 

528 logger.info("Qwen3.5-0.8B: llama-server not found") 

529 return False 

530 

531 home = os.path.expanduser('~') 

532 model = mmproj = None 

533 for d in [os.path.join(home, '.nunba', 'models'), 

534 os.path.join(home, '.trueflow', 'models')]: 

535 p = os.path.join(d, 'Qwen3.5-0.8B-UD-Q4_K_XL.gguf') 

536 if os.path.isfile(p) and not model: 

537 model = p 

538 p = os.path.join(d, 'qwen08b', 'mmproj-F16.gguf') 

539 if os.path.isfile(p) and not mmproj: 

540 mmproj = p 

541 if not model or not mmproj: 

542 logger.info("Qwen3.5-0.8B: model files not found") 

543 return False 

544 

545 import subprocess 

546 cmd = [server, '--model', model, '--mmproj', mmproj, 

547 '--port', str(self._port), '--ctx-size', '512', 

548 '--n-gpu-layers', '99', '--threads', '4', '--flash-attn', 'on'] 

549 log_path = os.path.join(os.environ.get('TEMP', '/tmp'), f'llama_{self._port}.log') 

550 # APPEND mode — same root-cause class as the caption-server 

551 # launch above. Preserves prior run's log across restarts. 

552 log_fh = open(log_path, 'a') 

553 try: 

554 import datetime as _lb_dt 

555 log_fh.write( 

556 f"\n===== llama-caption (standalone) session " 

557 f"{_lb_dt.datetime.now().isoformat()} port={self._port} =====\n" 

558 ) 

559 log_fh.flush() 

560 except Exception: 

561 pass 

562 _kw = dict(stdout=log_fh, stderr=subprocess.STDOUT) 

563 if os.name == 'nt': 

564 _kw['creationflags'] = subprocess.CREATE_NO_WINDOW 

565 self._server_proc = subprocess.Popen(cmd, **_kw) 

566 self._log_fh = log_fh 

567 logger.info(f"Qwen3.5-0.8B launching PID={self._server_proc.pid} port={self._port}") 

568 for _ in range(30): 

569 time.sleep(1) 

570 if self.is_available(): 

571 logger.info(f"Qwen3.5-0.8B ready on port {self._port}") 

572 return True 

573 except Exception as e: 

574 logger.error(f"Qwen3.5-0.8B standalone start failed: {e}") 

575 return False 

576 

577 def stop(self): 

578 """Stop the 0.8B server to free GPU memory. 

579 

580 Emits 'vlm_caption.stop' — Nunba subscribes and stops in bundled mode. 

581 Standalone: kills our own subprocess. 

582 """ 

583 try: 

584 from core.platform.events import emit_event 

585 emit_event('vlm_caption.stop', {'port': self._port}) 

586 except Exception: 

587 pass 

588 

589 # Standalone mode: we own the process 

590 if self._server_proc: 

591 try: 

592 self._server_proc.terminate() 

593 self._server_proc.wait(timeout=5) 

594 logger.info(f"Qwen3.5-0.8B stopped (PID={self._server_proc.pid})") 

595 except Exception: 

596 try: 

597 self._server_proc.kill() 

598 except Exception: 

599 pass 

600 self._server_proc = None 

601 if hasattr(self, '_log_fh') and self._log_fh: 

602 try: 

603 self._log_fh.close() 

604 except Exception: 

605 pass 

606 self._log_fh = None 

607 self._launch_attempted = False 

608 

609 def check_idle(self): 

610 """Called by VisionService's description_loop. Unloads if no frames for IDLE_TIMEOUT_S.""" 

611 import time 

612 if self._server_proc and self._last_describe_time > 0: 

613 idle = time.time() - self._last_describe_time 

614 if idle > self.IDLE_TIMEOUT_S: 

615 logger.info(f"Qwen3.5-0.8B idle for {idle:.0f}s — unloading to free GPU") 

616 self.stop() 

617 

618 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

619 import base64, time 

620 # Lazy-start on first frame 

621 if not self._ensure_running(): 

622 return None 

623 self._last_describe_time = time.time() 

624 try: 

625 # Resize to 512x288 for fast captioning (0.8B doesn't need full res) 

626 from PIL import Image 

627 import io 

628 img = Image.open(io.BytesIO(frame_bytes)) 

629 if img.width > self.CAPTION_WIDTH or img.height > self.CAPTION_HEIGHT: 

630 img = img.resize((self.CAPTION_WIDTH, self.CAPTION_HEIGHT), Image.LANCZOS) 

631 buf = io.BytesIO() 

632 img.save(buf, 'JPEG', quality=40) 

633 b64 = base64.b64encode(buf.getvalue()).decode('utf-8') 

634 

635 resp = pooled_post( 

636 f'http://127.0.0.1:{self._port}/v1/chat/completions', 

637 json={ 

638 'model': 'local', 

639 'max_tokens': 100, 

640 'temperature': 0.1, 

641 'messages': [{ 

642 'role': 'user', 

643 'content': [ 

644 {'type': 'text', 'text': prompt or 'Describe what you see in this screenshot in 2 sentences.'}, 

645 {'type': 'image_url', 'image_url': { 

646 'url': f'data:image/jpeg;base64,{b64}' 

647 }}, 

648 ] 

649 }] 

650 }, 

651 timeout=15, 

652 ) 

653 if resp.status_code == 200: 

654 return resp.json()['choices'][0]['message']['content'] 

655 except Exception as e: 

656 logger.debug(f"Qwen08B describe error: {e}") 

657 return None 

658 

659 

660class NoneBackend(VisionBackend): 

661 """No-op backend — FrameStore only, zero overhead.""" 

662 

663 @property 

664 def name(self) -> str: 

665 return 'none' 

666 

667 @property 

668 def requires_gpu(self) -> bool: 

669 return False 

670 

671 @property 

672 def ram_mb(self) -> int: 

673 return 0 

674 

675 def is_available(self) -> bool: 

676 return True 

677 

678 def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]: 

679 return None 

680 

681 

682# ─── Backend Registry ─── 

683 

684_BACKENDS = { 

685 'qwen08b': Qwen08BBackend, 

686 'qwen3vl': Qwen3VLVisionBackend, 

687 'minicpm': MiniCPMBackend, 

688 'mobilevlm': MobileVLMBackend, 

689 'clip': CLIPBackend, 

690 'none': NoneBackend, 

691} 

692 

693 

694def get_vision_backend(name: str = '') -> VisionBackend: 

695 """Get or auto-select a vision backend. 

696 

697 Priority (when name not specified): 

698 1. HEVOLVE_VISION_BACKEND env var 

699 2. ModelCatalog.select_best('vlm') — catalog is single source of truth 

700 for VRAM thresholds and tier gates 

701 3. Fallback: direct VRAM query (catalog unavailable) 

702 - 4GB+ VRAM → minicpm 

703 - 2GB+ RAM, no GPU → mobilevlm (if ONNX Runtime available) 

704 - 1GB+ RAM → clip (if clip/open_clip available) 

705 - <1GB → none 

706 """ 

707 backend_name = name or os.environ.get('HEVOLVE_VISION_BACKEND', '') 

708 

709 if backend_name: 

710 cls = _BACKENDS.get(backend_name, NoneBackend) 

711 return cls() 

712 

713 # Auto-detect — prefer Qwen3.5-0.8B for captioning (1s/frame, dedicated port) 

714 # This is separate from the 4B model used for computer use / action planning. 

715 qwen08b = Qwen08BBackend() 

716 if qwen08b.is_available(): 

717 return qwen08b 

718 

719 # Fallback: Qwen3-VL 4B (shares port with computer use agent) 

720 qwen3vl = Qwen3VLVisionBackend() 

721 if qwen3vl.is_available(): 

722 return qwen3vl 

723 

724 # ── Catalog-aware selection (single source of truth for VRAM thresholds) ─ 

725 try: 

726 from integrations.service_tools.model_orchestrator import get_orchestrator 

727 entry = get_orchestrator().select_best('vlm') 

728 if entry: 

729 # Map catalog ID → backend name → backend class 

730 _CATALOG_TO_BACKEND = { 

731 'vlm-qwen08b': 'qwen08b', 

732 'vlm-qwen3vl': 'qwen3vl', 

733 'vlm-minicpm-v2': 'minicpm', 

734 'vlm-mobilevlm': 'mobilevlm', 

735 'vlm-clip': 'clip', 

736 } 

737 backend_key = _CATALOG_TO_BACKEND.get(entry.id) 

738 if backend_key: 

739 cls = _BACKENDS.get(backend_key, NoneBackend) 

740 candidate = cls() 

741 if candidate.is_available(): 

742 return candidate 

743 except Exception: 

744 pass 

745 

746 # ── Fallback: direct VRAM / RAM query ──────────────────────────────────── 

747 try: 

748 from security.system_requirements import get_capabilities 

749 caps = get_capabilities() 

750 if caps: 

751 hw = caps.hardware 

752 if hw.gpu_vram_gb >= 4: 

753 return MiniCPMBackend() 

754 if hw.ram_gb >= 2: 

755 backend = MobileVLMBackend() 

756 if backend.is_available(): 

757 return backend 

758 backend = CLIPBackend() 

759 if backend.is_available(): 

760 return backend 

761 if hw.ram_gb >= 1: 

762 backend = CLIPBackend() 

763 if backend.is_available(): 

764 return backend 

765 except Exception: 

766 pass 

767 

768 # Last resort: try minicpm (original behavior) 

769 minicpm = MiniCPMBackend() 

770 if minicpm.is_available(): 

771 return minicpm 

772 

773 return NoneBackend() 

774 

775 
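Typical call sites only touch the factory: auto-selection follows the priority above, and a specific backend can be forced either with the name argument or the env var. A small usage sketch; the frame path is illustrative.

# Sketch: auto-select, then force a specific backend via the env var.
import os

backend = get_vision_backend()                   # auto-select per priority above
print(backend.name, backend.requires_gpu, backend.ram_mb)

os.environ['HEVOLVE_VISION_BACKEND'] = 'clip'    # force; 'none' disables descriptions
forced = get_vision_backend()
assert forced.name == 'clip'

if backend.start():
    with open('frame.jpg', 'rb') as f:           # illustrative path
        print(backend.describe(f.read()))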

776def list_available_backends(): 

777 """Return list of (name, available, ram_mb) for all backends.""" 

778 results = [] 

779 for name, cls in _BACKENDS.items(): 

780 backend = cls() 

781 results.append({ 

782 'name': name, 

783 'available': backend.is_available(), 

784 'requires_gpu': backend.requires_gpu, 

785 'ram_mb': backend.ram_mb, 

786 }) 

787 return results 

788 

789 
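For diagnostics, the helper above can drive a small availability report; actual values depend on the host (installed packages, model files, running servers). A sketch:

# Sketch: print one line per registered backend.
for info in list_available_backends():
    device = 'GPU' if info['requires_gpu'] else 'CPU'
    status = 'available' if info['available'] else 'unavailable'
    print(f"{info['name']:<10} {status:<12} {device} ~{info['ram_mb']} MB")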

790def populate_vlm_catalog(catalog) -> int: 

791 """Register all VLM backend variants into the ModelCatalog. 

792 

793 This is the single source of truth for VLM model names, VRAM thresholds, 

794 and capability tier gates — replacing hardcoded values in get_vision_backend(). 

795 

796 Called by ModelCatalog._populate_vlm_models() so the catalog stays 

797 consistent with what lightweight_backend actually supports. 

798 

799 Returns number of new entries added. 

800 """ 

801 from integrations.service_tools.model_catalog import ModelEntry, ModelType 

802 

803 vlm_models = [ 

804 # (id, name, vram_gb, ram_gb, disk_gb, quality, speed, min_tier, backend, 

805 # supports_gpu, supports_cpu, caps, tags) 

806 ( 

807 'vlm-qwen08b', 'Qwen3.5-0.8B (caption)', 

808 0.5, 0.8, 0.75, 0.70, 0.98, 'lite', 

809 'api', True, True, 

810 {'image_input': True, 'video_input': False, 'description_loop': True, 

811 'computer_use': False, 'continuous_captioning': True}, 

812 ['local', 'vision', 'caption', 'fast', 'cpu-friendly'], 

813 ), 

814 ( 

815 'vlm-qwen3vl', 'Qwen3-VL', 

816 4.0, 4.0, 8.0, 0.90, 0.70, 'full', 

817 'api', True, False, 

818 {'image_input': True, 'video_input': True, 'description_loop': True, 

819 'computer_use': True}, 

820 ['local', 'vision', 'qwen3vl'], 

821 ), 

822 ( 

823 'vlm-minicpm-v2', 'MiniCPM-V-2', 

824 4.0, 4.0, 4.0, 0.80, 0.70, 'full', 

825 'sidecar', True, False, 

826 {'image_input': True, 'video_input': False, 'description_loop': True, 

827 'computer_use': False}, 

828 ['local', 'vision'], 

829 ), 

830 ( 

831 'vlm-mobilevlm', 'MobileVLM-1.7B (ONNX)', 

832 0.0, 0.4, 0.5, 0.55, 0.92, 'lite', 

833 'onnx', False, True, 

834 {'image_input': True, 'video_input': False, 'description_loop': True, 

835 'computer_use': False}, 

836 ['local', 'vision', 'cpu-friendly', 'onnx'], 

837 ), 

838 ( 

839 'vlm-clip', 'CLIP ViT-B/16 (classification)', 

840 0.0, 0.5, 0.6, 0.45, 0.96, 'lite', 

841 'torch', False, True, 

842 {'image_input': True, 'video_input': False, 'description_loop': False, 

843 'classification_only': True, 'computer_use': False}, 

844 ['local', 'vision', 'cpu-friendly', 'classification'], 

845 ), 

846 ] 

847 

848 added = 0 

849 for (mid, name, vram, ram, disk, quality, speed, min_tier, 

850 backend, sup_gpu, sup_cpu, caps, tags) in vlm_models: 

851 if catalog.get(mid) is not None: 

852 continue 

853 entry = ModelEntry( 

854 id=mid, name=name, model_type=ModelType.VLM, 

855 source='huggingface', 

856 vram_gb=vram, ram_gb=ram, disk_gb=disk, 

857 min_capability_tier=min_tier, 

858 backend=backend, 

859 supports_gpu=sup_gpu, supports_cpu=sup_cpu, 

860 supports_cpu_offload=False, 

861 idle_timeout_s=900, 

862 capabilities=caps, 

863 quality_score=quality, speed_score=speed, 

864 tags=tags, 

865 ) 

866 catalog.register(entry, persist=False) 

867 added += 1 

868 return added
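populate_vlm_catalog() is normally invoked from ModelCatalog._populate_vlm_models(), but it can also be run directly against a catalog instance. A minimal sketch; the no-argument ModelCatalog() constructor is an assumption, the real signature may differ.

# Sketch: register the VLM entries into a catalog directly.
# ASSUMPTION: ModelCatalog() is constructible without arguments.
from integrations.service_tools.model_catalog import ModelCatalog

catalog = ModelCatalog()
added = populate_vlm_catalog(catalog)
print(f"{added} VLM entries registered")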