Coverage for integrations/vision/lightweight_backend.py: 55.9%
395 statements
coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Lightweight Vision Backend — CPU-only alternatives to MiniCPM for embedded devices.
4Provides a unified interface for vision models across different hardware tiers:
5 - minicpm: Full MiniCPM-V-2 (GPU, 4GB+ VRAM) — existing default
6 - mobilevlm: MobileVLM-1.7B via ONNX Runtime (~300MB RAM, CPU)
7 - clip: CLIP ViT-B/16 classification only (~400MB RAM, CPU)
8 - none: FrameStore only — no descriptions, zero overhead
10Auto-selects backend by hardware tier unless HEVOLVE_VISION_BACKEND is set.
12Usage:
13 backend = get_vision_backend()
14 description = backend.describe(frame_bytes)
15"""
import logging
import os
from abc import ABC, abstractmethod
from typing import Optional

from core.http_pool import pooled_get, pooled_post

logger = logging.getLogger('hevolve_vision')


class VisionBackend(ABC):
    """Abstract base for vision backends."""

    @property
    @abstractmethod
    def name(self) -> str:
        pass

    @property
    @abstractmethod
    def requires_gpu(self) -> bool:
        pass

    @property
    @abstractmethod
    def ram_mb(self) -> int:
        """Approximate RAM usage in MB."""
        pass

    @abstractmethod
    def is_available(self) -> bool:
        """Check if this backend can run on current hardware."""
        pass

    @abstractmethod
    def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]:
        """Generate a text description of the frame.

        Args:
            frame_bytes: JPEG/PNG image bytes
            prompt: Optional prompt for the VLM (e.g. "What do you see?")

        Returns:
            Text description, or None if the backend can't process it.
        """
        pass

    def start(self) -> bool:
        """Initialize the backend model. Returns True if ready."""
        return True

    def stop(self):
        """Release resources."""
        pass
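
# A minimal lifecycle sketch for any concrete backend (illustrative;
# `jpeg_bytes` is assumed to come from the capture pipeline, and
# get_vision_backend() is defined near the bottom of this module):
#
#     backend = get_vision_backend()
#     if backend.start():
#         caption = backend.describe(jpeg_bytes, prompt='What do you see?')
#     backend.stop()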


class MiniCPMBackend(VisionBackend):
    """Full MiniCPM-V-2 backend — existing sidecar subprocess."""

    def __init__(self, port: int = None):
        from core.port_registry import get_port
        self._port = int(os.environ.get('HEVOLVE_MINICPM_PORT', port or get_port('vision')))

    @property
    def name(self) -> str:
        return 'minicpm'

    @property
    def requires_gpu(self) -> bool:
        return True

    @property
    def ram_mb(self) -> int:
        return 4000

    def is_available(self) -> bool:
        try:
            from .minicpm_installer import MiniCPMInstaller
            installer = MiniCPMInstaller()
            return installer.detect_gpu()
        except Exception:
            return False

    def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]:
        import base64
        try:
            b64 = base64.b64encode(frame_bytes).decode('utf-8')
            resp = pooled_post(
                f'http://localhost:{self._port}/describe',
                json={
                    'image': b64,
                    'prompt': prompt or 'Describe what you see in this image.',
                },
                timeout=30,
            )
            if resp.status_code == 200:
                return resp.json().get('description', '')
        except Exception as e:
            logger.debug(f"MiniCPM describe error: {e}")
        return None


class MobileVLMBackend(VisionBackend):
    """Lightweight VLM via ONNX Runtime — CPU-only, ~300MB RAM."""

    def __init__(self):
        self._session = None
        self._tokenizer = None

    @property
    def name(self) -> str:
        return 'mobilevlm'

    @property
    def requires_gpu(self) -> bool:
        return False

    @property
    def ram_mb(self) -> int:
        return 300

    def is_available(self) -> bool:
        try:
            import onnxruntime
            return True
        except ImportError:
            return False

    def start(self) -> bool:
        try:
            import onnxruntime
            model_path = os.environ.get(
                'HEVOLVE_MOBILEVLM_MODEL',
                os.path.expanduser('~/.hevolve/models/mobilevlm/model.onnx'),
            )
            if not os.path.exists(model_path):
                logger.warning(f"MobileVLM model not found at {model_path}")
                return False
            self._session = onnxruntime.InferenceSession(model_path)
            logger.info("MobileVLM ONNX backend loaded")
            return True
        except Exception as e:
            logger.error(f"MobileVLM start failed: {e}")
            return False

    def stop(self):
        self._session = None

    def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]:
        if not self._session:
            return None
        try:
            from PIL import Image
            import io
            import numpy as np

            # Force 3-channel RGB so grayscale/RGBA frames don't break the graph.
            img = Image.open(io.BytesIO(frame_bytes)).convert('RGB').resize((224, 224))
            arr = np.array(img).astype(np.float32) / 255.0
            arr = arr.transpose(2, 0, 1)   # HWC → CHW
            arr = np.expand_dims(arr, 0)   # Add batch dim

            # Use the graph's declared input name rather than assuming 'input'.
            input_name = self._session.get_inputs()[0].name
            outputs = self._session.run(None, {input_name: arr})
            # Output decoding depends on how the model was exported; the raw
            # first output is stringified as a placeholder.
            return str(outputs[0]) if outputs else None
        except Exception as e:
            logger.debug(f"MobileVLM describe error: {e}")
            return None


class CLIPBackend(VisionBackend):
    """CLIP ViT-B/16 — classification only, no free-form descriptions."""

    def __init__(self):
        self._model = None
        self._preprocess = None

    @property
    def name(self) -> str:
        return 'clip'

    @property
    def requires_gpu(self) -> bool:
        return False

    @property
    def ram_mb(self) -> int:
        return 400

    def _torch_functional(self) -> bool:
        """Check that torch is real (not a frozen build stub)."""
        try:
            import torch
            return not getattr(torch, '_is_stub', False) and hasattr(torch, 'Tensor')
        except (ImportError, AttributeError, OSError, RuntimeError):
            return False

    def is_available(self) -> bool:
        if not self._torch_functional():
            return False
        try:
            import clip
            return True
        except ImportError:
            pass
        try:
            import open_clip
            return True
        except ImportError:
            return False

    def start(self) -> bool:
        if not self._torch_functional():
            logger.warning("CLIP backend unavailable: torch not functional")
            return False
        try:
            import clip
            device = 'cpu'
            self._model, self._preprocess = clip.load('ViT-B/16', device=device)
            logger.info("CLIP ViT-B/16 backend loaded (CPU)")
            return True
        except (ImportError, AttributeError, RuntimeError):
            pass
        try:
            import open_clip
            self._model, _, self._preprocess = open_clip.create_model_and_transforms(
                'ViT-B-16', pretrained='openai')
            logger.info("OpenCLIP ViT-B/16 backend loaded (CPU)")
            return True
        except Exception as e:
            logger.error(f"CLIP start failed: {e}")
            return False

    def stop(self):
        self._model = None
        self._preprocess = None

    def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]:
        """Classify frame against common scene labels.

        CLIP can't generate free-form text — it compares image embeddings
        against text embeddings. We use a fixed set of scene labels.
        """
        if not self._model:
            return None

        try:
            from PIL import Image
            import io
            import torch

            labels = [
                'a person', 'a room', 'outdoors', 'a screen with text',
                'a document', 'a car', 'food', 'an animal',
                'a workspace', 'nature', 'a building', 'nothing interesting',
            ]

            img = Image.open(io.BytesIO(frame_bytes))
            image_input = self._preprocess(img).unsqueeze(0)

            # Tokenize the label set with whichever CLIP package is installed.
            try:
                import clip
                text_tokens = clip.tokenize(labels)
            except ImportError:
                import open_clip
                text_tokens = open_clip.tokenize(labels)

            # Rank labels by cosine similarity and return the best match.
            with torch.no_grad():
                image_features = self._model.encode_image(image_input)
                text_features = self._model.encode_text(text_tokens)
                image_features /= image_features.norm(dim=-1, keepdim=True)
                text_features /= text_features.norm(dim=-1, keepdim=True)
                best = int((image_features @ text_features.T).argmax())
            return f"Scene appears to contain: {labels[best]}"
        except Exception as e:
            logger.debug(f"CLIP describe error: {e}")
            return None
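
# With the fixed label set above, the output is one of the labels
# (illustrative; `jpeg_bytes` is assumed to come from the capture pipeline):
#
#     backend = CLIPBackend()
#     if backend.start():
#         print(backend.describe(jpeg_bytes))
#         # e.g. "Scene appears to contain: a workspace"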


class Qwen3VLVisionBackend(VisionBackend):
    """Qwen3-VL as vision description backend — replaces MiniCPM.

    Uses the same Qwen3-VL server already running for Computer Use,
    so no additional process or VRAM is needed.
    """

    def __init__(self):
        self._backend = None

    @property
    def name(self) -> str:
        return 'qwen3vl'

    @property
    def requires_gpu(self) -> bool:
        return True

    @property
    def ram_mb(self) -> int:
        return 4000

    def is_available(self) -> bool:
        base_url = os.environ.get(
            'HEVOLVE_VLM_ENDPOINT_URL',
            os.environ.get('HEVOLVE_LLM_ENDPOINT_URL', '')
        )
        if not base_url:
            return False
        try:
            resp = pooled_get(
                f'{base_url.rstrip("/")}/models', timeout=3
            )
            return resp.status_code == 200
        except Exception:
            return False

    def start(self) -> bool:
        try:
            from integrations.vlm.qwen3vl_backend import get_qwen3vl_backend
            self._backend = get_qwen3vl_backend()
            logger.info("Qwen3-VL vision backend initialized")
            return True
        except Exception as e:
            logger.error(f"Qwen3-VL vision backend start failed: {e}")
            return False

    def stop(self):
        self._backend = None

    def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]:
        if self._backend is None:
            try:
                from integrations.vlm.qwen3vl_backend import get_qwen3vl_backend
                self._backend = get_qwen3vl_backend()
            except Exception:
                return None
        try:
            import base64
            b64 = base64.b64encode(frame_bytes).decode('utf-8')
            return self._backend.describe_scene(
                b64, prompt or 'Describe what you see in this image.'
            )
        except Exception as e:
            logger.debug(f"Qwen3-VL describe error: {e}")
            return None


class Qwen08BBackend(VisionBackend):
    """Qwen3.5-0.8B — fast continuous captioning (1s/frame).

    Runs on a dedicated llama-server instance (port 8081 by default),
    separate from the 4B model used for computer use / action planning.

    Purpose: always-on frame captioning → FrameStore activity table.
    NOT for computer use (use the 4B Qwen3VLVisionBackend for that).

    Model: Qwen3.5-0.8B-UD-Q4_K_XL.gguf (~558MB) + mmproj-F16.gguf (~195MB)
    Download: unsloth/Qwen3.5-0.8B-GGUF (model + mmproj)
    """

    # 0.8B optimal: 512x288 (11KB JPEG) — only needs scene understanding, not coords
    CAPTION_WIDTH = 512
    CAPTION_HEIGHT = 288
    IDLE_TIMEOUT_S = 300  # Unload after 5 min with no frames

    def __init__(self, port: int = None):
        from core.port_registry import get_port
        self._port = port or get_port('vlm_caption')
        self._launch_attempted = False
        self._last_describe_time = 0.0
        self._server_proc = None  # subprocess.Popen object (not just PID)
        self._log_fh = None       # log file handle owned in standalone mode

    @property
    def name(self) -> str:
        return 'qwen08b'

    @property
    def requires_gpu(self) -> bool:
        return False  # Runs fine on CPU too (0.8B is tiny)

    @property
    def ram_mb(self) -> int:
        return 800

    def is_available(self) -> bool:
        """True if the backend can answer — server running OR model files present.

        get_vision_backend()'s fallback chain uses this to decide whether to
        SELECT qwen08b (the preferred captioner) or skip down to the
        MiniCPM fallback. The old strict "server must already be listening
        on port" check caused every boot to skip qwen08b and silently land
        on MiniCPM (4GB VRAM) because the lazy-start path hadn't launched
        the server yet. Returning True when model files exist lets the
        backend be selected at boot; describe() / start() preserve the
        original lazy-launch contract — we don't burn VRAM until a frame
        actually arrives.
        """
        try:
            resp = pooled_get(f'http://127.0.0.1:{self._port}/health', timeout=2)
            if resp.status_code == 200:
                return True
        except Exception:
            pass
        home = os.path.expanduser('~')
        for d in [os.path.join(home, '.nunba', 'models'),
                  os.path.join(home, '.trueflow', 'models')]:
            if os.path.isfile(os.path.join(d, 'Qwen3.5-0.8B-UD-Q4_K_XL.gguf')):
                return True
        return False

    def start(self) -> bool:
        """Lazy: don't boot at VisionService.start(). describe() does the
        launch on the first frame so we don't burn VRAM when the user has
        no camera/screen stream active."""
        if self.is_available():
            logger.info(f"Qwen3.5-0.8B caption backend ready on port {self._port}")
        else:
            logger.info(
                "Qwen3.5-0.8B not running — will start on first frame")
        return True  # Stay selected; lazy start in describe().

    def _ensure_running(self) -> bool:
        """Lazy-start: launch 0.8B server on first frame, not at boot.

        HARTOS emits 'vlm_caption.requested' event. In bundled mode, Nunba
        subscribes to this event and calls its own start_caption_server().
        In standalone mode, HARTOS uses model_lifecycle to launch directly.

        Dependency direction: Nunba → HARTOS (never HARTOS → Nunba).
        """
        if self.is_available():
            return True
        if self._launch_attempted:
            return False
        self._launch_attempted = True

        # Emit event — Nunba subscribes in bundled mode and starts the server
        try:
            from core.platform.events import emit_event
            emit_event('vlm_caption.requested', {'port': self._port})
        except Exception:
            pass

        # Wait briefly — Nunba may start the server in response to the event
        import time
        for _ in range(5):
            time.sleep(1)
            if self.is_available():
                logger.info(f"Qwen3.5-0.8B started (event-driven) on port {self._port}")
                return True

        # Nobody started it — standalone mode, use model_lifecycle
        try:
            from integrations.service_tools.model_lifecycle import ModelLifecycleManager
            server = ModelLifecycleManager._find_llama_server_binary()
            if not server:
                logger.info("Qwen3.5-0.8B: llama-server not found")
                return False

            # Find 0.8B model + mmproj (fixed filenames, known locations)
            home = os.path.expanduser('~')
            model = mmproj = None
            for d in [os.path.join(home, '.nunba', 'models'),
                      os.path.join(home, '.trueflow', 'models')]:
                p = os.path.join(d, 'Qwen3.5-0.8B-UD-Q4_K_XL.gguf')
                if os.path.isfile(p) and not model:
                    model = p
                p = os.path.join(d, 'qwen08b', 'mmproj-F16.gguf')
                if os.path.isfile(p) and not mmproj:
                    mmproj = p
            if not model or not mmproj:
                logger.info("Qwen3.5-0.8B: model files not found — run 'python scripts/setup_vlm.py'")
                return False

            import subprocess
            cmd = [server, '--model', model, '--mmproj', mmproj,
                   '--port', str(self._port), '--ctx-size', '512',
                   '--n-gpu-layers', '99', '--threads', '4', '--flash-attn', 'on']
            log_path = os.path.join(os.environ.get('TEMP', '/tmp'), f'llama_{self._port}.log')
            # APPEND mode — the caption server can crash + respawn; truncating
            # on each restart erased the previous crash evidence. Preserves
            # the prior run's log across restarts.
            log_fh = open(log_path, 'a')
            try:
                import datetime as _lb_dt
                log_fh.write(
                    f"\n===== llama-caption (standalone) session "
                    f"{_lb_dt.datetime.now().isoformat()} port={self._port} =====\n"
                )
                log_fh.flush()
            except Exception:
                pass
            _kw = dict(stdout=log_fh, stderr=subprocess.STDOUT)
            if os.name == 'nt':
                _kw['creationflags'] = subprocess.CREATE_NO_WINDOW
            self._server_proc = subprocess.Popen(cmd, **_kw)
            self._log_fh = log_fh
            logger.info(f"Qwen3.5-0.8B launching PID={self._server_proc.pid} port={self._port}")
            for _ in range(30):
                time.sleep(1)
                if self.is_available():
                    logger.info(f"Qwen3.5-0.8B ready on port {self._port}")
                    return True
            logger.warning(f"Qwen3.5-0.8B not ready after 30s on port {self._port}")
            return False
        except Exception as e:
            logger.error(f"Qwen3.5-0.8B standalone start failed: {e}")
            return False

    def stop(self):
        """Stop the 0.8B server to free GPU memory.

        Emits 'vlm_caption.stop' — Nunba subscribes and stops in bundled mode.
        Standalone: kills our own subprocess.
        """
        try:
            from core.platform.events import emit_event
            emit_event('vlm_caption.stop', {'port': self._port})
        except Exception:
            pass

        # Standalone mode: we own the process
        if self._server_proc:
            try:
                self._server_proc.terminate()
                self._server_proc.wait(timeout=5)
                logger.info(f"Qwen3.5-0.8B stopped (PID={self._server_proc.pid})")
            except Exception:
                try:
                    self._server_proc.kill()
                except Exception:
                    pass
            self._server_proc = None
        if self._log_fh:
            try:
                self._log_fh.close()
            except Exception:
                pass
            self._log_fh = None
        self._launch_attempted = False

    def check_idle(self):
        """Called by VisionService's description_loop. Unloads if no frames for IDLE_TIMEOUT_S."""
        import time
        if self._server_proc and self._last_describe_time > 0:
            idle = time.time() - self._last_describe_time
            if idle > self.IDLE_TIMEOUT_S:
                logger.info(f"Qwen3.5-0.8B idle for {idle:.0f}s — unloading to free GPU")
                self.stop()
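
    # Illustrative caller, assuming VisionService's description loop holds a
    # reference to this backend (that loop lives outside this module):
    #
    #     if isinstance(backend, Qwen08BBackend):
    #         backend.check_idle()  # unloads after IDLE_TIMEOUT_S without frames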

    def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]:
        import base64, time
        # Lazy-start on first frame
        if not self._ensure_running():
            return None
        self._last_describe_time = time.time()
        try:
            # Resize to 512x288 for fast captioning (0.8B doesn't need full res)
            from PIL import Image
            import io
            # JPEG re-encode requires RGB (screenshots are often RGBA PNGs)
            img = Image.open(io.BytesIO(frame_bytes)).convert('RGB')
            if img.width > self.CAPTION_WIDTH or img.height > self.CAPTION_HEIGHT:
                img = img.resize((self.CAPTION_WIDTH, self.CAPTION_HEIGHT), Image.LANCZOS)
            buf = io.BytesIO()
            img.save(buf, 'JPEG', quality=40)
            b64 = base64.b64encode(buf.getvalue()).decode('utf-8')

            resp = pooled_post(
                f'http://127.0.0.1:{self._port}/v1/chat/completions',
                json={
                    'model': 'local',
                    'max_tokens': 100,
                    'temperature': 0.1,
                    'messages': [{
                        'role': 'user',
                        'content': [
                            {'type': 'text', 'text': prompt or 'Describe what you see in this screenshot in 2 sentences.'},
                            {'type': 'image_url', 'image_url': {
                                'url': f'data:image/jpeg;base64,{b64}'
                            }},
                        ]
                    }]
                },
                timeout=15,
            )
            if resp.status_code == 200:
                return resp.json()['choices'][0]['message']['content']
        except Exception as e:
            logger.debug(f"Qwen08B describe error: {e}")
        return None


class NoneBackend(VisionBackend):
    """No-op backend — FrameStore only, zero overhead."""

    @property
    def name(self) -> str:
        return 'none'

    @property
    def requires_gpu(self) -> bool:
        return False

    @property
    def ram_mb(self) -> int:
        return 0

    def is_available(self) -> bool:
        return True

    def describe(self, frame_bytes: bytes, prompt: str = '') -> Optional[str]:
        return None


# ─── Backend Registry ───

_BACKENDS = {
    'qwen08b': Qwen08BBackend,
    'qwen3vl': Qwen3VLVisionBackend,
    'minicpm': MiniCPMBackend,
    'mobilevlm': MobileVLMBackend,
    'clip': CLIPBackend,
    'none': NoneBackend,
}


def get_vision_backend(name: str = '') -> VisionBackend:
    """Get or auto-select a vision backend.

    Priority (when name not specified):
      1. HEVOLVE_VISION_BACKEND env var
      2. Qwen3.5-0.8B captioner (dedicated port), then Qwen3-VL
         (shares the Computer Use server)
      3. ModelCatalog.select_best('vlm') — catalog is single source of truth
         for VRAM thresholds and tier gates
      4. Fallback: direct VRAM query (catalog unavailable)
         - 4GB+ VRAM → minicpm
         - 2GB+ RAM, no GPU → mobilevlm (if ONNX Runtime available)
         - 1GB+ RAM → clip (if clip/open_clip available)
         - <1GB → none
    """
    backend_name = name or os.environ.get('HEVOLVE_VISION_BACKEND', '')

    if backend_name:
        cls = _BACKENDS.get(backend_name)
        if cls is None:
            logger.warning(f"Unknown vision backend '{backend_name}'; falling back to 'none'")
            cls = NoneBackend
        return cls()

    # Auto-detect — prefer Qwen3.5-0.8B for captioning (1s/frame, dedicated port)
    # This is separate from the 4B model used for computer use / action planning.
    qwen08b = Qwen08BBackend()
    if qwen08b.is_available():
        return qwen08b

    # Fallback: Qwen3-VL 4B (shares port with computer use agent)
    qwen3vl = Qwen3VLVisionBackend()
    if qwen3vl.is_available():
        return qwen3vl

    # ── Catalog-aware selection (single source of truth for VRAM thresholds) ──
    try:
        from integrations.service_tools.model_orchestrator import get_orchestrator
        entry = get_orchestrator().select_best('vlm')
        if entry:
            # Map catalog ID → backend name → backend class
            _CATALOG_TO_BACKEND = {
                'vlm-qwen08b': 'qwen08b',
                'vlm-qwen3vl': 'qwen3vl',
                'vlm-minicpm-v2': 'minicpm',
                'vlm-mobilevlm': 'mobilevlm',
                'vlm-clip': 'clip',
            }
            backend_key = _CATALOG_TO_BACKEND.get(entry.id)
            if backend_key:
                cls = _BACKENDS.get(backend_key, NoneBackend)
                candidate = cls()
                if candidate.is_available():
                    return candidate
    except Exception:
        pass

    # ── Fallback: direct VRAM / RAM query ────────────────────────────────────
    try:
        from security.system_requirements import get_capabilities
        caps = get_capabilities()
        if caps:
            hw = caps.hardware
            if hw.gpu_vram_gb >= 4:
                return MiniCPMBackend()
            if hw.ram_gb >= 2:
                backend = MobileVLMBackend()
                if backend.is_available():
                    return backend
                backend = CLIPBackend()
                if backend.is_available():
                    return backend
            if hw.ram_gb >= 1:
                backend = CLIPBackend()
                if backend.is_available():
                    return backend
    except Exception:
        pass

    # Last resort: try minicpm (original behavior)
    minicpm = MiniCPMBackend()
    if minicpm.is_available():
        return minicpm

    return NoneBackend()
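
# Selection can be forced per-process; both routes below are illustrative:
#
#     os.environ['HEVOLVE_VISION_BACKEND'] = 'clip'  # before auto-selection
#     backend = get_vision_backend()                 # → CLIPBackend
#
#     backend = get_vision_backend('none')           # explicit no-op backend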


def list_available_backends():
    """Return a list of dicts (name, available, requires_gpu, ram_mb) for all backends."""
    results = []
    for name, cls in _BACKENDS.items():
        backend = cls()
        results.append({
            'name': name,
            'available': backend.is_available(),
            'requires_gpu': backend.requires_gpu,
            'ram_mb': backend.ram_mb,
        })
    return results
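
# Example shape of the returned list (availability values depend on the host):
#
#     [{'name': 'qwen08b', 'available': True, 'requires_gpu': False, 'ram_mb': 800},
#      ...,
#      {'name': 'none', 'available': True, 'requires_gpu': False, 'ram_mb': 0}]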


def populate_vlm_catalog(catalog) -> int:
    """Register all VLM backend variants into the ModelCatalog.

    This is the single source of truth for VLM model names, VRAM thresholds,
    and capability tier gates — replacing hardcoded values in get_vision_backend().

    Called by ModelCatalog._populate_vlm_models() so the catalog stays
    consistent with what lightweight_backend actually supports.

    Returns number of new entries added.
    """
    from integrations.service_tools.model_catalog import ModelEntry, ModelType

    vlm_models = [
        # (id, name, vram_gb, ram_gb, disk_gb, quality, speed, min_tier, backend,
        #  supports_gpu, supports_cpu, caps, tags)
        (
            'vlm-qwen08b', 'Qwen3.5-0.8B (caption)',
            0.5, 0.8, 0.75, 0.70, 0.98, 'lite',
            'api', True, True,
            {'image_input': True, 'video_input': False, 'description_loop': True,
             'computer_use': False, 'continuous_captioning': True},
            ['local', 'vision', 'caption', 'fast', 'cpu-friendly'],
        ),
        (
            'vlm-qwen3vl', 'Qwen3-VL',
            4.0, 4.0, 8.0, 0.90, 0.70, 'full',
            'api', True, False,
            {'image_input': True, 'video_input': True, 'description_loop': True,
             'computer_use': True},
            ['local', 'vision', 'qwen3vl'],
        ),
        (
            'vlm-minicpm-v2', 'MiniCPM-V-2',
            4.0, 4.0, 4.0, 0.80, 0.70, 'full',
            'sidecar', True, False,
            {'image_input': True, 'video_input': False, 'description_loop': True,
             'computer_use': False},
            ['local', 'vision'],
        ),
        (
            'vlm-mobilevlm', 'MobileVLM-1.7B (ONNX)',
            0.0, 0.4, 0.5, 0.55, 0.92, 'lite',
            'onnx', False, True,
            {'image_input': True, 'video_input': False, 'description_loop': True,
             'computer_use': False},
            ['local', 'vision', 'cpu-friendly', 'onnx'],
        ),
        (
            'vlm-clip', 'CLIP ViT-B/16 (classification)',
            0.0, 0.5, 0.6, 0.45, 0.96, 'lite',
            'torch', False, True,
            {'image_input': True, 'video_input': False, 'description_loop': False,
             'classification_only': True, 'computer_use': False},
            ['local', 'vision', 'cpu-friendly', 'classification'],
        ),
    ]

    added = 0
    for (mid, name, vram, ram, disk, quality, speed, min_tier,
         backend, sup_gpu, sup_cpu, caps, tags) in vlm_models:
        if catalog.get(mid) is not None:
            continue
        entry = ModelEntry(
            id=mid, name=name, model_type=ModelType.VLM,
            source='huggingface',
            vram_gb=vram, ram_gb=ram, disk_gb=disk,
            min_capability_tier=min_tier,
            backend=backend,
            supports_gpu=sup_gpu, supports_cpu=sup_cpu,
            supports_cpu_offload=False,
            idle_timeout_s=900,
            capabilities=caps,
            quality_score=quality, speed_score=speed,
            tags=tags,
        )
        catalog.register(entry, persist=False)
        added += 1
    return added
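
# Hedged wiring sketch, assuming ModelCatalog can be constructed with no
# arguments and exposes the get()/register() methods used above (not verified
# here):
#
#     from integrations.service_tools.model_catalog import ModelCatalog
#     catalog = ModelCatalog()
#     added = populate_vlm_catalog(catalog)
#     logger.info(f"Registered {added} VLM catalog entries")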