Coverage for core / gpu_tier.py: 0.0%
22 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2core.gpu_tier — single source of truth for GPU capability tiers.
4WHY THIS EXISTS
5───────────────
6The 24 / 10 / 4 GB tier thresholds were duplicated in TWO places:
7 - main.py `/backend/health` (Python, server-side classification).
8 - landing-page/src/components/chat/GpuTierBadge.jsx (JavaScript,
9 client-side label generation).
11When commit 2acf21a raised the speculative-decoding gate from 8 GB to
1210 GB, the backend was updated but the frontend label still said "10GB+"
13because it was hard-coded — they happened to match by accident. Two
14months later when product wanted to test 12 GB as the new break-point,
15ONE side would have shipped and the other wouldn't, producing tier
16labels that disagreed with the server's actual behaviour.
18This module is the canonical table. The frontend now FETCHES it via
19`GET /api/v1/system/tiers` so the labels and thresholds shipped to the
20user are always derived from the same source the backend classifier
21uses. Future threshold tweaks happen here ONCE.
23WHO CONSUMES IT
24───────────────
25- main.py /backend/health → classify(vram_gb, cuda_available)
26- main.py /api/v1/system/tiers → tier_table() (frontend bootstrap)
27- landing-page/src/components/chat/GpuTierBadge.jsx → fetches /tiers
29THRESHOLD RATIONALE
30───────────────────
31ULTRA ≥ 24 GB → 70B-class model viable (Llama-70B Q4 ≈ 38GB; with
32 offload + KV-cache budget, 24GB is the practical floor
33 where ULTRA tier actually unlocks behaviour the lower
34 tiers can't access).
35FULL ≥ 10 GB → main + draft speculative decoding fits with 4GB TTS
36 headroom (raised from 8GB in commit 2acf21a).
37STANDARD 4-10 GB → main-only (no speculation; ~1.3-2.0s slower per reply
38 on chat).
39NONE < 4 GB / no CUDA → CPU fallback.
40"""
from __future__ import annotations

from enum import Enum
from typing import Dict, List
class GpuTier(str, Enum):
    """GPU capability tier.

    Inherits from ``str`` so the enum serializes transparently to JSON
    and the wire format is exactly the Python value.
    """

    ULTRA = "ultra"
    FULL = "full"
    STANDARD = "standard"
    NONE = "none"
# Canonical threshold table: tier → MINIMUM total VRAM (GB) required to
# qualify. `classify` scans this mapping top-to-bottom and takes the
# first tier whose floor is met, so entries MUST remain in descending
# threshold order.
TIER_THRESHOLDS: Dict[GpuTier, float] = {
    GpuTier.ULTRA: 24.0,
    GpuTier.FULL: 10.0,
    GpuTier.STANDARD: 4.0,
    GpuTier.NONE: 0.0,
}
# Operator-readable copy for each tier. The frontend consumes these
# verbatim (via /api/v1/system/tiers), so the label + tooltip text in
# GpuTierBadge.jsx can never drift from what the backend documents.
# Keep descriptions under ~120 chars: the chip tooltip wraps, while the
# badge itself renders only the short form.
TIER_DESCRIPTIONS: Dict[GpuTier, Dict[str, str]] = {
    tier: {'label': label, 'short': short, 'description': description}
    for tier, (label, short, description) in {
        GpuTier.ULTRA: (
            'Ultra GPU',
            'Ultra',
            '24GB+ VRAM. 70B-class models viable with speculative '
            'decoding + full TTS headroom.',
        ),
        GpuTier.FULL: (
            'Full GPU',
            'Full',
            '10GB+ VRAM. Draft + main speculative decoding active. '
            'Replies ~40% faster than Standard.',
        ),
        GpuTier.STANDARD: (
            'Standard GPU',
            'Standard',
            'Heavy model only. Upgrade to 10GB+ VRAM for ~40% faster '
            'replies (speculative decoding unlocks at 10GB to leave room for voice).',
        ),
        GpuTier.NONE: (
            'CPU',
            'CPU',
            'No CUDA GPU detected (or under 4GB VRAM). Chat runs on CPU. '
            'A 10GB+ NVIDIA GPU unlocks speculative decoding.',
        ),
    }.items()
}
def classify(vram_gb: float, cuda_available: bool) -> GpuTier:
    """Map a VRAM size and CUDA flag onto the canonical GPU tier.

    A machine without CUDA is always NONE, no matter how large vram_gb
    is — an integrated GPU can report plenty of "shared" RAM yet have
    no CUDA path.

    Args:
        vram_gb: TOTAL VRAM in gigabytes (not free), as read from
            `vram_manager.get_total_vram()`.
        cuda_available: True iff `torch.cuda.is_available()` returned
            True at probe time (via vram_manager.detect_gpu).

    Returns:
        The most-capable tier whose minimum threshold is ≤ vram_gb, or
        NONE when CUDA is absent / vram_gb is below the STANDARD floor.
    """
    if not cuda_available:
        return GpuTier.NONE
    # TIER_THRESHOLDS is declared in descending order, so the first tier
    # whose floor is met is the most capable match. NONE is kept out of
    # the ladder: a CUDA-capable box under the STANDARD floor falls
    # through to the default below instead.
    matches = (
        tier
        for tier, floor in TIER_THRESHOLDS.items()
        if tier is not GpuTier.NONE and vram_gb >= floor
    )
    return next(matches, GpuTier.NONE)
def tier_table() -> List[Dict[str, object]]:
    """Return the canonical tier table for /api/v1/system/tiers.

    Shaped as a flat array of dicts so the React frontend renders it
    directly without re-keying.
    """
    rows: List[Dict[str, object]] = []
    for tier in TIER_THRESHOLDS:
        copy = TIER_DESCRIPTIONS[tier]
        rows.append({
            'name': tier.value,
            'min_vram_gb': TIER_THRESHOLDS[tier],
            'label': copy['label'],
            'short': copy['short'],
            'description': copy['description'],
        })
    return rows
# Public API of core.gpu_tier; everything else is implementation detail.
__all__ = ['GpuTier', 'TIER_THRESHOLDS', 'TIER_DESCRIPTIONS', 'classify', 'tier_table']