Coverage for core / gpu_tier.py: 0.0%

22 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2core.gpu_tier — single source of truth for GPU capability tiers. 

3 

4WHY THIS EXISTS 

5─────────────── 

6The 24 / 10 / 4 GB tier thresholds were duplicated in TWO places: 

7 - main.py `/backend/health` (Python, server-side classification). 

8 - landing-page/src/components/chat/GpuTierBadge.jsx (JavaScript, 

9 client-side label generation). 

10 

11When commit 2acf21a raised the speculative-decoding gate from 8 GB to 

1210 GB, the backend was updated but the frontend label still said "10GB+" 

13because it was hard-coded — they happened to match by accident. Two 

14months later when product wanted to test 12 GB as the new break-point, 

15ONE side would have shipped and the other wouldn't, producing tier 

16labels that disagreed with the server's actual behaviour. 

17 

18This module is the canonical table. The frontend now FETCHES it via 

19`GET /api/v1/system/tiers` so the labels and thresholds shipped to the 

20user are always derived from the same source the backend classifier 

21uses. Future threshold tweaks happen here ONCE. 

22 

23WHO CONSUMES IT 

24─────────────── 

25- main.py /backend/health → classify(vram_gb, cuda_available) 

26- main.py /api/v1/system/tiers → tier_table() (frontend bootstrap) 

27- landing-page/src/components/chat/GpuTierBadge.jsx → fetches /tiers 

28 

29THRESHOLD RATIONALE 

30─────────────────── 

31ULTRA ≥ 24 GB → 70B-class model viable (Llama-70B Q4 ≈ 38GB; with 

32 offload + KV-cache budget, 24GB is the practical floor 

33 where ULTRA tier actually unlocks behaviour the lower 

34 tiers can't access). 

35FULL ≥ 10 GB → main + draft speculative decoding fits with 4GB TTS 

36 headroom (raised from 8GB in commit 2acf21a). 

37STANDARD 4-10 GB → main-only (no speculation; ~1.3-2.0s slower per reply 

38 on chat). 

39NONE < 4 GB / no CUDA → CPU fallback. 

40""" 

41from __future__ import annotations 

42 

43from enum import Enum 

44from typing import Dict, List 

45 

46 

class GpuTier(str, Enum):
    """GPU capability tier.

    String-valued Enum: each member IS its string value, so members
    compare equal to plain strings and serialize to JSON transparently —
    the wire format always matches the Python value.
    """

    ULTRA = 'ultra'
    FULL = 'full'
    STANDARD = 'standard'
    NONE = 'none'

54 

55 

# Single source of truth. Keys are tier names; values are the MINIMUM
# total VRAM (GB) required for that tier. Declared most-capable-first;
# `classify` returns the first tier whose floor fits the probed VRAM.
# Keep this table descending for readability — it mirrors the rationale
# table in the module docstring.
TIER_THRESHOLDS: Dict[GpuTier, float] = {
    GpuTier.ULTRA: 24.0,    # 70B-class models viable (see docstring rationale)
    GpuTier.FULL: 10.0,     # main + draft speculative decoding + TTS headroom
    GpuTier.STANDARD: 4.0,  # main model only, no speculation
    GpuTier.NONE: 0.0,      # below STANDARD or no CUDA → CPU fallback
}

65 

66 

# Operator-readable tier descriptions. The frontend pulls these directly
# so the label + tooltip text in GpuTierBadge.jsx never drifts from what
# the backend documents. Keep these <120 chars; the chip tooltip wraps
# but the badge label itself is rendered as the short form.
#
# Per-tier key schema (all three keys are required — tier_table() reads
# each one unconditionally):
#   'label'       — full badge text.
#   'short'       — compact chip text.
#   'description' — tooltip body; keep numbers in sync with TIER_THRESHOLDS.
TIER_DESCRIPTIONS: Dict[GpuTier, Dict[str, str]] = {
    GpuTier.ULTRA: {
        'label': 'Ultra GPU',
        'short': 'Ultra',
        'description': (
            '24GB+ VRAM. 70B-class models viable with speculative '
            'decoding + full TTS headroom.'
        ),
    },
    GpuTier.FULL: {
        'label': 'Full GPU',
        'short': 'Full',
        'description': (
            '10GB+ VRAM. Draft + main speculative decoding active. '
            'Replies ~40% faster than Standard.'
        ),
    },
    GpuTier.STANDARD: {
        'label': 'Standard GPU',
        'short': 'Standard',
        'description': (
            'Heavy model only. Upgrade to 10GB+ VRAM for ~40% faster '
            'replies (speculative decoding unlocks at 10GB to leave room for voice).'
        ),
    },
    GpuTier.NONE: {
        'label': 'CPU',
        'short': 'CPU',
        'description': (
            'No CUDA GPU detected (or under 4GB VRAM). Chat runs on CPU. '
            'A 10GB+ NVIDIA GPU unlocks speculative decoding.'
        ),
    },
}

105 

106 

def classify(vram_gb: float, cuda_available: bool) -> GpuTier:
    """Return the GPU tier for the given VRAM size and CUDA availability.

    No CUDA → always NONE (regardless of VRAM size — an integrated GPU
    can have lots of "shared" RAM but no CUDA path).

    Args:
        vram_gb: Total VRAM in gigabytes (NOT free — total). Read from
            `vram_manager.get_total_vram()`.
        cuda_available: True iff `torch.cuda.is_available()` returned True
            at probe time (via vram_manager.detect_gpu).

    Returns:
        The most-capable tier whose threshold is ≤ vram_gb, or NONE if no
        CUDA / vram_gb < STANDARD threshold.
    """
    if not cuda_available:
        return GpuTier.NONE
    # Sort by threshold descending so correctness no longer depends on
    # TIER_THRESHOLDS' declaration order — the previous implementation
    # silently misclassified if anyone reordered that dict. Behaviour is
    # identical while the table stays descending; it is now also correct
    # if the table is ever reordered.
    for tier, threshold in sorted(
        TIER_THRESHOLDS.items(), key=lambda item: item[1], reverse=True
    ):
        if tier is GpuTier.NONE:
            # NONE's 0.0 floor would match any non-negative vram_gb;
            # "CUDA present but under STANDARD" is handled by the
            # fall-through return below instead.
            continue
        if vram_gb >= threshold:
            return tier
    return GpuTier.NONE

135 

136 

def tier_table() -> List[Dict[str, object]]:
    """Return the canonical tier table for the /api/v1/system/tiers endpoint.

    Format chosen to be consumed directly by the React component without
    re-keying — the frontend just renders the array."""
    rows: List[Dict[str, object]] = []
    # One row per tier, in TIER_THRESHOLDS' declared (descending) order.
    for tier in TIER_THRESHOLDS:
        meta = TIER_DESCRIPTIONS[tier]
        rows.append({
            'name': tier.value,
            'min_vram_gb': TIER_THRESHOLDS[tier],
            'label': meta['label'],
            'short': meta['short'],
            'description': meta['description'],
        })
    return rows

152 

153 

# Explicit public surface — star-imports and API-doc tools pick up
# exactly these five names.
__all__ = [
    'GpuTier',
    'TIER_THRESHOLDS',
    'TIER_DESCRIPTIONS',
    'classify',
    'tier_table',
]