Coverage for core / gpu_tier.py: 0.0%

22 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2core.gpu_tier — single source of truth for GPU capability tiers. 

3 

4WHY THIS EXISTS 

5─────────────── 

6The 24 / 10 / 4 GB tier thresholds were duplicated in TWO places: 

7 - main.py `/backend/health` (Python, server-side classification). 

8 - landing-page/src/components/chat/GpuTierBadge.jsx (JavaScript, 

9 client-side label generation). 

10 

11When commit 2acf21a raised the speculative-decoding gate from 8 GB to 

1210 GB, the backend was updated but the frontend label still said "10GB+" 

13because it was hard-coded — they happened to match by accident. Two 

14months later when product wanted to test 12 GB as the new break-point, 

15ONE side would have shipped and the other wouldn't, producing tier 

16labels that disagreed with the server's actual behaviour. 

17 

18This module is the canonical table. The frontend now FETCHES it via 

19`GET /api/v1/system/tiers` so the labels and thresholds shipped to the 

20user are always derived from the same source the backend classifier 

21uses. Future threshold tweaks happen here ONCE. 

22 

23WHO CONSUMES IT 

24─────────────── 

25- main.py /backend/health → classify(vram_gb, cuda_available) 

26- main.py /api/v1/system/tiers → tier_table() (frontend bootstrap) 

27- landing-page/src/components/chat/GpuTierBadge.jsx → fetches /tiers 

28 

29THRESHOLD RATIONALE 

30─────────────────── 

31ULTRA ≥ 24 GB → 70B-class model viable (Llama-70B Q4 ≈ 38GB; with 

32 offload + KV-cache budget, 24GB is the practical floor 

33 where ULTRA tier actually unlocks behaviour the lower 

34 tiers can't access). 

35FULL ≥ 10 GB → main + draft speculative decoding fits with 4GB TTS 

36 headroom (raised from 8GB in commit 2acf21a). 

37STANDARD 4-10 GB → main-only (no speculation; ~1.3-2.0s slower per reply 

38 on chat). 

39NONE < 4 GB / no CUDA → CPU fallback. 

40""" 

41from __future__ import annotations 

42 

43from enum import Enum 

44from typing import Dict, List 

45 

46 

class GpuTier(str, Enum):
    """GPU capability tier.

    String-valued Enum: each member IS its string value, so members
    compare equal to plain strings and serialize to JSON transparently —
    the wire format always matches the Python value.
    """

    ULTRA = 'ultra'
    FULL = 'full'
    STANDARD = 'standard'
    NONE = 'none'

54 

55 

# Single source of truth. Keys are tier names; values are the MINIMUM
# total VRAM (GB) required for that tier. Declared most-capable-first;
# `classify` returns the first tier whose floor fits the probed VRAM.
# Keep this table descending for readability — it mirrors the rationale
# table in the module docstring.
TIER_THRESHOLDS: Dict[GpuTier, float] = {
    GpuTier.ULTRA: 24.0,    # 70B-class models viable (see docstring rationale)
    GpuTier.FULL: 10.0,     # main + draft speculative decoding + TTS headroom
    GpuTier.STANDARD: 4.0,  # main model only, no speculation
    GpuTier.NONE: 0.0,      # below STANDARD or no CUDA → CPU fallback
}

65 

66 

# Operator-readable tier descriptions. The frontend pulls these directly
# so the label + tooltip text in GpuTierBadge.jsx never drifts from what
# the backend documents. Keep these <120 chars; the chip tooltip wraps
# but the badge label itself is rendered as the short form.
#
# Per-tier key schema (all three keys are required — tier_table() reads
# each one unconditionally):
#   'label'       — full badge text.
#   'short'       — compact chip text.
#   'description' — tooltip body; keep numbers in sync with TIER_THRESHOLDS.
TIER_DESCRIPTIONS: Dict[GpuTier, Dict[str, str]] = {
    GpuTier.ULTRA: {
        'label': 'Ultra GPU',
        'short': 'Ultra',
        'description': (
            '24GB+ VRAM. 70B-class models viable with speculative '
            'decoding + full TTS headroom.'
        ),
    },
    GpuTier.FULL: {
        'label': 'Full GPU',
        'short': 'Full',
        'description': (
            '10GB+ VRAM. Draft + main speculative decoding active. '
            'Replies ~40% faster than Standard.'
        ),
    },
    GpuTier.STANDARD: {
        'label': 'Standard GPU',
        'short': 'Standard',
        'description': (
            'Heavy model only. Upgrade to 10GB+ VRAM for ~40% faster '
            'replies (speculative decoding unlocks at 10GB to leave room for voice).'
        ),
    },
    GpuTier.NONE: {
        'label': 'CPU',
        'short': 'CPU',
        'description': (
            'No CUDA GPU detected (or under 4GB VRAM). Chat runs on CPU. '
            'A 10GB+ NVIDIA GPU unlocks speculative decoding.'
        ),
    },
}

105 

106 

def classify(vram_gb: float, cuda_available: bool) -> GpuTier:
    """Return the GPU tier for the given VRAM size and CUDA availability.

    No CUDA → always NONE (regardless of VRAM size — an integrated GPU
    can have lots of "shared" RAM but no CUDA path).

    Args:
        vram_gb: Total VRAM in gigabytes (NOT free — total). Read from
            `vram_manager.get_total_vram()`.
        cuda_available: True iff `torch.cuda.is_available()` returned True
            at probe time (via vram_manager.detect_gpu).

    Returns:
        The most-capable tier whose threshold is ≤ vram_gb, or NONE if no
        CUDA / vram_gb < STANDARD threshold.
    """
    if not cuda_available:
        return GpuTier.NONE
    # Sort by threshold descending so correctness no longer depends on
    # TIER_THRESHOLDS' declaration order — the previous implementation
    # silently misclassified if anyone reordered that dict. Behaviour is
    # identical while the table stays descending; it is now also correct
    # if the table is ever reordered.
    for tier, threshold in sorted(
        TIER_THRESHOLDS.items(), key=lambda item: item[1], reverse=True
    ):
        if tier is GpuTier.NONE:
            # NONE's 0.0 floor would match any non-negative vram_gb;
            # "CUDA present but under STANDARD" is handled by the
            # fall-through return below instead.
            continue
        if vram_gb >= threshold:
            return tier
    return GpuTier.NONE

135 

136 

def tier_table() -> List[Dict[str, object]]:
    """Return the canonical tier table for the /api/v1/system/tiers endpoint.

    Format chosen to be consumed directly by the React component without
    re-keying — the frontend just renders the array."""
    rows: List[Dict[str, object]] = []
    # One row per tier, in TIER_THRESHOLDS' declared (descending) order.
    for tier in TIER_THRESHOLDS:
        meta = TIER_DESCRIPTIONS[tier]
        rows.append({
            'name': tier.value,
            'min_vram_gb': TIER_THRESHOLDS[tier],
            'label': meta['label'],
            'short': meta['short'],
            'description': meta['description'],
        })
    return rows

152 

153 

# Explicit public surface — star-imports and API-doc tools pick up
# exactly these five names.
__all__ = [
    'GpuTier',
    'TIER_THRESHOLDS',
    'TIER_DESCRIPTIONS',
    'classify',
    'tier_table',
]