Coverage for integrations / agent_engine / gaia_dataset.py: 64.5%

62 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2GAIA dataset loader — adapter between the public `gaia-benchmark/GAIA` 

3HuggingFace dataset and hive_benchmark_prover's problem contract. 

4 

5GAIA (General AI Assistants, Mialon et al. 2023) ships 466 real-world 

6agent tasks across 3 difficulty levels. Public scores: 

7 - Best human (2023): 92.0% 

8 - GPT-4 + plugins: 15.0% 

9 - Claude 3 Opus: 17.0% 

10 - GPT-4o (Mar 2024 card): 32.0% 

11Frontier single-model scores still sit below 65% — the clean signal 

12that sum-of-many-agents should beat any-single-model on real agentic 

13work is here, not in MMLU. 

14 

15Design: 

16- DO NOT auto-download at import time. The dataset is gated + large; 

17 loading it silently on boot would surprise users. Load is explicit 

18 via load_gaia_problems(). 

19- Three-layer fallback: 

20 1. Local cached JSON at ~/.hevolve/benchmarks/gaia_mini.json 

21 2. HuggingFace `datasets` library (if installed + auth present) 

22 3. Synthetic stubs (hive_benchmark_prover already provides these 

23 when load returns empty) 

24- Level filter + limit so the caller can sample without loading the 

25 full 466-task set. Our mini rotation pulls 30 problems per run. 

26 

27Each returned problem is a dict with fields: 

28 id unique problem id 

29 type 'agent' (to match BUILTIN_BENCHMARKS['gaia_mini']) 

30 level 1 | 2 | 3 

31 prompt the Question field from GAIA 

32 answer the Final answer field (for scoring — NOT shown to node) 

33 tools hint about expected tools (browser, code, search) 

34 has_file bool — problem comes with an attachment 

35""" 

36from __future__ import annotations 

37 

38import json 

39import logging 

40import os 

41from typing import List, Optional 

42 

43logger = logging.getLogger('hevolve_social') 

44 

45 

46_CACHE_PATH_ENV = 'HEVOLVE_GAIA_CACHE' 

47 

48 

def _resolve_cache_path() -> str:
    """Return the GAIA cache file path, honouring the env override.

    `HEVOLVE_GAIA_CACHE` wins when set and non-empty; otherwise fall
    back to the per-user default under ``~/.hevolve/benchmarks``.
    """
    custom = os.environ.get(_CACHE_PATH_ENV, '')
    if custom:
        return custom
    home = os.path.expanduser('~')
    return os.path.join(home, '.hevolve', 'benchmarks', 'gaia_mini.json')

57 

58 

59def _try_cache(path: str) -> List[dict]: 

60 if not os.path.exists(path): 

61 return [] 

62 try: 

63 with open(path, 'r', encoding='utf-8') as fh: 

64 data = json.load(fh) 

65 if isinstance(data, list): 

66 return data 

67 if isinstance(data, dict) and isinstance(data.get('problems'), list): 

68 return data['problems'] 

69 except Exception as exc: 

70 logger.debug(f'[gaia] cache read failed {path}: {exc}') 

71 return [] 

72 

73 

74def _try_huggingface(levels: List[int], limit: int) -> List[dict]: 

75 """Attempt to load GAIA via the HuggingFace datasets library. 

76 

77 Gated dataset — requires HF token + accepted dataset terms. We 

78 do NOT prompt. Silent fallback if any step fails; the caller 

79 receives an empty list and hive_benchmark_prover uses stubs. 

80 """ 

81 try: 

82 from datasets import load_dataset # type: ignore 

83 except ImportError: 

84 return [] 

85 try: 

86 # The GAIA dataset has a 'validation' split with labeled answers 

87 # across all 3 levels. We read only the validation split so 

88 # we never train on the gated test set. 

89 ds = load_dataset('gaia-benchmark/GAIA', '2023_all', split='validation') 

90 except Exception as exc: 

91 logger.debug(f'[gaia] HF load failed: {exc}') 

92 return [] 

93 

94 out: List[dict] = [] 

95 level_set = set(int(l) for l in levels) 

96 # Iterate a limited window — GAIA validation has ~165 items 

97 for i, row in enumerate(ds): 

98 lvl = int(row.get('Level', 0) or 0) 

99 if lvl not in level_set: 

100 continue 

101 question = str(row.get('Question', '') or '').strip() 

102 if not question: 

103 continue 

104 out.append({ 

105 'id': f'gaia_L{lvl}_{row.get("task_id", i)}', 

106 'type': 'agent', 

107 'level': lvl, 

108 'prompt': question, 

109 'answer': str(row.get('Final answer', '') or '').strip(), 

110 'tools': row.get('Annotator Metadata', {}).get('Tools', '') 

111 if isinstance(row.get('Annotator Metadata'), dict) else '', 

112 'has_file': bool(row.get('file_name')), 

113 'dataset': 'gaia-benchmark/GAIA', 

114 }) 

115 if len(out) >= limit: 

116 break 

117 return out 

118 

119 

def load_gaia_problems(
    levels: Optional[List[int]] = None,
    limit: int = 30,
) -> List[dict]:
    """Return up to `limit` GAIA validation problems filtered by level.

    Resolution order:
      1. `HEVOLVE_GAIA_CACHE` env path (raw JSON list or {"problems": [...]}).
      2. Default user cache at `~/.hevolve/benchmarks/gaia_mini.json`.
      3. HuggingFace `datasets` library (gated — needs HF auth).
    Returns `[]` when none of the above are available; the caller is
    expected to generate synthetic stubs in that case.
    """
    if not levels:
        levels = [1, 2, 3]
    limit = max(1, int(limit or 30))

    cached = _try_cache(_resolve_cache_path())
    if cached:
        wanted = {int(lvl) for lvl in levels}
        matching = [
            p for p in cached
            if int(p.get('level', 0) or 0) in wanted
        ]
        # When no cached problem matches the requested levels, serve
        # the head of the unfiltered cache rather than nothing.
        pool = matching if matching else cached
        return pool[:limit]

    hf = _try_huggingface(levels, limit)
    if hf:
        return hf

    return []

149 

150 

def save_cache(problems: List[dict]) -> bool:
    """Persist a loaded problem set so subsequent runs skip the HF call.

    Called by a one-off prefetch script; NOT invoked automatically so
    we never surprise the user with a large download.

    Returns True on success, False on any filesystem/serialization
    error — failures are logged at WARNING level, never raised.
    """
    path = _resolve_cache_path()
    try:
        parent = os.path.dirname(path)
        # Bug fix: a bare-filename override (HEVOLVE_GAIA_CACHE=gaia.json)
        # has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError — skip directory creation in that case so
        # the write lands in the current working directory.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as fh:
            json.dump({'problems': problems}, fh, indent=2)
        return True
    except Exception as exc:
        logger.warning(f'[gaia] cache write failed: {exc}')
        return False