Coverage for integrations / agent_engine / gaia_dataset.py: 64.5%

62 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2GAIA dataset loader — adapter between the public `gaia-benchmark/GAIA` 

3HuggingFace dataset and hive_benchmark_prover's problem contract. 

4 

5GAIA (General AI Assistants, Mialon et al. 2023) ships 466 real-world 

6agent tasks across 3 difficulty levels. Public scores: 

7 - Best human (2023): 92.0% 

8 - GPT-4 + plugins: 15.0% 

9 - Claude 3 Opus: 17.0% 

10 - GPT-4o (Mar 2024 card): 32.0% 

11Frontier single-model scores still sit below 65% — the clean signal 

12that sum-of-many-agents should beat any-single-model on real agentic 

13work is here, not in MMLU. 

14 

15Design: 

16- DO NOT auto-download at import time. The dataset is gated + large; 

17 loading it silently on boot would surprise users. Load is explicit 

18 via load_gaia_problems(). 

19- Three-layer fallback: 

20 1. Local cached JSON at ~/.hevolve/benchmarks/gaia_mini.json 

21 2. HuggingFace `datasets` library (if installed + auth present) 

22 3. Synthetic stubs (hive_benchmark_prover already provides these 

23 when load returns empty) 

24- Level filter + limit so the caller can sample without loading the 

25 full 466-task set. Our mini rotation pulls 30 problems per run. 

26 

27Each returned problem is a dict with fields: 

28 id unique problem id 

29 type 'agent' (to match BUILTIN_BENCHMARKS['gaia_mini']) 

30 level 1 | 2 | 3 

31 prompt the Question field from GAIA 

32 answer the Final answer field (for scoring — NOT shown to node) 

33 tools hint about expected tools (browser, code, search) 

34 has_file bool — problem comes with an attachment 

35""" 

36from __future__ import annotations 

37 

38import json 

39import logging 

40import os 

41from typing import List, Optional 

42 

43logger = logging.getLogger('hevolve_social') 

44 

45 

46_CACHE_PATH_ENV = 'HEVOLVE_GAIA_CACHE' 

47 

48 

def _resolve_cache_path() -> str:
    """Return the GAIA cache file path, honouring the env override.

    `HEVOLVE_GAIA_CACHE` wins when set and non-empty; otherwise fall
    back to the per-user default under ``~/.hevolve/benchmarks``.
    """
    custom = os.environ.get(_CACHE_PATH_ENV, '')
    if custom:
        return custom
    home = os.path.expanduser('~')
    return os.path.join(home, '.hevolve', 'benchmarks', 'gaia_mini.json')

57 

58 

59def _try_cache(path: str) -> List[dict]: 

60 if not os.path.exists(path): 

61 return [] 

62 try: 

63 with open(path, 'r', encoding='utf-8') as fh: 

64 data = json.load(fh) 

65 if isinstance(data, list): 

66 return data 

67 if isinstance(data, dict) and isinstance(data.get('problems'), list): 

68 return data['problems'] 

69 except Exception as exc: 

70 logger.debug(f'[gaia] cache read failed {path}: {exc}') 

71 return [] 

72 

73 

74def _try_huggingface(levels: List[int], limit: int) -> List[dict]: 

75 """Attempt to load GAIA via the HuggingFace datasets library. 

76 

77 Gated dataset — requires HF token + accepted dataset terms. We 

78 do NOT prompt. Silent fallback if any step fails; the caller 

79 receives an empty list and hive_benchmark_prover uses stubs. 

80 """ 

81 try: 

82 from datasets import load_dataset # type: ignore 

83 except ImportError: 

84 return [] 

85 try: 

86 # The GAIA dataset has a 'validation' split with labeled answers 

87 # across all 3 levels. We read only the validation split so 

88 # we never train on the gated test set. 

89 ds = load_dataset('gaia-benchmark/GAIA', '2023_all', split='validation') 

90 except Exception as exc: 

91 logger.debug(f'[gaia] HF load failed: {exc}') 

92 return [] 

93 

94 out: List[dict] = [] 

95 level_set = set(int(l) for l in levels) 

96 # Iterate a limited window — GAIA validation has ~165 items 

97 for i, row in enumerate(ds): 

98 lvl = int(row.get('Level', 0) or 0) 

99 if lvl not in level_set: 

100 continue 

101 question = str(row.get('Question', '') or '').strip() 

102 if not question: 

103 continue 

104 out.append({ 

105 'id': f'gaia_L{lvl}_{row.get("task_id", i)}', 

106 'type': 'agent', 

107 'level': lvl, 

108 'prompt': question, 

109 'answer': str(row.get('Final answer', '') or '').strip(), 

110 'tools': row.get('Annotator Metadata', {}).get('Tools', '') 

111 if isinstance(row.get('Annotator Metadata'), dict) else '', 

112 'has_file': bool(row.get('file_name')), 

113 'dataset': 'gaia-benchmark/GAIA', 

114 }) 

115 if len(out) >= limit: 

116 break 

117 return out 

118 

119 

def load_gaia_problems(
    levels: Optional[List[int]] = None,
    limit: int = 30,
) -> List[dict]:
    """Return up to `limit` GAIA validation problems filtered by level.

    Resolution order:
      1. `HEVOLVE_GAIA_CACHE` env path (raw JSON list or {"problems": [...]}).
      2. Default user cache at `~/.hevolve/benchmarks/gaia_mini.json`.
      3. HuggingFace `datasets` library (gated — needs HF auth).
    Returns `[]` when none of the above are available; the caller is
    expected to generate synthetic stubs in that case.
    """
    if not levels:
        levels = [1, 2, 3]
    limit = max(1, int(limit or 30))

    cached = _try_cache(_resolve_cache_path())
    if cached:
        wanted = {int(lvl) for lvl in levels}
        matching = [
            p for p in cached
            if int(p.get('level', 0) or 0) in wanted
        ]
        # When no cached problem matches the requested levels, serve
        # the head of the unfiltered cache rather than nothing.
        pool = matching if matching else cached
        return pool[:limit]

    hf = _try_huggingface(levels, limit)
    if hf:
        return hf

    return []

149 

150 

def save_cache(problems: List[dict]) -> bool:
    """Persist a loaded problem set so subsequent runs skip the HF call.

    Called by a one-off prefetch script; NOT invoked automatically so
    we never surprise the user with a large download.

    Returns True on success, False on any filesystem/serialization
    error — failures are logged at WARNING level, never raised.
    """
    path = _resolve_cache_path()
    try:
        parent = os.path.dirname(path)
        # Bug fix: a bare-filename override (HEVOLVE_GAIA_CACHE=gaia.json)
        # has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError — skip directory creation in that case so
        # the write lands in the current working directory.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as fh:
            json.dump({'problems': problems}, fh, indent=2)
        return True
    except Exception as exc:
        logger.warning(f'[gaia] cache write failed: {exc}')
        return False