Coverage for integrations / agent_engine / gaia_dataset.py: 64.5%
62 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2GAIA dataset loader — adapter between the public `gaia-benchmark/GAIA`
3HuggingFace dataset and hive_benchmark_prover's problem contract.
5GAIA (General AI Assistants, Mialon et al. 2023) ships 466 real-world
6agent tasks across 3 difficulty levels. Public scores:
7 - Best human (2023): 92.0%
8 - GPT-4 + plugins: 15.0%
9 - Claude 3 Opus: 17.0%
10 - GPT-4o (Mar 2024 card): 32.0%
11Frontier single-model scores still sit below 65% — the clean signal
12that sum-of-many-agents should beat any-single-model on real agentic
13work is here, not in MMLU.
15Design:
16- DO NOT auto-download at import time. The dataset is gated + large;
17 loading it silently on boot would surprise users. Load is explicit
18 via load_gaia_problems().
19- Three-layer fallback:
20 1. Local cached JSON at ~/.hevolve/benchmarks/gaia_mini.json
21 2. HuggingFace `datasets` library (if installed + auth present)
22 3. Synthetic stubs (hive_benchmark_prover already provides these
23 when load returns empty)
24- Level filter + limit so the caller can sample without loading the
25 full 466-task set. Our mini rotation pulls 30 problems per run.
27Each returned problem is a dict with fields:
28 id unique problem id
29 type 'agent' (to match BUILTIN_BENCHMARKS['gaia_mini'])
30 level 1 | 2 | 3
31 prompt the Question field from GAIA
32 answer the Final answer field (for scoring — NOT shown to node)
33 tools hint about expected tools (browser, code, search)
34 has_file bool — problem comes with an attachment
35"""
36from __future__ import annotations
38import json
39import logging
40import os
41from typing import List, Optional
43logger = logging.getLogger('hevolve_social')
46_CACHE_PATH_ENV = 'HEVOLVE_GAIA_CACHE'
49def _resolve_cache_path() -> str:
50 override = os.environ.get(_CACHE_PATH_ENV, '')
51 if override:
52 return override
53 root = os.path.join(
54 os.path.expanduser('~'), '.hevolve', 'benchmarks',
55 )
56 return os.path.join(root, 'gaia_mini.json')
59def _try_cache(path: str) -> List[dict]:
60 if not os.path.exists(path):
61 return []
62 try:
63 with open(path, 'r', encoding='utf-8') as fh:
64 data = json.load(fh)
65 if isinstance(data, list):
66 return data
67 if isinstance(data, dict) and isinstance(data.get('problems'), list):
68 return data['problems']
69 except Exception as exc:
70 logger.debug(f'[gaia] cache read failed {path}: {exc}')
71 return []
74def _try_huggingface(levels: List[int], limit: int) -> List[dict]:
75 """Attempt to load GAIA via the HuggingFace datasets library.
77 Gated dataset — requires HF token + accepted dataset terms. We
78 do NOT prompt. Silent fallback if any step fails; the caller
79 receives an empty list and hive_benchmark_prover uses stubs.
80 """
81 try:
82 from datasets import load_dataset # type: ignore
83 except ImportError:
84 return []
85 try:
86 # The GAIA dataset has a 'validation' split with labeled answers
87 # across all 3 levels. We read only the validation split so
88 # we never train on the gated test set.
89 ds = load_dataset('gaia-benchmark/GAIA', '2023_all', split='validation')
90 except Exception as exc:
91 logger.debug(f'[gaia] HF load failed: {exc}')
92 return []
94 out: List[dict] = []
95 level_set = set(int(l) for l in levels)
96 # Iterate a limited window — GAIA validation has ~165 items
97 for i, row in enumerate(ds):
98 lvl = int(row.get('Level', 0) or 0)
99 if lvl not in level_set:
100 continue
101 question = str(row.get('Question', '') or '').strip()
102 if not question:
103 continue
104 out.append({
105 'id': f'gaia_L{lvl}_{row.get("task_id", i)}',
106 'type': 'agent',
107 'level': lvl,
108 'prompt': question,
109 'answer': str(row.get('Final answer', '') or '').strip(),
110 'tools': row.get('Annotator Metadata', {}).get('Tools', '')
111 if isinstance(row.get('Annotator Metadata'), dict) else '',
112 'has_file': bool(row.get('file_name')),
113 'dataset': 'gaia-benchmark/GAIA',
114 })
115 if len(out) >= limit:
116 break
117 return out
def load_gaia_problems(
    levels: Optional[List[int]] = None,
    limit: int = 30,
) -> List[dict]:
    """Return up to `limit` GAIA validation problems filtered by level.

    Resolution order:
    1. `HEVOLVE_GAIA_CACHE` env path (raw JSON list or {"problems": [...]}).
    2. Default user cache at `~/.hevolve/benchmarks/gaia_mini.json`.
    3. HuggingFace `datasets` library (gated — needs HF auth).
    Returns `[]` when none of the above are available; caller is
    expected to generate synthetic stubs.
    """
    wanted_levels = levels or [1, 2, 3]
    cap = max(1, int(limit or 30))

    cached = _try_cache(_resolve_cache_path())
    if cached:
        wanted = {int(lvl) for lvl in wanted_levels}
        matching = [
            p for p in cached
            if int(p.get('level', 0) or 0) in wanted
        ]
        # When the level filter removes everything, serve the unfiltered
        # cache rather than an empty set.
        return matching[:cap] if matching else cached[:cap]

    remote = _try_huggingface(wanted_levels, cap)
    if remote:
        return remote

    return []
def save_cache(problems: List[dict]) -> bool:
    """Persist a loaded problem set so subsequent runs skip the HF call.

    Writes `{"problems": [...]}` to the resolved cache path. Returns
    True on success, False on any I/O failure (logged at WARNING).

    Called by a one-off prefetch script; NOT invoked automatically so
    we never surprise the user with a large download."""
    path = _resolve_cache_path()
    try:
        # os.path.dirname('') is '' when the env override is a bare
        # filename; makedirs('') raises FileNotFoundError, so guard it.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as fh:
            json.dump({'problems': problems}, fh, indent=2)
        return True
    except Exception as exc:
        logger.warning(f'[gaia] cache write failed: {exc}')
        return False