Coverage for integrations / vlm / parser.py: 97.6%

168 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2integrations.vlm.parser — single source of truth for VLM response parsing. 

3 

4Phase 5 of memory/vlm_best_of_all_worlds_plan.md §4. Replaces three 

5parallel parsers that drifted apart over time: 

6 

7 * ``local_loop._parse_vlm_response`` — JSON shape, used by 

8 inline-prompt branch 

9 * ``qwen3vl_backend._parse_unified_response`` — JSON shape with 

10 UI_Elements, used by parse_and_reason / parse_screen 

11 * ``qwen3vl_backend._parse_action_response`` — free-text shape with 

12 <point>x,y</point> / TYPE: / DONE / scroll, used by point_and_act 

13 and the taskbar shortcut 

14 

15The first two duplicated their JSON extraction. This module 

16exposes: 

17 

18 ``extract_json(text)`` — single canonical JSON extractor 

19 (handles ```json blocks, raw {}, 

20 depth-counted nested objects) 

21 ``ParsedAction`` dataclass — normalized result; same fields 

22 regardless of input shape so 

23 downstream code stops branching 

24 on which parser ran. 

25 ``parse_vlm_action(raw, ...)`` — single entry point keyed on 

26 ``expected_shape='action_json' | 

27 'som_bbox' | 'point_only'``. 

28 

29Old parsers are shimmed onto this module — see the docstrings on 

30each shim for byte-equivalence notes. 

31""" 

32 

33import json 

34import re 

35import logging 

36from dataclasses import dataclass, field, asdict 

37from typing import List, Optional, Callable 

38 

# Module-level logger; the name is a fixed string rather than ``__name__``.
logger = logging.getLogger('hevolve.vlm.parser')


# ─── Pre-compiled regex (module-load cost is one-time) ────────────────

# Markdown fence (``` optionally tagged ``json``) wrapping a ``{...}``
# payload. Group 1 is the lazily-matched brace span; DOTALL lets it
# cross line breaks.
_CODE_BLOCK_RE = re.compile(r'```(?:json)?\s*(\{.*?\})\s*```', re.DOTALL)
# A ``{...}`` span that contains at most ONE level of nested braces.
_RAW_BRACE_RE = re.compile(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', re.DOTALL)
# ``<point>x,y</point>`` with digit-only x / y (groups 1 and 2);
# whitespace around the numbers and the comma is tolerated.
_POINT_RE = re.compile(r'<point>\s*(\d+)\s*,\s*(\d+)\s*</point>')
# ``TYPE:`` anchored at the start (case-insensitive); group 1 is the
# rest of the line. No DOTALL, so ``.`` does not cross a newline.
_TYPE_PREFIX_RE = re.compile(r'^TYPE:\s*(.+)$', re.IGNORECASE)
# "type" / "enter" / "input" followed by one or more of ``: - " '``,
# then lazily-captured text (group 1) up to a ``<`` or end of string.
# NOTE(review): there is no word boundary before the keyword, so it
# also matches inside longer words (e.g. "center:") — confirm intended.
_TYPE_FREETEXT_RE = re.compile(
    r'(?:type|enter|input)\s*[:\-"\']+\s*(.+?)(?:\s*<|$)',
    re.IGNORECASE,
)
# Any run of decimal digits.
_NUMBER_RE = re.compile(r'\d+')

53 

54 

55# ─── ParsedAction dataclass ────────────────────────────────────────── 

56 

@dataclass
class ParsedAction:
    """Shape-independent container for a parsed VLM response.

    Every parser in this module fills in the subset of fields that its
    response shape provides and leaves the rest at their defaults, so
    callers can read the same attributes no matter which shape was
    parsed.

    Two converters rebuild plain dicts for existing callers:
      ``to_action_json_dict()``  — the JSON-shape dict with the original
          key casing ('Next Action', 'Status', 'Reasoning', ...).
      ``to_point_action_dict()`` — the free-text / point-shape dict
          (action, screen_x, screen_y, text, done, reasoning, raw).
    """
    raw: str = ''
    action: str = 'none'  # 'left_click', 'type', 'scroll_down', 'done', 'none', ...
    reasoning: str = ''
    done: bool = False
    status: str = ''  # 'IN_PROGRESS', 'DONE' (JSON shapes)
    text: str = ''  # for 'type' actions
    norm_x: Optional[int] = None  # 0-1000 normalized
    norm_y: Optional[int] = None
    screen_x: int = 0
    screen_y: int = 0
    box_id: Optional[int] = None
    coordinate: Optional[List[int]] = None
    next_action: str = ''  # original 'Next Action' string
    ui_elements: List[dict] = field(default_factory=list)
    parsed_content_list: List[dict] = field(default_factory=list)

    def to_action_json_dict(self) -> dict:
        """Return the JSON-shape dict with its original key casing.

        The three core keys are always present; when the corresponding
        field is empty they fall back to 'None', a status derived from
        ``done``, and the first 500 characters of ``raw`` respectively.
        Optional keys are added only when their field carries data.
        """
        if self.status:
            status = self.status
        else:
            status = 'DONE' if self.done else 'IN_PROGRESS'

        payload = {
            'Next Action': self.next_action if self.next_action else 'None',
            'Status': status,
            'Reasoning': self.reasoning if self.reasoning else self.raw[:500],
        }

        # (key, value, include?) — order here is the emitted key order.
        optional_entries = (
            ('value', self.text, bool(self.text)),
            ('coordinate', self.coordinate, self.coordinate is not None),
            ('Box ID', self.box_id, self.box_id is not None),
            ('UI_Elements', self.ui_elements, bool(self.ui_elements)),
            ('parsed_content_list', self.parsed_content_list,
             bool(self.parsed_content_list)),
        )
        for key, value, include in optional_entries:
            if include:
                payload[key] = value
        return payload

    def to_point_action_dict(self) -> dict:
        """Return the point-shape dict.

        ``norm_x`` / ``norm_y`` are emitted only when they are not
        ``None``; the other seven keys are always present.
        """
        payload = dict(
            action=self.action,
            screen_x=self.screen_x,
            screen_y=self.screen_y,
            text=self.text,
            done=self.done,
            reasoning=self.reasoning,
            raw=self.raw,
        )
        for key in ('norm_x', 'norm_y'):
            value = getattr(self, key)
            if value is not None:
                payload[key] = value
        return payload

127 

128 

129# ─── Extract JSON ───────────────────────────────────────────────────── 

130 

def extract_json(text: str) -> Optional[dict]:
    """Extract a JSON object from VLM text.

    Tries in order:
      1. Markdown ```json ... ``` fenced block (group 1 of
         ``_CODE_BLOCK_RE``).
      2. Top-level brace walk: every ``{`` seen at brace depth 0 is
         handed to ``json.JSONDecoder.raw_decode``, which parses one
         complete JSON value starting at that offset. The first one
         that decodes to a dict is returned, so nested input yields
         the OUTER object. Braces of a candidate that failed to decode
         are depth-counted so its inner ``{`` are skipped.
      3. ``_RAW_BRACE_RE`` as a last resort — only reached when no
         top-level candidate decoded (e.g. the outer object was
         truncated mid-stream but an inner one is intact).

    Compared with a plain character-level depth counter, step 2 fixes
    two failure modes:
      * a ``{`` or ``}`` inside a JSON string value (``{"a": "}"}``)
        no longer ends the object early, because the JSON decoder —
        not the counter — decides where the object ends;
      * a stray ``}`` in the surrounding prose no longer pushes the
        depth negative (it is clamped at zero), which previously made
        the walk skip the real outer ``{`` and return an inner object.

    Args:
        text: Raw model output; may be empty or ``None``-like.

    Returns:
        The decoded dict, or ``None`` when nothing parseable was found.
    """
    if not text:
        return None

    fenced = _CODE_BLOCK_RE.search(text)
    if fenced:
        try:
            return json.loads(fenced.group(1))
        except json.JSONDecodeError:
            pass  # malformed fence content — fall through to the walk

    decoder = json.JSONDecoder()
    depth = 0
    for idx, ch in enumerate(text):
        if ch == '{':
            if depth == 0:
                try:
                    candidate, _end = decoder.raw_decode(text, idx)
                except json.JSONDecodeError:
                    pass  # not JSON from here; skip its balanced span
                else:
                    if isinstance(candidate, dict):
                        return candidate
            depth += 1
        elif ch == '}' and depth > 0:
            # Clamp at zero: an unmatched '}' must not poison the count.
            depth -= 1

    # Last resort: simple raw-brace match (at most one nesting level).
    loose = _RAW_BRACE_RE.search(text)
    if loose:
        try:
            return json.loads(loose.group(0))
        except json.JSONDecodeError:
            pass
    return None

184 

185 

186# ─── parse_vlm_action: single entry point ───────────────────────────── 

187 

def parse_vlm_action(
    raw: str,
    *,
    expected_shape: str = 'action_json',
    task: str = '',
    screen_w: Optional[int] = None,
    screen_h: Optional[int] = None,
    detect_action_type: Optional[Callable[[str, str], str]] = None,
    scroll_down_keywords: tuple = (),
    scroll_up_keywords: tuple = (),
) -> ParsedAction:
    """Turn a raw VLM response into a :class:`ParsedAction`.

    The response is stripped of surrounding whitespace (``None`` /
    empty becomes ``''``) and then routed by ``expected_shape``.

    Args:
        raw: VLM response string.
        expected_shape: Selects the parser.
            ``'action_json'`` — routed to ``_parse_json_shape`` with
                ``include_som=False``.
            ``'som_bbox'`` — routed to ``_parse_json_shape`` with
                ``include_som=True`` (additionally reads
                ``UI_Elements`` / ``parsed_content_list``).
            ``'point_only'`` — routed to ``_parse_point_shape``
                (free text: ``<point>x,y</point>``, ``TYPE:``,
                ``DONE``, scroll keywords).
        task: Task string; forwarded only on the ``'point_only'`` path.
        screen_w, screen_h: Screen size forwarded on the
            ``'point_only'`` path for normalized→pixel scaling.
        detect_action_type: Optional ``(task, raw) -> action_type``
            callable forwarded on the ``'point_only'`` path.
        scroll_down_keywords, scroll_up_keywords: Substring tuples
            forwarded on the ``'point_only'`` path.

    Returns:
        The ``ParsedAction`` built by the selected parser.

    Raises:
        ValueError: ``expected_shape`` is not one of the three values
            listed above.
    """
    cleaned = (raw or '').strip()

    if expected_shape == 'point_only':
        return _parse_point_shape(
            cleaned,
            task=task,
            screen_w=screen_w,
            screen_h=screen_h,
            detect_action_type=detect_action_type,
            scroll_down_keywords=scroll_down_keywords,
            scroll_up_keywords=scroll_up_keywords,
        )

    wants_som = expected_shape == 'som_bbox'
    if wants_som or expected_shape == 'action_json':
        return _parse_json_shape(cleaned, include_som=wants_som)

    raise ValueError(f"Unknown expected_shape: {expected_shape!r}")

242 

243 

def _parse_json_shape(raw: str, *, include_som: bool) -> ParsedAction:
    """Parse a JSON-shaped response ('action_json' or 'som_bbox').

    Args:
        raw: Already-stripped VLM response text.
        include_som: When true, also copy ``UI_Elements`` and
            ``parsed_content_list`` from the parsed dict.

    Returns:
        A ``ParsedAction``. When ``extract_json`` finds no JSON object
        the result is a terminal one (``status='DONE'``, ``done=True``,
        ``action='none'``) whose reasoning is the first 500 characters
        of ``raw``, or ``'Empty VLM response'`` when ``raw`` is empty.
    """
    pa = ParsedAction(raw=raw)
    parsed = extract_json(raw)
    if parsed is None:
        # Fallback shape — treat as DONE so the loop terminates safely.
        pa.next_action = 'None'
        pa.status = 'DONE'
        pa.done = True
        pa.action = 'none'
        pa.reasoning = raw[:500] if raw else 'Empty VLM response'
        return pa

    # Model output is untrusted: 'Next Action' / 'Status' can come back
    # as a JSON bool, number or list. Coerce to str so the .upper() /
    # .lower() calls below cannot raise AttributeError. Falsy values
    # (None, '', 0, False) still collapse to ''.
    pa.next_action = str(parsed.get('Next Action') or '')
    pa.status = str(parsed.get('Status') or '')
    pa.reasoning = parsed.get('Reasoning', '') or raw[:500]
    pa.text = parsed.get('value', '') or ''
    pa.coordinate = parsed.get('coordinate')
    pa.box_id = parsed.get('Box ID')
    pa.done = pa.status.upper() == 'DONE'
    # Normalize action: 'left_click' → 'left_click', 'Left Click' → 'left_click'
    pa.action = (pa.next_action or 'none').lower().replace(' ', '_')
    if include_som:
        pa.ui_elements = parsed.get('UI_Elements', []) or []
        pa.parsed_content_list = parsed.get('parsed_content_list', []) or []
    return pa

274 

275 

def _parse_point_shape(
    raw: str, *,
    task: str,
    screen_w: Optional[int],
    screen_h: Optional[int],
    detect_action_type: Optional[Callable[[str, str], str]],
    scroll_down_keywords: tuple,
    scroll_up_keywords: tuple,
) -> ParsedAction:
    """Parse a free-text ('point_only') response.

    Checks are applied in this fixed order; the first hit wins:
      1. ``'DONE'`` appears anywhere in the upper-cased text → done.
      2. ``_TYPE_PREFIX_RE`` matches at the start → type action.
      3. No ``<point>`` in the text and ``_TYPE_FREETEXT_RE`` matches
         → type action (surrounding quotes stripped).
      4. Any scroll-down keyword in the lower-cased task or text →
         scroll_down; then the same for scroll-up keywords.
      5. ``_POINT_RE`` matches → pointer action at those coordinates.
      6. The first two digit runs are both within 0..1000 → pointer
         action ("fallback").
      7. Otherwise a warning is logged and ``action='none'``.

    The pointer action name comes from ``detect_action_type(task, raw)``
    when that callable is provided, else ``'left_click'``. Pixel
    coordinates are filled in only when both ``screen_w`` and
    ``screen_h`` are truthy.
    """
    result = ParsedAction(raw=raw)

    def _as_type(typed: str) -> ParsedAction:
        # Shared tail of both "type" branches.
        result.action = 'type'
        result.text = typed
        result.reasoning = f'type "{typed}"'
        return result

    def _as_pointer(kind: str, nx: int, ny: int, why: str) -> ParsedAction:
        # Shared tail of the <point> and number-pair branches.
        result.action = kind
        result.norm_x = nx
        result.norm_y = ny
        if screen_w and screen_h:
            result.screen_x = int(nx * screen_w / 1000)
            result.screen_y = int(ny * screen_h / 1000)
        result.reasoning = why
        return result

    # 1. Completion signal.
    if 'DONE' in raw.upper():
        result.action = 'done'
        result.done = True
        result.reasoning = raw
        return result

    # 2. Explicit "TYPE: ..." prefix.
    prefixed = _TYPE_PREFIX_RE.match(raw)
    if prefixed:
        return _as_type(prefixed.group(1).strip())

    # 3. Looser "type ..." phrasing — skipped when a point marker exists.
    if '<point>' not in raw:
        loose = _TYPE_FREETEXT_RE.search(raw)
        if loose:
            return _as_type(loose.group(1).strip().strip('"\''))

    # 4. Scroll intent, looked up in both the task and the response.
    lowered_raw = raw.lower()
    lowered_task = task.lower() if task else ''
    scroll_rules = (
        (scroll_down_keywords, 'scroll_down', 'scroll down'),
        (scroll_up_keywords, 'scroll_up', 'scroll up'),
    )
    for keywords, verb, why in scroll_rules:
        if any(kw in lowered_task or kw in lowered_raw for kw in keywords):
            result.action = verb
            result.reasoning = why
            return result

    # 5/6. Coordinates: <point> marker first, then bare number pair.
    if detect_action_type:
        kind = detect_action_type(task, raw)
    else:
        kind = 'left_click'

    marker = _POINT_RE.search(raw)
    if marker:
        nx, ny = int(marker.group(1)), int(marker.group(2))
        return _as_pointer(kind, nx, ny, f'{kind} at ({nx},{ny}) normalized')

    digit_runs = _NUMBER_RE.findall(raw)
    if len(digit_runs) >= 2:
        nx, ny = int(digit_runs[0]), int(digit_runs[1])
        if all(0 <= value <= 1000 for value in (nx, ny)):
            return _as_pointer(kind, nx, ny, f'fallback {kind} ({nx},{ny})')

    # 7. Nothing actionable.
    logger.warning(f"Could not parse point_only response: {raw[:100]}")
    result.action = 'none'
    result.reasoning = raw[:500] or 'unparseable point_only response'
    return result