Coverage for integrations/channels/media/links.py: 58.2%

153 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Link Processor for URL handling. 

3 

4Provides URL detection, fetching, preview generation, and summarization. 

5""" 

6 

7import asyncio 

8import re 

9from dataclasses import dataclass, field 

10from enum import Enum 

11from typing import Optional, List, Dict, Any, Union 

12from urllib.parse import urlparse, urljoin 

13import logging 

14 

15logger = logging.getLogger(__name__) 

16 

17 

class LinkType(Enum):
    """Types of links."""
    WEBPAGE = "webpage"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    DOCUMENT = "document"
    SOCIAL = "social"
    EMBED = "embed"
    UNKNOWN = "unknown"


@dataclass
class OpenGraphData:
    """Open Graph metadata from a URL."""
    title: Optional[str] = None
    description: Optional[str] = None
    image: Optional[str] = None
    url: Optional[str] = None
    type: Optional[str] = None
    site_name: Optional[str] = None
    locale: Optional[str] = None
    video: Optional[str] = None
    audio: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return {
            "title": self.title,
            "description": self.description,
            "image": self.image,
            "url": self.url,
            "type": self.type,
            "site_name": self.site_name,
            "locale": self.locale,
            "video": self.video,
            "audio": self.audio
        }


@dataclass
class LinkPreview:
    """Preview data for a link."""
    url: str
    final_url: str
    title: Optional[str] = None
    description: Optional[str] = None
    image: Optional[str] = None
    favicon: Optional[str] = None
    site_name: Optional[str] = None
    link_type: LinkType = LinkType.WEBPAGE
    open_graph: Optional[OpenGraphData] = None
    twitter_card: Optional[Dict[str, str]] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "url": self.url,
            "final_url": self.final_url,
            "title": self.title,
            "description": self.description,
            "image": self.image,
            "favicon": self.favicon,
            "site_name": self.site_name,
            "link_type": self.link_type.value,
            "open_graph": self.open_graph.to_dict() if self.open_graph else None,
            "twitter_card": self.twitter_card,
            "metadata": self.metadata
        }


@dataclass
class FetchedContent:
    """Fetched content from a URL."""
    url: str
    final_url: str
    status_code: int
    content_type: str
    content: Union[str, bytes]
    headers: Dict[str, str] = field(default_factory=dict)
    encoding: Optional[str] = None
    size: int = 0
    load_time: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        return {
            "url": self.url,
            "final_url": self.final_url,
            "status_code": self.status_code,
            "content_type": self.content_type,
            "encoding": self.encoding,
            "size": self.size,
            "load_time": self.load_time,
            "headers": self.headers
        }


@dataclass
class LinkSummary:
    """Summary of link content."""
    url: str
    title: str
    summary: str
    key_points: List[str] = field(default_factory=list)
    topics: List[str] = field(default_factory=list)
    word_count: int = 0
    reading_time: int = 0  # in minutes

    def to_dict(self) -> Dict[str, Any]:
        return {
            "url": self.url,
            "title": self.title,
            "summary": self.summary,
            "key_points": self.key_points,
            "topics": self.topics,
            "word_count": self.word_count,
            "reading_time": self.reading_time
        }


@dataclass
class DetectedLink:
    """A detected link in text."""
    url: str
    start: int
    end: int
    text: Optional[str] = None
    link_type: LinkType = LinkType.UNKNOWN

    def to_dict(self) -> Dict[str, Any]:
        return {
            "url": self.url,
            "start": self.start,
            "end": self.end,
            "text": self.text,
            "link_type": self.link_type.value
        }
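
# A quick illustration of the serialization pattern shared by the dataclasses
# above (values are illustrative; 19 is the length of the URL):
#   DetectedLink(url="https://example.com", start=0, end=19).to_dict()
#   -> {"url": "https://example.com", "start": 0, "end": 19,
#       "text": None, "link_type": "unknown"}
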

class LinkProcessor:
    """
    Link processor for URL handling.

    Provides detection, fetching, preview generation, and summarization.
    """

    # URL pattern for detection
    URL_PATTERN = re.compile(
        r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:[/\w._~:/?#\[\]@!$&\'()*+,;=-]*)?',
        re.IGNORECASE
    )
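
    # Assumed behavior of the pattern above: matches stop at whitespace, and
    # since it uses only non-capturing groups, findall() returns whole URLs:
    #   URL_PATTERN.findall("see https://example.com/a and http://foo.bar")
    #   -> ['https://example.com/a', 'http://foo.bar']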

    # File extension to type mapping
    EXTENSION_TYPES = {
        'jpg': LinkType.IMAGE, 'jpeg': LinkType.IMAGE, 'png': LinkType.IMAGE,
        'gif': LinkType.IMAGE, 'webp': LinkType.IMAGE, 'svg': LinkType.IMAGE,
        'mp4': LinkType.VIDEO, 'webm': LinkType.VIDEO, 'avi': LinkType.VIDEO,
        'mov': LinkType.VIDEO, 'mkv': LinkType.VIDEO,
        'mp3': LinkType.AUDIO, 'wav': LinkType.AUDIO, 'ogg': LinkType.AUDIO,
        'flac': LinkType.AUDIO, 'm4a': LinkType.AUDIO,
        'pdf': LinkType.DOCUMENT, 'doc': LinkType.DOCUMENT, 'docx': LinkType.DOCUMENT,
        'xls': LinkType.DOCUMENT, 'xlsx': LinkType.DOCUMENT, 'ppt': LinkType.DOCUMENT,
        'pptx': LinkType.DOCUMENT, 'txt': LinkType.DOCUMENT
    }

    # Social media domains
    SOCIAL_DOMAINS = {
        'twitter.com', 'x.com', 'facebook.com', 'instagram.com',
        'linkedin.com', 'tiktok.com', 'youtube.com', 'youtu.be',
        'reddit.com', 'pinterest.com', 'tumblr.com'
    }

    def __init__(
        self,
        timeout: int = 30,
        max_size: int = 10 * 1024 * 1024,  # 10MB
        user_agent: Optional[str] = None,
        follow_redirects: bool = True,
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize link processor.

        Args:
            timeout: Request timeout in seconds
            max_size: Maximum content size to fetch
            user_agent: Custom user agent string
            follow_redirects: Whether to follow redirects
            config: Additional configuration options
        """
        self.timeout = timeout
        self.max_size = max_size
        self.user_agent = user_agent or "HevolveBot/1.0 LinkProcessor"
        self.follow_redirects = follow_redirects
        self.config = config or {}

    def detect(self, text: str) -> List[DetectedLink]:
        """
        Detect URLs in text.

        Args:
            text: Text to search for URLs

        Returns:
            List of detected links with positions
        """
        links = []
        for match in self.URL_PATTERN.finditer(text):
            url = match.group()
            link_type = self._determine_link_type(url)
            links.append(DetectedLink(
                url=url,
                start=match.start(),
                end=match.end(),
                text=url,
                link_type=link_type
            ))
        return links
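
    # Usage sketch (assumed; start/end are 0-based offsets into the text):
    #   proc = LinkProcessor()
    #   links = proc.detect("docs at https://example.com/guide.pdf")
    #   links[0].url       -> "https://example.com/guide.pdf"
    #   links[0].start     -> 8
    #   links[0].link_type -> LinkType.DOCUMENT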

    def _determine_link_type(self, url: str) -> LinkType:
        """Determine the type of a link."""
        parsed = urlparse(url)
        # removeprefix (not lstrip) so only a literal leading "www." is
        # dropped; lstrip('www.') would strip any run of those characters.
        domain = parsed.netloc.lower().removeprefix('www.')
        path = parsed.path.lower()

        # Check for social media
        if domain in self.SOCIAL_DOMAINS:
            return LinkType.SOCIAL

        # Check file extension
        if '.' in path:
            ext = path.rsplit('.', 1)[-1]
            if ext in self.EXTENSION_TYPES:
                return self.EXTENSION_TYPES[ext]

        return LinkType.WEBPAGE
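
    # Examples (assumed, following the tables above):
    #   _determine_link_type("https://youtu.be/abc")    -> LinkType.SOCIAL
    #   _determine_link_type("https://a.com/song.mp3")  -> LinkType.AUDIO
    #   _determine_link_type("https://a.com/page")      -> LinkType.WEBPAGE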

    async def fetch(
        self,
        url: str,
        headers: Optional[Dict[str, str]] = None
    ) -> FetchedContent:
        """
        Fetch content from a URL.

        Args:
            url: URL to fetch
            headers: Additional headers to send

        Returns:
            FetchedContent with the fetched data
        """
        import time
        start_time = time.time()

        # Would use aiohttp or httpx in real implementation
        # Simulated response for now
        return FetchedContent(
            url=url,
            final_url=url,
            status_code=200,
            content_type="text/html",
            content="",
            headers={},
            encoding="utf-8",
            size=0,
            load_time=time.time() - start_time
        )
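
    # A minimal sketch of what a real fetch could look like with aiohttp
    # (an assumption; aiohttp is not imported by this module as written):
    #
    #   async with aiohttp.ClientSession(
    #       headers={"User-Agent": self.user_agent, **(headers or {})},
    #       timeout=aiohttp.ClientTimeout(total=self.timeout),
    #   ) as session:
    #       async with session.get(url, allow_redirects=self.follow_redirects) as resp:
    #           body = await resp.read()
    #           if len(body) > self.max_size:
    #               raise ValueError("content exceeds max_size")
    #           return FetchedContent(
    #               url=url,
    #               final_url=str(resp.url),
    #               status_code=resp.status,
    #               content_type=resp.content_type or "",
    #               content=body,
    #               headers=dict(resp.headers),
    #               encoding=resp.charset,
    #               size=len(body),
    #               load_time=time.time() - start_time,
    #           )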

    async def preview(
        self,
        url: str,
        fetch_image: bool = True
    ) -> LinkPreview:
        """
        Generate a preview for a URL.

        Args:
            url: URL to preview
            fetch_image: Whether to validate/fetch preview image

        Returns:
            LinkPreview with metadata and preview data
        """
        link_type = self._determine_link_type(url)
        parsed = urlparse(url)
        domain = parsed.netloc.lower().removeprefix('www.')

        # Fetch content for preview
        content = await self.fetch(url)

        # Parse Open Graph and other metadata
        # Would extract from HTML in real implementation
        open_graph = OpenGraphData(
            url=url,
            title=domain,
            type="website"
        )

        return LinkPreview(
            url=url,
            final_url=content.final_url,
            title=domain,
            description=None,
            image=None,
            favicon=f"https://{domain}/favicon.ico",
            site_name=domain,
            link_type=link_type,
            open_graph=open_graph,
            metadata={
                "status_code": content.status_code,
                "content_type": content.content_type
            }
        )
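
    # With the stub fetch above, a preview is populated from the domain only
    # (illustrative):
    #   p = await proc.preview("https://news.example.com/story")
    #   p.site_name -> "news.example.com"
    #   p.favicon   -> "https://news.example.com/favicon.ico"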

    async def summarize(
        self,
        url: str,
        max_length: int = 500,
        include_key_points: bool = True
    ) -> LinkSummary:
        """
        Fetch and summarize content from a URL.

        Args:
            url: URL to summarize
            max_length: Maximum summary length in characters
            include_key_points: Whether to extract key points

        Returns:
            LinkSummary with content summary
        """
        # Fetch content
        content = await self.fetch(url)
        preview = await self.preview(url)

        # Would use LLM to summarize in real implementation
        return LinkSummary(
            url=url,
            title=preview.title or "",
            summary="",
            key_points=[],
            topics=[],
            word_count=0,
            reading_time=0
        )

    async def extract_text(self, url: str) -> str:
        """
        Extract readable text from a URL.

        Args:
            url: URL to extract text from

        Returns:
            Extracted text content
        """
        content = await self.fetch(url)

        if isinstance(content.content, str):
            # Would use readability/trafilatura for extraction
            return content.content

        return ""

    async def validate(self, url: str) -> Dict[str, Any]:
        """
        Validate a URL (check if accessible).

        Args:
            url: URL to validate

        Returns:
            Validation results
        """
        try:
            content = await self.fetch(url)
            return {
                "valid": content.status_code < 400,
                "status_code": content.status_code,
                "final_url": content.final_url,
                "content_type": content.content_type,
                "error": None
            }
        except Exception as e:
            return {
                "valid": False,
                "status_code": None,
                "final_url": None,
                "content_type": None,
                "error": str(e)
            }
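
    # With the stub fetch above, this always reports valid (illustrative):
    #   await proc.validate("https://example.com")
    #   -> {"valid": True, "status_code": 200,
    #       "final_url": "https://example.com",
    #       "content_type": "text/html", "error": None}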

    def normalize(self, url: str) -> str:
        """
        Normalize a URL.

        Args:
            url: URL to normalize

        Returns:
            Normalized URL
        """
        parsed = urlparse(url)

        # Ensure scheme
        if not parsed.scheme:
            url = f"https://{url}"
            parsed = urlparse(url)

        # Normalize to lowercase domain
        normalized = parsed._replace(
            netloc=parsed.netloc.lower()
        )

        return normalized.geturl()
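
    # Examples (assumed; urlparse lowercases the scheme itself):
    #   normalize("Example.COM/path")      -> "https://example.com/path"
    #   normalize("HTTP://Example.com/A")  -> "http://example.com/A"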

    def is_same_domain(self, url1: str, url2: str) -> bool:
        """Check if two URLs are from the same domain."""
        # removeprefix rather than lstrip: lstrip('www.') strips characters,
        # not the literal "www." prefix.
        domain1 = urlparse(url1).netloc.lower().removeprefix('www.')
        domain2 = urlparse(url2).netloc.lower().removeprefix('www.')
        return domain1 == domain2

    def get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        return urlparse(url).netloc.lower().removeprefix('www.')

    def is_safe(self, url: str) -> bool:
        """
        Check if a URL is potentially safe.

        Args:
            url: URL to check

        Returns:
            True if URL appears safe
        """
        parsed = urlparse(url)

        # Check for suspicious patterns
        suspicious_patterns = [
            'javascript:', 'data:', 'vbscript:',
            '.exe', '.scr', '.bat', '.cmd'
        ]

        url_lower = url.lower()
        for pattern in suspicious_patterns:
            if pattern in url_lower:
                return False

        # Must have valid scheme
        if parsed.scheme not in ('http', 'https'):
            return False

        return True
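
    # Examples (assumed, from the checks above):
    #   is_safe("https://example.com")     -> True
    #   is_safe("javascript:alert(1)")     -> False
    #   is_safe("https://x.io/setup.exe")  -> False


# A minimal end-to-end usage sketch (an assumed entry point, not part of the
# module as measured above):
if __name__ == "__main__":
    async def _demo() -> None:
        proc = LinkProcessor(timeout=10)
        for link in proc.detect("read https://example.com/intro"):
            print(link.to_dict())
        print(await proc.validate("https://example.com/intro"))

    asyncio.run(_demo())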