Coverage for integrations/channels/media/links.py: 58.2%

153 statements  

coverage.py v7.14.0, created at 2026-05-12 04:49 +0000

1""" 

2Link Processor for URL handling. 

3 

4Provides URL detection, fetching, preview generation, and summarization. 

5""" 

6 

7import asyncio 

8import re 

9from dataclasses import dataclass, field 

10from enum import Enum 

11from typing import Optional, List, Dict, Any, Union 

12from urllib.parse import urlparse, urljoin 

13import logging 

14 

15logger = logging.getLogger(__name__) 

16 

17 

class LinkType(Enum):
    """Types of links."""
    WEBPAGE = "webpage"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    DOCUMENT = "document"
    SOCIAL = "social"
    EMBED = "embed"
    UNKNOWN = "unknown"


@dataclass
class OpenGraphData:
    """Open Graph metadata from a URL."""
    title: Optional[str] = None
    description: Optional[str] = None
    image: Optional[str] = None
    url: Optional[str] = None
    type: Optional[str] = None
    site_name: Optional[str] = None
    locale: Optional[str] = None
    video: Optional[str] = None
    audio: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return {
            "title": self.title,
            "description": self.description,
            "image": self.image,
            "url": self.url,
            "type": self.type,
            "site_name": self.site_name,
            "locale": self.locale,
            "video": self.video,
            "audio": self.audio
        }


@dataclass
class LinkPreview:
    """Preview data for a link."""
    url: str
    final_url: str
    title: Optional[str] = None
    description: Optional[str] = None
    image: Optional[str] = None
    favicon: Optional[str] = None
    site_name: Optional[str] = None
    link_type: LinkType = LinkType.WEBPAGE
    open_graph: Optional[OpenGraphData] = None
    twitter_card: Optional[Dict[str, str]] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "url": self.url,
            "final_url": self.final_url,
            "title": self.title,
            "description": self.description,
            "image": self.image,
            "favicon": self.favicon,
            "site_name": self.site_name,
            "link_type": self.link_type.value,
            "open_graph": self.open_graph.to_dict() if self.open_graph else None,
            "twitter_card": self.twitter_card,
            "metadata": self.metadata
        }


@dataclass
class FetchedContent:
    """Fetched content from a URL."""
    url: str
    final_url: str
    status_code: int
    content_type: str
    content: Union[str, bytes]
    headers: Dict[str, str] = field(default_factory=dict)
    encoding: Optional[str] = None
    size: int = 0
    load_time: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        return {
            "url": self.url,
            "final_url": self.final_url,
            "status_code": self.status_code,
            "content_type": self.content_type,
            "encoding": self.encoding,
            "size": self.size,
            "load_time": self.load_time,
            "headers": self.headers
        }


@dataclass
class LinkSummary:
    """Summary of link content."""
    url: str
    title: str
    summary: str
    key_points: List[str] = field(default_factory=list)
    topics: List[str] = field(default_factory=list)
    word_count: int = 0
    reading_time: int = 0  # in minutes

    def to_dict(self) -> Dict[str, Any]:
        return {
            "url": self.url,
            "title": self.title,
            "summary": self.summary,
            "key_points": self.key_points,
            "topics": self.topics,
            "word_count": self.word_count,
            "reading_time": self.reading_time
        }


@dataclass
class DetectedLink:
    """A detected link in text."""
    url: str
    start: int
    end: int
    text: Optional[str] = None
    link_type: LinkType = LinkType.UNKNOWN

    def to_dict(self) -> Dict[str, Any]:
        return {
            "url": self.url,
            "start": self.start,
            "end": self.end,
            "text": self.text,
            "link_type": self.link_type.value
        }
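
# A quick illustration of the serialization pattern shared by the dataclasses
# above (values are illustrative; 19 is the length of the URL):
#   DetectedLink(url="https://example.com", start=0, end=19).to_dict()
#   -> {"url": "https://example.com", "start": 0, "end": 19,
#       "text": None, "link_type": "unknown"}
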

class LinkProcessor:
    """
    Link processor for URL handling.

    Provides detection, fetching, preview generation, and summarization.
    """

    # URL pattern for detection
    URL_PATTERN = re.compile(
        r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:[/\w._~:/?#\[\]@!$&\'()*+,;=-]*)?',
        re.IGNORECASE
    )
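
    # Assumed behavior of the pattern above: matches stop at whitespace, and
    # since it uses only non-capturing groups, findall() returns whole URLs:
    #   URL_PATTERN.findall("see https://example.com/a and http://foo.bar")
    #   -> ['https://example.com/a', 'http://foo.bar']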

    # File extension to type mapping
    EXTENSION_TYPES = {
        'jpg': LinkType.IMAGE, 'jpeg': LinkType.IMAGE, 'png': LinkType.IMAGE,
        'gif': LinkType.IMAGE, 'webp': LinkType.IMAGE, 'svg': LinkType.IMAGE,
        'mp4': LinkType.VIDEO, 'webm': LinkType.VIDEO, 'avi': LinkType.VIDEO,
        'mov': LinkType.VIDEO, 'mkv': LinkType.VIDEO,
        'mp3': LinkType.AUDIO, 'wav': LinkType.AUDIO, 'ogg': LinkType.AUDIO,
        'flac': LinkType.AUDIO, 'm4a': LinkType.AUDIO,
        'pdf': LinkType.DOCUMENT, 'doc': LinkType.DOCUMENT, 'docx': LinkType.DOCUMENT,
        'xls': LinkType.DOCUMENT, 'xlsx': LinkType.DOCUMENT, 'ppt': LinkType.DOCUMENT,
        'pptx': LinkType.DOCUMENT, 'txt': LinkType.DOCUMENT
    }

    # Social media domains
    SOCIAL_DOMAINS = {
        'twitter.com', 'x.com', 'facebook.com', 'instagram.com',
        'linkedin.com', 'tiktok.com', 'youtube.com', 'youtu.be',
        'reddit.com', 'pinterest.com', 'tumblr.com'
    }

    def __init__(
        self,
        timeout: int = 30,
        max_size: int = 10 * 1024 * 1024,  # 10MB
        user_agent: Optional[str] = None,
        follow_redirects: bool = True,
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize link processor.

        Args:
            timeout: Request timeout in seconds
            max_size: Maximum content size to fetch
            user_agent: Custom user agent string
            follow_redirects: Whether to follow redirects
            config: Additional configuration options
        """
        self.timeout = timeout
        self.max_size = max_size
        self.user_agent = user_agent or "HevolveBot/1.0 LinkProcessor"
        self.follow_redirects = follow_redirects
        self.config = config or {}

    def detect(self, text: str) -> List[DetectedLink]:
        """
        Detect URLs in text.

        Args:
            text: Text to search for URLs

        Returns:
            List of detected links with positions
        """
        links = []
        for match in self.URL_PATTERN.finditer(text):
            url = match.group()
            link_type = self._determine_link_type(url)
            links.append(DetectedLink(
                url=url,
                start=match.start(),
                end=match.end(),
                text=url,
                link_type=link_type
            ))
        return links
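
    # Usage sketch (assumed; start/end are 0-based offsets into the text):
    #   proc = LinkProcessor()
    #   links = proc.detect("docs at https://example.com/guide.pdf")
    #   links[0].url       -> "https://example.com/guide.pdf"
    #   links[0].start     -> 8
    #   links[0].link_type -> LinkType.DOCUMENT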

    def _determine_link_type(self, url: str) -> LinkType:
        """Determine the type of a link."""
        parsed = urlparse(url)
        # removeprefix (not lstrip) so only a literal leading "www." is
        # dropped; lstrip('www.') would strip any run of those characters.
        domain = parsed.netloc.lower().removeprefix('www.')
        path = parsed.path.lower()

        # Check for social media
        if domain in self.SOCIAL_DOMAINS:
            return LinkType.SOCIAL

        # Check file extension
        if '.' in path:
            ext = path.rsplit('.', 1)[-1]
            if ext in self.EXTENSION_TYPES:
                return self.EXTENSION_TYPES[ext]

        return LinkType.WEBPAGE
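
    # Examples (assumed, following the tables above):
    #   _determine_link_type("https://youtu.be/abc")    -> LinkType.SOCIAL
    #   _determine_link_type("https://a.com/song.mp3")  -> LinkType.AUDIO
    #   _determine_link_type("https://a.com/page")      -> LinkType.WEBPAGE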

    async def fetch(
        self,
        url: str,
        headers: Optional[Dict[str, str]] = None
    ) -> FetchedContent:
        """
        Fetch content from a URL.

        Args:
            url: URL to fetch
            headers: Additional headers to send

        Returns:
            FetchedContent with the fetched data
        """
        import time
        start_time = time.time()

        # Would use aiohttp or httpx in real implementation
        # Simulated response for now
        return FetchedContent(
            url=url,
            final_url=url,
            status_code=200,
            content_type="text/html",
            content="",
            headers={},
            encoding="utf-8",
            size=0,
            load_time=time.time() - start_time
        )
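
    # A minimal sketch of what a real fetch could look like with aiohttp
    # (an assumption; aiohttp is not imported by this module as written):
    #
    #   async with aiohttp.ClientSession(
    #       headers={"User-Agent": self.user_agent, **(headers or {})},
    #       timeout=aiohttp.ClientTimeout(total=self.timeout),
    #   ) as session:
    #       async with session.get(url, allow_redirects=self.follow_redirects) as resp:
    #           body = await resp.read()
    #           if len(body) > self.max_size:
    #               raise ValueError("content exceeds max_size")
    #           return FetchedContent(
    #               url=url,
    #               final_url=str(resp.url),
    #               status_code=resp.status,
    #               content_type=resp.content_type or "",
    #               content=body,
    #               headers=dict(resp.headers),
    #               encoding=resp.charset,
    #               size=len(body),
    #               load_time=time.time() - start_time,
    #           )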

    async def preview(
        self,
        url: str,
        fetch_image: bool = True
    ) -> LinkPreview:
        """
        Generate a preview for a URL.

        Args:
            url: URL to preview
            fetch_image: Whether to validate/fetch preview image

        Returns:
            LinkPreview with metadata and preview data
        """
        link_type = self._determine_link_type(url)
        parsed = urlparse(url)
        domain = parsed.netloc.lower().removeprefix('www.')

        # Fetch content for preview
        content = await self.fetch(url)

        # Parse Open Graph and other metadata
        # Would extract from HTML in real implementation
        open_graph = OpenGraphData(
            url=url,
            title=domain,
            type="website"
        )

        return LinkPreview(
            url=url,
            final_url=content.final_url,
            title=domain,
            description=None,
            image=None,
            favicon=f"https://{domain}/favicon.ico",
            site_name=domain,
            link_type=link_type,
            open_graph=open_graph,
            metadata={
                "status_code": content.status_code,
                "content_type": content.content_type
            }
        )
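
    # With the stub fetch above, a preview is populated from the domain only
    # (illustrative):
    #   p = await proc.preview("https://news.example.com/story")
    #   p.site_name -> "news.example.com"
    #   p.favicon   -> "https://news.example.com/favicon.ico"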

    async def summarize(
        self,
        url: str,
        max_length: int = 500,
        include_key_points: bool = True
    ) -> LinkSummary:
        """
        Fetch and summarize content from a URL.

        Args:
            url: URL to summarize
            max_length: Maximum summary length in characters
            include_key_points: Whether to extract key points

        Returns:
            LinkSummary with content summary
        """
        # Fetch content
        content = await self.fetch(url)
        preview = await self.preview(url)

        # Would use LLM to summarize in real implementation
        return LinkSummary(
            url=url,
            title=preview.title or "",
            summary="",
            key_points=[],
            topics=[],
            word_count=0,
            reading_time=0
        )

    async def extract_text(self, url: str) -> str:
        """
        Extract readable text from a URL.

        Args:
            url: URL to extract text from

        Returns:
            Extracted text content
        """
        content = await self.fetch(url)

        if isinstance(content.content, str):
            # Would use readability/trafilatura for extraction
            return content.content

        return ""

    async def validate(self, url: str) -> Dict[str, Any]:
        """
        Validate a URL (check if accessible).

        Args:
            url: URL to validate

        Returns:
            Validation results
        """
        try:
            content = await self.fetch(url)
            return {
                "valid": content.status_code < 400,
                "status_code": content.status_code,
                "final_url": content.final_url,
                "content_type": content.content_type,
                "error": None
            }
        except Exception as e:
            return {
                "valid": False,
                "status_code": None,
                "final_url": None,
                "content_type": None,
                "error": str(e)
            }
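
    # With the stub fetch above, this always reports valid (illustrative):
    #   await proc.validate("https://example.com")
    #   -> {"valid": True, "status_code": 200,
    #       "final_url": "https://example.com",
    #       "content_type": "text/html", "error": None}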

    def normalize(self, url: str) -> str:
        """
        Normalize a URL.

        Args:
            url: URL to normalize

        Returns:
            Normalized URL
        """
        parsed = urlparse(url)

        # Ensure scheme
        if not parsed.scheme:
            url = f"https://{url}"
            parsed = urlparse(url)

        # Normalize to lowercase domain
        normalized = parsed._replace(
            netloc=parsed.netloc.lower()
        )

        return normalized.geturl()
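
    # Examples (assumed; urlparse lowercases the scheme itself):
    #   normalize("Example.COM/path")      -> "https://example.com/path"
    #   normalize("HTTP://Example.com/A")  -> "http://example.com/A"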

    def is_same_domain(self, url1: str, url2: str) -> bool:
        """Check if two URLs are from the same domain."""
        # removeprefix rather than lstrip: lstrip('www.') strips characters,
        # not the literal "www." prefix.
        domain1 = urlparse(url1).netloc.lower().removeprefix('www.')
        domain2 = urlparse(url2).netloc.lower().removeprefix('www.')
        return domain1 == domain2

    def get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        return urlparse(url).netloc.lower().removeprefix('www.')

    def is_safe(self, url: str) -> bool:
        """
        Check if a URL is potentially safe.

        Args:
            url: URL to check

        Returns:
            True if URL appears safe
        """
        parsed = urlparse(url)

        # Check for suspicious patterns
        suspicious_patterns = [
            'javascript:', 'data:', 'vbscript:',
            '.exe', '.scr', '.bat', '.cmd'
        ]

        url_lower = url.lower()
        for pattern in suspicious_patterns:
            if pattern in url_lower:
                return False

        # Must have valid scheme
        if parsed.scheme not in ('http', 'https'):
            return False

        return True
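
    # Examples (assumed, from the checks above):
    #   is_safe("https://example.com")     -> True
    #   is_safe("javascript:alert(1)")     -> False
    #   is_safe("https://x.io/setup.exe")  -> False


# A minimal end-to-end usage sketch (an assumed entry point, not part of the
# module as measured above):
if __name__ == "__main__":
    async def _demo() -> None:
        proc = LinkProcessor(timeout=10)
        for link in proc.detect("read https://example.com/intro"):
            print(link.to_dict())
        print(await proc.validate("https://example.com/intro"))

    asyncio.run(_demo())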