Coverage for integrations / channels / media / links.py: 58.2%
153 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-12 04:49 +0000
1"""
2Link Processor for URL handling.
4Provides URL detection, fetching, preview generation, and summarization.
5"""
7import asyncio
8import re
9from dataclasses import dataclass, field
10from enum import Enum
11from typing import Optional, List, Dict, Any, Union
12from urllib.parse import urlparse, urljoin
13import logging
15logger = logging.getLogger(__name__)
18class LinkType(Enum):
19 """Types of links."""
20 WEBPAGE = "webpage"
21 IMAGE = "image"
22 VIDEO = "video"
23 AUDIO = "audio"
24 DOCUMENT = "document"
25 SOCIAL = "social"
26 EMBED = "embed"
27 UNKNOWN = "unknown"
30@dataclass
31class OpenGraphData:
32 """Open Graph metadata from a URL."""
33 title: Optional[str] = None
34 description: Optional[str] = None
35 image: Optional[str] = None
36 url: Optional[str] = None
37 type: Optional[str] = None
38 site_name: Optional[str] = None
39 locale: Optional[str] = None
40 video: Optional[str] = None
41 audio: Optional[str] = None
43 def to_dict(self) -> Dict[str, Any]:
44 return {
45 "title": self.title,
46 "description": self.description,
47 "image": self.image,
48 "url": self.url,
49 "type": self.type,
50 "site_name": self.site_name,
51 "locale": self.locale,
52 "video": self.video,
53 "audio": self.audio
54 }
57@dataclass
58class LinkPreview:
59 """Preview data for a link."""
60 url: str
61 final_url: str
62 title: Optional[str] = None
63 description: Optional[str] = None
64 image: Optional[str] = None
65 favicon: Optional[str] = None
66 site_name: Optional[str] = None
67 link_type: LinkType = LinkType.WEBPAGE
68 open_graph: Optional[OpenGraphData] = None
69 twitter_card: Optional[Dict[str, str]] = None
70 metadata: Dict[str, Any] = field(default_factory=dict)
72 def to_dict(self) -> Dict[str, Any]:
73 return {
74 "url": self.url,
75 "final_url": self.final_url,
76 "title": self.title,
77 "description": self.description,
78 "image": self.image,
79 "favicon": self.favicon,
80 "site_name": self.site_name,
81 "link_type": self.link_type.value,
82 "open_graph": self.open_graph.to_dict() if self.open_graph else None,
83 "twitter_card": self.twitter_card,
84 "metadata": self.metadata
85 }
88@dataclass
89class FetchedContent:
90 """Fetched content from a URL."""
91 url: str
92 final_url: str
93 status_code: int
94 content_type: str
95 content: Union[str, bytes]
96 headers: Dict[str, str] = field(default_factory=dict)
97 encoding: Optional[str] = None
98 size: int = 0
99 load_time: float = 0.0
101 def to_dict(self) -> Dict[str, Any]:
102 return {
103 "url": self.url,
104 "final_url": self.final_url,
105 "status_code": self.status_code,
106 "content_type": self.content_type,
107 "encoding": self.encoding,
108 "size": self.size,
109 "load_time": self.load_time,
110 "headers": self.headers
111 }
114@dataclass
115class LinkSummary:
116 """Summary of link content."""
117 url: str
118 title: str
119 summary: str
120 key_points: List[str] = field(default_factory=list)
121 topics: List[str] = field(default_factory=list)
122 word_count: int = 0
123 reading_time: int = 0 # in minutes
125 def to_dict(self) -> Dict[str, Any]:
126 return {
127 "url": self.url,
128 "title": self.title,
129 "summary": self.summary,
130 "key_points": self.key_points,
131 "topics": self.topics,
132 "word_count": self.word_count,
133 "reading_time": self.reading_time
134 }
137@dataclass
138class DetectedLink:
139 """A detected link in text."""
140 url: str
141 start: int
142 end: int
143 text: Optional[str] = None
144 link_type: LinkType = LinkType.UNKNOWN
146 def to_dict(self) -> Dict[str, Any]:
147 return {
148 "url": self.url,
149 "start": self.start,
150 "end": self.end,
151 "text": self.text,
152 "link_type": self.link_type.value
153 }
156class LinkProcessor:
157 """
158 Link processor for URL handling.
160 Provides detection, fetching, preview generation, and summarization.
161 """
163 # URL pattern for detection
164 URL_PATTERN = re.compile(
165 r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:[/\w._~:/?#\[\]@!$&\'()*+,;=-]*)?',
166 re.IGNORECASE
167 )
169 # File extension to type mapping
170 EXTENSION_TYPES = {
171 'jpg': LinkType.IMAGE, 'jpeg': LinkType.IMAGE, 'png': LinkType.IMAGE,
172 'gif': LinkType.IMAGE, 'webp': LinkType.IMAGE, 'svg': LinkType.IMAGE,
173 'mp4': LinkType.VIDEO, 'webm': LinkType.VIDEO, 'avi': LinkType.VIDEO,
174 'mov': LinkType.VIDEO, 'mkv': LinkType.VIDEO,
175 'mp3': LinkType.AUDIO, 'wav': LinkType.AUDIO, 'ogg': LinkType.AUDIO,
176 'flac': LinkType.AUDIO, 'm4a': LinkType.AUDIO,
177 'pdf': LinkType.DOCUMENT, 'doc': LinkType.DOCUMENT, 'docx': LinkType.DOCUMENT,
178 'xls': LinkType.DOCUMENT, 'xlsx': LinkType.DOCUMENT, 'ppt': LinkType.DOCUMENT,
179 'pptx': LinkType.DOCUMENT, 'txt': LinkType.DOCUMENT
180 }
182 # Social media domains
183 SOCIAL_DOMAINS = {
184 'twitter.com', 'x.com', 'facebook.com', 'instagram.com',
185 'linkedin.com', 'tiktok.com', 'youtube.com', 'youtu.be',
186 'reddit.com', 'pinterest.com', 'tumblr.com'
187 }
189 def __init__(
190 self,
191 timeout: int = 30,
192 max_size: int = 10 * 1024 * 1024, # 10MB
193 user_agent: Optional[str] = None,
194 follow_redirects: bool = True,
195 config: Optional[Dict[str, Any]] = None
196 ):
197 """
198 Initialize link processor.
200 Args:
201 timeout: Request timeout in seconds
202 max_size: Maximum content size to fetch
203 user_agent: Custom user agent string
204 follow_redirects: Whether to follow redirects
205 config: Additional configuration options
206 """
207 self.timeout = timeout
208 self.max_size = max_size
209 self.user_agent = user_agent or "HevolveBot/1.0 LinkProcessor"
210 self.follow_redirects = follow_redirects
211 self.config = config or {}
213 def detect(self, text: str) -> List[DetectedLink]:
214 """
215 Detect URLs in text.
217 Args:
218 text: Text to search for URLs
220 Returns:
221 List of detected links with positions
222 """
223 links = []
224 for match in self.URL_PATTERN.finditer(text):
225 url = match.group()
226 link_type = self._determine_link_type(url)
227 links.append(DetectedLink(
228 url=url,
229 start=match.start(),
230 end=match.end(),
231 text=url,
232 link_type=link_type
233 ))
234 return links
236 def _determine_link_type(self, url: str) -> LinkType:
237 """Determine the type of a link."""
238 parsed = urlparse(url)
239 domain = parsed.netloc.lower().lstrip('www.')
240 path = parsed.path.lower()
242 # Check for social media
243 if domain in self.SOCIAL_DOMAINS:
244 return LinkType.SOCIAL
246 # Check file extension
247 if '.' in path:
248 ext = path.rsplit('.', 1)[-1]
249 if ext in self.EXTENSION_TYPES:
250 return self.EXTENSION_TYPES[ext]
252 return LinkType.WEBPAGE
254 async def fetch(
255 self,
256 url: str,
257 headers: Optional[Dict[str, str]] = None
258 ) -> FetchedContent:
259 """
260 Fetch content from a URL.
262 Args:
263 url: URL to fetch
264 headers: Additional headers to send
266 Returns:
267 FetchedContent with the fetched data
268 """
269 import time
270 start_time = time.time()
272 # Would use aiohttp or httpx in real implementation
273 # Simulated response for now
274 return FetchedContent(
275 url=url,
276 final_url=url,
277 status_code=200,
278 content_type="text/html",
279 content="",
280 headers={},
281 encoding="utf-8",
282 size=0,
283 load_time=time.time() - start_time
284 )
286 async def preview(
287 self,
288 url: str,
289 fetch_image: bool = True
290 ) -> LinkPreview:
291 """
292 Generate a preview for a URL.
294 Args:
295 url: URL to preview
296 fetch_image: Whether to validate/fetch preview image
298 Returns:
299 LinkPreview with metadata and preview data
300 """
301 link_type = self._determine_link_type(url)
302 parsed = urlparse(url)
303 domain = parsed.netloc.lower().lstrip('www.')
305 # Fetch content for preview
306 content = await self.fetch(url)
308 # Parse Open Graph and other metadata
309 # Would extract from HTML in real implementation
310 open_graph = OpenGraphData(
311 url=url,
312 title=domain,
313 type="website"
314 )
316 return LinkPreview(
317 url=url,
318 final_url=content.final_url,
319 title=domain,
320 description=None,
321 image=None,
322 favicon=f"https://{domain}/favicon.ico",
323 site_name=domain,
324 link_type=link_type,
325 open_graph=open_graph,
326 metadata={
327 "status_code": content.status_code,
328 "content_type": content.content_type
329 }
330 )
332 async def summarize(
333 self,
334 url: str,
335 max_length: int = 500,
336 include_key_points: bool = True
337 ) -> LinkSummary:
338 """
339 Fetch and summarize content from a URL.
341 Args:
342 url: URL to summarize
343 max_length: Maximum summary length in characters
344 include_key_points: Whether to extract key points
346 Returns:
347 LinkSummary with content summary
348 """
349 # Fetch content
350 content = await self.fetch(url)
351 preview = await self.preview(url)
353 # Would use LLM to summarize in real implementation
354 return LinkSummary(
355 url=url,
356 title=preview.title or "",
357 summary="",
358 key_points=[],
359 topics=[],
360 word_count=0,
361 reading_time=0
362 )
364 async def extract_text(self, url: str) -> str:
365 """
366 Extract readable text from a URL.
368 Args:
369 url: URL to extract text from
371 Returns:
372 Extracted text content
373 """
374 content = await self.fetch(url)
376 if isinstance(content.content, str):
377 # Would use readability/trafilatura for extraction
378 return content.content
380 return ""
382 async def validate(self, url: str) -> Dict[str, Any]:
383 """
384 Validate a URL (check if accessible).
386 Args:
387 url: URL to validate
389 Returns:
390 Validation results
391 """
392 try:
393 content = await self.fetch(url)
394 return {
395 "valid": content.status_code < 400,
396 "status_code": content.status_code,
397 "final_url": content.final_url,
398 "content_type": content.content_type,
399 "error": None
400 }
401 except Exception as e:
402 return {
403 "valid": False,
404 "status_code": None,
405 "final_url": None,
406 "content_type": None,
407 "error": str(e)
408 }
410 def normalize(self, url: str) -> str:
411 """
412 Normalize a URL.
414 Args:
415 url: URL to normalize
417 Returns:
418 Normalized URL
419 """
420 parsed = urlparse(url)
422 # Ensure scheme
423 if not parsed.scheme:
424 url = f"https://{url}"
425 parsed = urlparse(url)
427 # Normalize to lowercase domain
428 normalized = parsed._replace(
429 netloc=parsed.netloc.lower()
430 )
432 return normalized.geturl()
434 def is_same_domain(self, url1: str, url2: str) -> bool:
435 """Check if two URLs are from the same domain."""
436 domain1 = urlparse(url1).netloc.lower().lstrip('www.')
437 domain2 = urlparse(url2).netloc.lower().lstrip('www.')
438 return domain1 == domain2
440 def get_domain(self, url: str) -> str:
441 """Extract domain from URL."""
442 return urlparse(url).netloc.lower().lstrip('www.')
444 def is_safe(self, url: str) -> bool:
445 """
446 Check if a URL is potentially safe.
448 Args:
449 url: URL to check
451 Returns:
452 True if URL appears safe
453 """
454 parsed = urlparse(url)
456 # Check for suspicious patterns
457 suspicious_patterns = [
458 'javascript:', 'data:', 'vbscript:',
459 '.exe', '.scr', '.bat', '.cmd'
460 ]
462 url_lower = url.lower()
463 for pattern in suspicious_patterns:
464 if pattern in url_lower:
465 return False
467 # Must have valid scheme
468 if parsed.scheme not in ('http', 'https'):
469 return False
471 return True