"""Tool helper services""" import re import httpx from urllib.parse import parse_qs, urlparse, quote from typing import List from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup class SearchService: """DuckDuckGo search""" def search(self, query: str, max_results: int = 5) -> List[dict]: headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"} url = f"https://html.duckduckgo.com/html/?q={quote(query)}" try: resp = httpx.get(url, headers=headers, timeout=15, follow_redirects=True) resp.raise_for_status() except Exception: return [] soup = BeautifulSoup(resp.text, "html.parser") results = [] for result in soup.select(".result")[:max_results]: title_elem = result.select_one(".result__title a") snippet_elem = result.select_one(".result__snippet") if title_elem: raw_url = title_elem.get("href", "") if "uddg=" in raw_url: params = parse_qs(urlparse(raw_url).query) clean_url = params.get("uddg", [raw_url])[0] else: clean_url = raw_url results.append({ "title": title_elem.get_text(strip=True), "url": clean_url, "snippet": snippet_elem.get_text(strip=True) if snippet_elem else "" }) return results class FetchService: """Page fetch with concurrent support""" def __init__(self, timeout: float = 15.0): self.timeout = timeout self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" def fetch(self, url: str, extract_type: str = "text") -> dict: if not url.startswith(("http://", "https://")): url = "https://" + url try: resp = httpx.get(url, timeout=self.timeout, follow_redirects=True, headers={"User-Agent": self.user_agent}) resp.raise_for_status() except httpx.TimeoutException: return {"error": "Request timeout"} except Exception as e: return {"error": str(e)} soup = BeautifulSoup(resp.text, "html.parser") title = soup.title.string if soup.title else "" # Remove noise for tag in soup(["script", "style", "nav", "footer", "header", "aside"]): tag.decompose() if extract_type == "links": links = [{"text": a.get_text(strip=True), "url": a["href"]} for a in soup.find_all("a", href=True) if a.get_text(strip=True) and not a["href"].startswith(("#", "javascript:"))] return {"url": url, "links": links[:50]} text = re.sub(r"\n{3,}", "\n\n", soup.get_text(separator="\n", strip=True)) if extract_type == "structured": meta_desc = soup.find("meta", attrs={"name": "description"}) return {"url": url, "title": title, "description": (meta_desc.get("content", "") if meta_desc else ""), "text": text[:5000]} return {"url": url, "title": title, "text": text[:15000]} def fetch_batch(self, urls: List[str], extract_type: str = "text", max_concurrent: int = 5) -> List[dict]: if len(urls) <= 1: return [self.fetch(url, extract_type) for url in urls] results = [None] * len(urls) max_concurrent = min(max(max_concurrent, 1), 5) with ThreadPoolExecutor(max_workers=max_concurrent) as pool: futures = {pool.submit(self.fetch, url, extract_type): i for i, url in enumerate(urls)} for future in as_completed(futures): results[futures[future]] = future.result() return results