"""Tool helper services"""

import logging
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List
from urllib.parse import parse_qs, quote, urlparse

import httpx
from bs4 import BeautifulSoup
from ddgs import DDGS

logger = logging.getLogger(__name__)


class SearchService:
    """DuckDuckGo search using ddgs library"""

    def search(self, query: str, max_results: int = 5) -> List[dict]:
        """Run a text search and return normalized result dicts.

        Args:
            query: Search string passed to ``DDGS().text``.
            max_results: Forwarded to ``DDGS().text`` as ``max_results``.

        Returns:
            A list of dicts with the keys ``"title"``, ``"url"`` and
            ``"snippet"``. Missing fields default to ``""``. If the search
            raises, the failure is logged and whatever was collected so far
            (possibly an empty list) is returned.
        """
        results = []
        try:
            # Explicit loop (not a comprehension) so that results gathered
            # before a mid-iteration failure are still returned.
            for result in DDGS().text(query, max_results=max_results):
                results.append({
                    "title": result.get("title", ""),
                    "url": result.get("href", ""),
                    "snippet": result.get("body", ""),
                })
        except Exception as exc:
            # Search is best-effort: report the failure instead of hiding it,
            # but never propagate it to the caller.
            logger.warning("Search failed for query %r: %s", query, exc)
        return results


class FetchService:
    """Page fetch using httpx with concurrent support"""

    # Upper bound on worker threads used by fetch_batch.
    MAX_WORKERS = 5

    # Request headers sent with every fetch.
    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    def __init__(self, timeout: float = 15.0):
        """Store the timeout that is handed to ``httpx.Client`` per fetch."""
        self.timeout = timeout

    def fetch(self, url: str, extract_type: str = "text") -> dict:
        """Download one page and extract its title and visible text.

        Args:
            url: Page address. ``https://`` is prepended when the value does
                not already start with ``http://`` or ``https://``.
            extract_type: Not read by this method; accepted so existing
                callers that pass it keep working.

        Returns:
            On success, ``{"url", "title", "text"}`` where the title is cut
            to 500 characters and the text to 15000 characters. On any
            exception during the request or parsing, ``{"error": <message>}``.
        """
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        try:
            with httpx.Client(timeout=self.timeout, follow_redirects=True) as client:
                response = client.get(url, headers=self._HEADERS)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, "html.parser")

                # Remove script and style elements
                for element in soup(["script", "style"]):
                    element.decompose()

                # .string can be None even when a <title> tag exists, hence
                # the second truthiness check when building the result.
                title = soup.title.string if soup.title else ""
                text = soup.get_text(separator="\n", strip=True)

                return {
                    "url": url,
                    "title": title[:500] if title else "",
                    "text": text[:15000],
                }
        except Exception as e:
            # Failures are reported in-band so batch callers get one entry
            # per URL instead of an exception.
            return {"error": str(e)}

    def fetch_batch(
        self,
        urls: List[str],
        extract_type: str = "text",
        max_concurrent: int = 5,
    ) -> List[dict]:
        """Fetch several pages, using a thread pool when there is more than one.

        Args:
            urls: Addresses to fetch.
            extract_type: Forwarded unchanged to ``fetch``.
            max_concurrent: Requested number of worker threads; clamped to
                the range ``1..MAX_WORKERS``.

        Returns:
            One ``fetch`` result per input URL, in the same order as ``urls``.
        """
        # Zero or one URL: no point in starting a thread pool.
        if len(urls) <= 1:
            return [self.fetch(url, extract_type) for url in urls]

        results = [None] * len(urls)
        workers = min(max(max_concurrent, 1), self.MAX_WORKERS)

        with ThreadPoolExecutor(max_workers=workers) as pool:
            # Map each future to its input position so results can be stored
            # in input order even though they complete in arbitrary order.
            positions = {
                pool.submit(self.fetch, url, extract_type): index
                for index, url in enumerate(urls)
            }
            for future in as_completed(positions):
                results[positions[future]] = future.result()

        return results