"""Tool helper services"""
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List
from urllib.parse import parse_qs, quote, urlparse

import httpx
from bs4 import BeautifulSoup
from ddgs import DDGS


class SearchService:
    """Text web search backed by the DuckDuckGo `ddgs` client."""

    def search(self, query: str, max_results: int = 5) -> List[dict]:
        """Search for *query* and return up to *max_results* hits.

        Each hit is a dict with "title", "url" and "snippet" keys.
        Lookups are best-effort: if the backend raises at any point,
        whatever hits were collected so far are returned.
        """
        hits: List[dict] = []
        try:
            for item in DDGS().text(query, max_results=max_results):
                hits.append(
                    {
                        "title": item.get("title", ""),
                        "url": item.get("href", ""),
                        "snippet": item.get("body", ""),
                    }
                )
        except Exception:
            # Best-effort: swallow backend failures, keep partial results.
            pass
        return hits
|
|
|
|
|
|
class FetchService:
    """Fetch web pages with httpx and extract readable text.

    Supports single-page fetches and order-preserving concurrent batch
    fetches via a small thread pool.
    """

    def __init__(self, timeout: float = 15.0):
        # Per-request timeout in seconds, passed to the httpx client.
        self.timeout = timeout

    def fetch(self, url: str, extract_type: str = "text") -> dict:
        """Fetch *url* and return its title and visible text.

        Returns {"url", "title", "text"} on success, or {"url", "error"}
        on any failure (the url key lets batch callers attribute the
        failure to its input; previously error dicts omitted it).

        A missing scheme defaults to https://.  *extract_type* is kept
        for interface compatibility; only plain-text extraction is
        implemented.
        """
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
            with httpx.Client(timeout=self.timeout, follow_redirects=True) as client:
                response = client.get(url, headers=headers)
                response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")

            # Remove non-content elements before extracting text.
            for element in soup(["script", "style"]):
                element.decompose()

            title = soup.title.string if soup.title else ""
            text = soup.get_text(separator="\n", strip=True)

            # Cap title/text sizes to keep payloads bounded.
            return {
                "url": url,
                "title": title[:500] if title else "",
                "text": text[:15000],
            }
        except Exception as e:
            return {"url": url, "error": str(e)}
        # NOTE: the original had an unreachable trailing return here
        # (both branches above return); it has been removed.

    def fetch_batch(self, urls: List[str], extract_type: str = "text", max_concurrent: int = 5) -> List[dict]:
        """Fetch several urls concurrently, preserving input order.

        *max_concurrent* is clamped to the 1..5 range.  Each element of
        the result matches the dict shape produced by fetch().
        """
        # Zero or one url: no point spinning up a pool.
        if len(urls) <= 1:
            return [self.fetch(url, extract_type) for url in urls]

        results: List[dict] = [None] * len(urls)
        max_concurrent = min(max(max_concurrent, 1), 5)

        with ThreadPoolExecutor(max_workers=max_concurrent) as pool:
            futures = {pool.submit(self.fetch, url, extract_type): i for i, url in enumerate(urls)}
            for future in as_completed(futures):
                index = futures[future]
                try:
                    results[index] = future.result()
                except Exception as e:
                    # Mirror fetch()'s error shape, including the url.
                    results[index] = {"url": urls[index], "error": str(e)}

        return results
|