diff --git a/luxx/tools/services.py b/luxx/tools/services.py index b7fe420..93fff24 100644 --- a/luxx/tools/services.py +++ b/luxx/tools/services.py @@ -5,84 +5,49 @@ from urllib.parse import parse_qs, urlparse, quote from typing import List from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup +from ddgs import DDGS class SearchService: - """DuckDuckGo search""" + """DuckDuckGo search using ddgs library""" def search(self, query: str, max_results: int = 5) -> List[dict]: - headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"} - url = f"https://html.duckduckgo.com/html/?q={quote(query)}" - - try: - resp = httpx.get(url, headers=headers, timeout=15, follow_redirects=True) - resp.raise_for_status() - except Exception: - return [] - - soup = BeautifulSoup(resp.text, "html.parser") results = [] - - for result in soup.select(".result")[:max_results]: - title_elem = result.select_one(".result__title a") - snippet_elem = result.select_one(".result__snippet") - - if title_elem: - raw_url = title_elem.get("href", "") - if "uddg=" in raw_url: - params = parse_qs(urlparse(raw_url).query) - clean_url = params.get("uddg", [raw_url])[0] - else: - clean_url = raw_url - + try: + for result in DDGS().text(query, max_results=max_results): results.append({ - "title": title_elem.get_text(strip=True), - "url": clean_url, - "snippet": snippet_elem.get_text(strip=True) if snippet_elem else "" + "title": result.get("title", ""), + "url": result.get("href", ""), + "snippet": result.get("body", "") }) - + except Exception: + pass return results class FetchService: - """Page fetch with concurrent support""" + """Page fetch with concurrent support""" def __init__(self, timeout: float = 15.0): self.timeout = timeout - self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" def fetch(self, url: str, extract_type: str = "text") -> dict: if not url.startswith(("http://", "https://")): 
url = "https://" + url try: - resp = httpx.get(url, timeout=self.timeout, follow_redirects=True, headers={"User-Agent": self.user_agent}) - resp.raise_for_status() - except httpx.TimeoutException: - return {"error": "Request timeout"} + resp = httpx.get(url, timeout=self.timeout, follow_redirects=True) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + title = soup.title.string if soup.title else "" + for tag in soup(["script", "style", "nav", "footer", "header", "aside"]): + tag.decompose() + text = soup.get_text(separator="\n", strip=True) + return {"url": url, "title": title, "text": text[:15000]} except Exception as e: return {"error": str(e)} - soup = BeautifulSoup(resp.text, "html.parser") - title = soup.title.string if soup.title else "" - - # Remove noise - for tag in soup(["script", "style", "nav", "footer", "header", "aside"]): - tag.decompose() - - if extract_type == "links": - links = [{"text": a.get_text(strip=True), "url": a["href"]} - for a in soup.find_all("a", href=True) - if a.get_text(strip=True) and not a["href"].startswith(("#", "javascript:"))] - return {"url": url, "links": links[:50]} - - text = re.sub(r"\n{3,}", "\n\n", soup.get_text(separator="\n", strip=True)) - - if extract_type == "structured": - meta_desc = soup.find("meta", attrs={"name": "description"}) - return {"url": url, "title": title, "description": (meta_desc.get("content", "") if meta_desc else ""), "text": text[:5000]} - - return {"url": url, "title": title, "text": text[:15000]} + return {"url": url, "title": "", "text": ""} def fetch_batch(self, urls: List[str], extract_type: str = "text", max_concurrent: int = 5) -> List[dict]: if len(urls) <= 1: diff --git a/pyproject.toml b/pyproject.toml index 63b4d31..e64e109 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ dependencies = [ "email-validator>=2.1.0", "shortuuid>=1.0.11", "sse-starlette>=2.0.0", + "ddgs>=5.0.0", ] [project.optional-dependencies]