fix: 修复爬虫功能

2026-04-13 15:19:18 +08:00 · 2026-04-13 15:19:18 +08:00 · 61bf235a15
parent f10c5de950
commit 61bf235a15
2 changed files with 33 additions and 54 deletions
--- a/luxx/tools/services.py
+++ b/luxx/tools/services.py
@@ -5,84 +5,62 @@ from urllib.parse import parse_qs, urlparse, quote
 from typing import List
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
 from ddgs import DDGS
 class SearchService:
-    """DuckDuckGo search"""
+    """DuckDuckGo search using ddgs library"""
    def search(self, query: str, max_results: int = 5) -> List[dict]:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
        try:
            resp = httpx.get(url, headers=headers, timeout=15, follow_redirects=True)
            resp.raise_for_status()
        except Exception:
            return []
        soup = BeautifulSoup(resp.text, "html.parser")
        results = []
-
+        try:
-        for result in soup.select(".result")[:max_results]:
+            for result in DDGS().text(query, max_results=max_results):
            title_elem = result.select_one(".result__title a")
            snippet_elem = result.select_one(".result__snippet")
            if title_elem:
                raw_url = title_elem.get("href", "")
                if "uddg=" in raw_url:
                    params = parse_qs(urlparse(raw_url).query)
                    clean_url = params.get("uddg", [raw_url])[0]
                else:
                    clean_url = raw_url
                results.append({
-                    "title": title_elem.get_text(strip=True),
+                    "title": result.get("title", ""),
-                    "url": clean_url,
+                    "url": result.get("href", ""),
-                    "snippet": snippet_elem.get_text(strip=True) if snippet_elem else ""
+                    "snippet": result.get("body", "")
                })
-
+        except Exception:
            pass
        return results
 class FetchService:
-    """Page fetch with concurrent support"""
+    """Page fetch using httpx with concurrent support"""
    def __init__(self, timeout: float = 15.0):
        self.timeout = timeout
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    def fetch(self, url: str, extract_type: str = "text") -> dict:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url
        try:
-            resp = httpx.get(url, timeout=self.timeout, follow_redirects=True, headers={"User-Agent": self.user_agent})
+            headers = {
-            resp.raise_for_status()
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-        except httpx.TimeoutException:
+            }
-            return {"error": "Request timeout"}
+            with httpx.Client(timeout=self.timeout, follow_redirects=True) as client:
                response = client.get(url, headers=headers)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")
                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.decompose()
                title = soup.title.string if soup.title else ""
                text = soup.get_text(separator="\n", strip=True)
                return {
                    "url": url,
                    "title": title[:500] if title else "",
                    "text": text[:15000]
                }
        except Exception as e:
            return {"error": str(e)}
-        soup = BeautifulSoup(resp.text, "html.parser")
+        return {"url": url, "title": "", "text": ""}
        title = soup.title.string if soup.title else ""
        # Remove noise
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        if extract_type == "links":
            links = [{"text": a.get_text(strip=True), "url": a["href"]}
                     for a in soup.find_all("a", href=True)
                     if a.get_text(strip=True) and not a["href"].startswith(("#", "javascript:"))]
            return {"url": url, "links": links[:50]}
        text = re.sub(r"\n{3,}", "\n\n", soup.get_text(separator="\n", strip=True))
        if extract_type == "structured":
            meta_desc = soup.find("meta", attrs={"name": "description"})
            return {"url": url, "title": title, "description": (meta_desc.get("content", "") if meta_desc else ""), "text": text[:5000]}
        return {"url": url, "title": title, "text": text[:15000]}
    def fetch_batch(self, urls: List[str], extract_type: str = "text", max_concurrent: int = 5) -> List[dict]:
        if len(urls) <= 1:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
    "email-validator>=2.1.0",
    "shortuuid>=1.0.11",
    "sse-starlette>=2.0.0",
+    "ddgs>=5.0.0",
 ]
 [project.optional-dependencies]