feat: 优化爬虫设置

2026-04-18 11:15:49 +08:00 · 2026-04-18 11:15:49 +08:00 · bad1a06ef1
parent dc08267c15
commit bad1a06ef1
2 changed files with 197 additions and 44 deletions
--- a/luxx/tools/builtin/crawler.py
+++ b/luxx/tools/builtin/crawler.py
@@ -2,15 +2,17 @@
 from luxx.tools.factory import tool
 from luxx.tools.services import SearchService, FetchService

-# 服务实例（SearchService.search() 是静态方法风格，不需要实例化）
+
 _fetch_service = FetchService()
+_search_service = SearchService()


@tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
    "type": "object",
    "properties": {
        "query": {"type": "string", "description": "Search keywords"},
-        "max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5}
+        "max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5},
+        "region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"}
    },
    "required": ["query"]
 }, required_params=["query"], category="crawler")
@@ -23,8 +25,9 @@ def web_search(arguments: dict):
    """
    query = arguments["query"]
    max_results = arguments.get("max_results", 5)
+    region = arguments.get("region", "cn-zh")
+    results = _search_service.search(query, max_results, region)
    
-    results = SearchService().search(query, max_results)
    return {
        "query": query,
        "count": len(results),
@@ -35,7 +38,8 @@ def web_search(arguments: dict):
@tool(name="web_fetch", description="Fetch content from a webpage.", parameters={
    "type": "object",
    "properties": {
-        "url": {"type": "string", "description": "URL to fetch"}
+        "url": {"type": "string", "description": "URL to fetch"},
+        "extract_type": {"type": "string", "description": "Extraction type: text, links, or structured", "enum": ["text", "links", "structured"], "default": "text"}
    },
    "required": ["url"]
 }, required_params=["url"], category="crawler")
@@ -44,17 +48,19 @@ def web_fetch(arguments: dict):
    Fetch webpage content
    
    Returns:
-        {"url": str, "title": str, "text": str}
+        {"url": str, "text/links/structured": ...}
    """
    url = arguments["url"]
+    extract_type = arguments.get("extract_type", "text")
    
-    return _fetch_service.fetch(url)
+    return _fetch_service.fetch(url, extract_type)


@tool(name="batch_fetch", description="Batch fetch multiple webpages.", parameters={
    "type": "object",
    "properties": {
-        "urls": {"type": "array", "items": {"type": "string"}, "description": "URLs to fetch"}
+        "urls": {"type": "array", "items": {"type": "string"}, "description": "URLs to fetch"},
+        "extract_type": {"type": "string", "description": "Extraction type: text, links, or structured", "enum": ["text", "links", "structured"], "default": "text"}
    },
    "required": ["urls"]
 }, required_params=["urls"], category="crawler")
@@ -66,8 +72,12 @@ def batch_fetch(arguments: dict):
        {"count": int, "results": list}
    """
    urls = arguments.get("urls", [])
+    extract_type = arguments.get("extract_type", "text")
    
-    results = _fetch_service.fetch_batch(urls)
+    if len(urls) > 10:
+        return {"error": "Maximum 10 pages can be fetched at once"}
+    
+    results = _fetch_service.fetch_batch(urls, extract_type)
    
    return {
        "count": len(results),
--- a/luxx/tools/services.py
+++ b/luxx/tools/services.py
@@ -9,62 +9,205 @@ from ddgs import DDGS


 class SearchService:
-    """DuckDuckGo search using ddgs library"""
+    """Search service using DuckDuckGo"""

-    def search(self, query: str, max_results: int = 5) -> List[dict]:
-        results = []
-        with DDGS() as client:
-            for result in client.text(query, max_results=max_results):
-                results.append({
-                    "title": result.get("title", ""),
-                    "url": result.get("href", ""),
-                    "snippet": result.get("body", "")
-                })
-        return results
+    def __init__(self, engine: str = "duckduckgo"):
+        self.engine = engine
+
+    def search(
+        self,
+        query: str,
+        max_results: int = 5,
+        region: str = "cn-zh"
+    ) -> List[dict]:
+        """
+        Execute search
+
+        Args:
+            query: Search keywords
+            max_results: Max result count
+            region: Region setting
+
+        Returns:
+            Search result list
+        """
+        if self.engine == "duckduckgo":
+            return self._search_duckduckgo(query, max_results, region)
+        else:
+            raise ValueError(f"Unsupported search engine: {self.engine}")
+
+    def _search_duckduckgo(
+        self,
+        query: str,
+        max_results: int,
+        region: str
+    ) -> List[dict]:
+        """DuckDuckGo search with region support"""
+
+        with DDGS() as ddgs:
+            results = list(ddgs.text(
+                query,
+                max_results=max_results,
+                region=region
+            ))
+
+        return [
+            {
+                "title": r.get("title", ""),
+                "url": r.get("href", ""),
+                "snippet": r.get("body", "")
+            }
+            for r in results
+        ]


 class FetchService:
-    """Page fetch using httpx with concurrent support"""
+    """Page fetch service with content extraction support"""

-    def __init__(self, timeout: float = 15.0):
+    def __init__(self, timeout: float = 30.0, user_agent: str = None):
        self.timeout = timeout
+        self.user_agent = user_agent or (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/120.0.0.0 Safari/537.36"
+        )

-    def fetch(self, url: str, extract_type: str = "text") -> dict:
+    def fetch(
+        self,
+        url: str,
+        extract_type: str = "text"
+    ) -> dict:
+        """
+        Fetch a single page
+
+        Args:
+            url: Page URL
+            extract_type: Extract type (text, links, structured)
+
+        Returns:
+            Fetch result
+        """
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-        }
-        with httpx.Client(timeout=self.timeout, follow_redirects=True) as client:
-            response = client.get(url, headers=headers)
-            response.raise_for_status()
-            
-            soup = BeautifulSoup(response.text, "html.parser")
-            
-            # Remove script and style elements
-            for script in soup(["script", "style"]):
-                script.decompose()
-            
-            title = soup.title.string if soup.title else ""
-            text = soup.get_text(separator="\n", strip=True)
-            
+        try:
+            resp = httpx.get(
+                url,
+                timeout=self.timeout,
+                follow_redirects=True,
+                headers={"User-Agent": self.user_agent}
+            )
+            resp.raise_for_status()
+        except Exception as e:
+            return {"error": str(e), "url": url}
+
+        html = resp.text
+        extractor = ContentExtractor(html)
+
+        if extract_type == "text":
            return {
                "url": url,
-                "title": title[:500] if title else "",
-                "text": text[:15000]
+                "text": extractor.extract_text()
            }
+        elif extract_type == "links":
+            return {
+                "url": url,
+                "links": extractor.extract_links()
+            }
+        else:
+            return extractor.extract_structured(url)

-    def fetch_batch(self, urls: List[str], extract_type: str = "text", max_concurrent: int = 5) -> List[dict]:
+    def fetch_batch(
+        self,
+        urls: List[str],
+        extract_type: str = "text",
+        max_concurrent: int = 5
+    ) -> List[dict]:
+        """
+        Batch fetch pages concurrently.
+
+        Args:
+            urls: URL list
+            extract_type: Extract type
+            max_concurrent: Max concurrent requests (1-5, default 5)
+
+        Returns:
+            Result list (same order as input URLs)
+        """
        if len(urls) <= 1:
            return [self.fetch(url, extract_type) for url in urls]

-        results = [None] * len(urls)
        max_concurrent = min(max(max_concurrent, 1), 5)
+        results = [None] * len(urls)

        with ThreadPoolExecutor(max_workers=max_concurrent) as pool:
-            futures = {pool.submit(self.fetch, url, extract_type): i for i, url in enumerate(urls)}
+            futures = {
+                pool.submit(self.fetch, url, extract_type): i
+                for i, url in enumerate(urls)
+            }
            for future in as_completed(futures):
-                results[futures[future]] = future.result()
+                idx = futures[future]
+                try:
+                    results[idx] = future.result()
+                except Exception as e:
+                    results[idx] = {"error": str(e)}

        return results
+
+
+class ContentExtractor:
+    """Content extractor for HTML pages"""
+
+    def __init__(self, html: str):
+        self.html = html
+        self._soup = None
+
+    @property
+    def soup(self):
+        if self._soup is None:
+            self._soup = BeautifulSoup(self.html, "html.parser")
+        return self._soup
+
+    def extract_text(self) -> str:
+        """Extract plain text"""
+        # Remove script, style, nav, footer, header
+        for tag in self.soup(["script", "style", "nav", "footer", "header"]):
+            tag.decompose()
+
+        text = self.soup.get_text(separator="\n", strip=True)
+        # Clean extra whitespace
+        text = re.sub(r"\n{3,}", "\n\n", text)
+        return text
+
+    def extract_links(self) -> List[dict]:
+        """Extract links"""
+        links = []
+        for a in self.soup.find_all("a", href=True):
+            text = a.get_text(strip=True)
+            href = a["href"]
+            if text and href and not href.startswith(("#", "javascript:")):
+                links.append({"text": text, "href": href})
+        return links[:50]  # Limit count
+
+    def extract_structured(self, url: str = "") -> dict:
+        """Extract structured content"""
+        soup = self.soup
+
+        # Extract title
+        title = ""
+        if soup.title:
+            title = soup.title.string or ""
+
+        # Extract meta description
+        description = ""
+        meta_desc = soup.find("meta", attrs={"name": "description"})
+        if meta_desc:
+            description = meta_desc.get("content", "")
+
+        return {
+            "url": url,
+            "title": title.strip() if title else "",
+            "description": description.strip(),
+            "text": self.extract_text()[:5000],
+            "links": self.extract_links()[:20]
+        }