feat: 优化爬虫选择

2026-04-18 11:42:33 +08:00 · 2026-04-18 11:42:33 +08:00 · 96f7518f43
parent bad1a06ef1
commit 96f7518f43
4 changed files with 102 additions and 9 deletions
--- a/luxx/services/chat.py
+++ b/luxx/services/chat.py
@ -286,6 +286,7 @@ class ChatService:
                "completion_tokens": 0,
                "total_tokens": 0
            }
            actual_token_count = 0
            # Streaming context for state management
            ctx = StreamContext()
@ -482,6 +483,7 @@ class ChatService:
            yield _sse_event("error", {"content": "Exceeded maximum tool call iterations"})
        except Exception as e:
            logger.error(f"Stream error: {e}")
            yield _sse_event("error", {"content": str(e)})
    def _save_message(
--- a/luxx/tools/builtin/crawler.py
+++ b/luxx/tools/builtin/crawler.py
@ -6,19 +6,18 @@ from luxx.tools.services import SearchService, FetchService
 _fetch_service = FetchService()
 _search_service = SearchService()
@tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
    "type": "object",
    "properties": {
        "query": {"type": "string", "description": "Search keywords"},
        "max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5},
-        "region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"}
+        "region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"},
    },
    "required": ["query"]
 }, required_params=["query"], category="crawler")
 def web_search(arguments: dict):
    """
-    Search the web using DuckDuckGo
+    Search the web using DuckDuckGo or Bing
    Returns:
        {"query": str, "count": int, "results": list}
@ -26,6 +25,8 @@ def web_search(arguments: dict):
    query = arguments["query"]
    max_results = arguments.get("max_results", 5)
    region = arguments.get("region", "cn-zh")
    results = _search_service.search(query, max_results, region)
    return {
--- a/luxx/tools/services.py
+++ b/luxx/tools/services.py
@ -1,17 +1,19 @@
 """Tool helper services"""
 import re
 import httpx
-from urllib.parse import parse_qs, urlparse, quote
+from urllib.parse import  quote
 from typing import List
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
 from ddgs import DDGS
 from curl_cffi import requests as curl_requests
 class SearchService:
-    """Search service using DuckDuckGo"""
+    """Search service supporting multiple engines"""
-    def __init__(self, engine: str = "duckduckgo"):
+    def __init__(self, engine: str = "bing"):
        self.engine = engine
    def search(
@ -33,6 +35,8 @@ class SearchService:
        """
        if self.engine == "duckduckgo":
            return self._search_duckduckgo(query, max_results, region)
        elif self.engine == "bing":
            return self._search_bing(query, max_results, region)
        else:
            raise ValueError(f"Unsupported search engine: {self.engine}")
@ -60,6 +64,85 @@ class SearchService:
            for r in results
        ]
    def _search_bing(
        self,
        query: str,
        max_results: int,
        region: str
    ) -> List[dict]:
        """Bing search using curl-cffi to simulate browser"""
        # Map region to Bing market code
        market_map = {
            "cn-zh": "zh-CN",
            "us-en": "en-US",
        }
        market = market_map.get(region, "en-US")
        results = []
        offset = 0
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": f"{market},en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }
        while len(results) < max_results:
            url = f"https://www.bing.com/search?q={quote(query)}&first={offset}&mkt={market}"
            try:
                response = curl_requests.get(
                    url,
                    headers=headers,
                    impersonate="chrome",
                    timeout=15
                )
                if response.status_code != 200:
                    break
                soup = BeautifulSoup(response.text, "html.parser")
                # Find search result items
                for item in soup.select("li.b_algo"):
                    title_elem = item.select_one("h2 a")
                    snippet_elem = item.select_one("div.b_paractl")
                    cite_elem = item.select_one("cite")
                    if title_elem:
                        title = title_elem.get_text(strip=True)
                        url = title_elem.get("href", "")
                        # Get snippet
                        snippet = ""
                        if snippet_elem:
                            snippet = snippet_elem.get_text(strip=True)
                        elif cite_elem:
                            snippet = cite_elem.get_text(strip=True)
                        results.append({
                            "title": title,
                            "url": url,
                            "snippet": snippet
                        })
                        if len(results) >= max_results:
                            break
                # Check if there are more results
                next_page = soup.select_one("a.sb_pagN")
                if not next_page or len(results) >= max_results:
                    break
                offset += 10
            except Exception as e:
                break
        return results[:max_results]
 class FetchService:
    """Page fetch service with content extraction support"""
@ -91,11 +174,17 @@ class FetchService:
            url = "https://" + url
        try:
-            resp = httpx.get(
+            resp = curl_requests.get(
                url,
                timeout=self.timeout,
-                follow_redirects=True,
+                impersonate="chrome",
-                headers={"User-Agent": self.user_agent}
+                headers={
                    "User-Agent": self.user_agent,
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.5",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Connection": "keep-alive",
                }
            )
            resp.raise_for_status()
        except Exception as e:
--- a/pyproject.toml
+++ b/pyproject.toml
@ -23,6 +23,7 @@ dependencies = [
    "shortuuid>=1.0.11",
    "sse-starlette>=2.0.0",
    "ddgs>=5.0.0",
    "curl-cffi>=0.6.0",
 ]
 [project.optional-dependencies]