feat: 优化爬虫选择

This commit is contained in:
ViperEkura 2026-04-18 11:42:33 +08:00
parent bad1a06ef1
commit 96f7518f43
4 changed files with 102 additions and 9 deletions

View File

@ -286,6 +286,7 @@ class ChatService:
"completion_tokens": 0, "completion_tokens": 0,
"total_tokens": 0 "total_tokens": 0
} }
actual_token_count = 0
# Streaming context for state management # Streaming context for state management
ctx = StreamContext() ctx = StreamContext()
@ -482,6 +483,7 @@ class ChatService:
yield _sse_event("error", {"content": "Exceeded maximum tool call iterations"}) yield _sse_event("error", {"content": "Exceeded maximum tool call iterations"})
except Exception as e: except Exception as e:
logger.error(f"Stream error: {e}")
yield _sse_event("error", {"content": str(e)}) yield _sse_event("error", {"content": str(e)})
def _save_message( def _save_message(

View File

@ -6,19 +6,18 @@ from luxx.tools.services import SearchService, FetchService
_fetch_service = FetchService() _fetch_service = FetchService()
_search_service = SearchService() _search_service = SearchService()
@tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={ @tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
"type": "object", "type": "object",
"properties": { "properties": {
"query": {"type": "string", "description": "Search keywords"}, "query": {"type": "string", "description": "Search keywords"},
"max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5}, "max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5},
"region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"} "region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"},
}, },
"required": ["query"] "required": ["query"]
}, required_params=["query"], category="crawler") }, required_params=["query"], category="crawler")
def web_search(arguments: dict): def web_search(arguments: dict):
""" """
Search the web using DuckDuckGo Search the web using DuckDuckGo or Bing
Returns: Returns:
{"query": str, "count": int, "results": list} {"query": str, "count": int, "results": list}
@ -26,6 +25,8 @@ def web_search(arguments: dict):
query = arguments["query"] query = arguments["query"]
max_results = arguments.get("max_results", 5) max_results = arguments.get("max_results", 5)
region = arguments.get("region", "cn-zh") region = arguments.get("region", "cn-zh")
results = _search_service.search(query, max_results, region) results = _search_service.search(query, max_results, region)
return { return {

View File

@ -1,17 +1,19 @@
"""Tool helper services""" """Tool helper services"""
import re import re
import httpx import httpx
from urllib.parse import parse_qs, urlparse, quote from urllib.parse import quote
from typing import List from typing import List
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ddgs import DDGS from ddgs import DDGS
from curl_cffi import requests as curl_requests
class SearchService: class SearchService:
"""Search service using DuckDuckGo""" """Search service supporting multiple engines"""
def __init__(self, engine: str = "duckduckgo"): def __init__(self, engine: str = "bing"):
self.engine = engine self.engine = engine
def search( def search(
@ -33,6 +35,8 @@ class SearchService:
""" """
if self.engine == "duckduckgo": if self.engine == "duckduckgo":
return self._search_duckduckgo(query, max_results, region) return self._search_duckduckgo(query, max_results, region)
elif self.engine == "bing":
return self._search_bing(query, max_results, region)
else: else:
raise ValueError(f"Unsupported search engine: {self.engine}") raise ValueError(f"Unsupported search engine: {self.engine}")
@ -60,6 +64,85 @@ class SearchService:
for r in results for r in results
] ]
def _search_bing(
    self,
    query: str,
    max_results: int,
    region: str
) -> List[dict]:
    """Search Bing, using curl-cffi to impersonate a Chrome browser.

    Args:
        query: Search keywords.
        max_results: Maximum number of results to return.
        region: Region code (e.g. "cn-zh", "us-en"); mapped to a Bing
            market code, falling back to "en-US" for unknown regions.

    Returns:
        A list of dicts with keys "title", "url" and "snippet".
        Best-effort: network or parsing failures end pagination early
        and whatever was collected so far is returned.
    """
    # Map our region codes to Bing market codes.
    market_map = {
        "cn-zh": "zh-CN",
        "us-en": "en-US",
    }
    market = market_map.get(region, "en-US")

    results = []
    offset = 0
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": f"{market},en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }

    while len(results) < max_results:
        # Distinct name from the per-result link below — the original code
        # shadowed this request URL inside the item loop.
        page_url = f"https://www.bing.com/search?q={quote(query)}&first={offset}&mkt={market}"
        try:
            response = curl_requests.get(
                page_url,
                headers=headers,
                impersonate="chrome",
                timeout=15
            )
            if response.status_code != 200:
                break

            soup = BeautifulSoup(response.text, "html.parser")

            # Parse organic result items. Track how many this page yields so
            # an empty page with a lingering next-page link cannot spin the
            # loop forever.
            found_on_page = 0
            for item in soup.select("li.b_algo"):
                title_elem = item.select_one("h2 a")
                # NOTE(review): "div.b_paractl" may be a typo for Bing's
                # usual snippet containers (e.g. "div.b_caption p") — the
                # cite fallback below masks it; confirm against live markup.
                snippet_elem = item.select_one("div.b_paractl")
                cite_elem = item.select_one("cite")

                if title_elem:
                    title = title_elem.get_text(strip=True)
                    result_url = title_elem.get("href", "")

                    # Prefer the descriptive snippet; fall back to the cite
                    # (displayed URL) so the field is rarely empty.
                    snippet = ""
                    if snippet_elem:
                        snippet = snippet_elem.get_text(strip=True)
                    elif cite_elem:
                        snippet = cite_elem.get_text(strip=True)

                    results.append({
                        "title": title,
                        "url": result_url,
                        "snippet": snippet
                    })
                    found_on_page += 1

                    if len(results) >= max_results:
                        break

            # Stop when this page produced nothing (infinite-loop guard),
            # there is no next-page link, or we already have enough.
            next_page = soup.select_one("a.sb_pagN")
            if not found_on_page or not next_page or len(results) >= max_results:
                break

            # Bing pages organic results 10 at a time via the `first` param.
            offset += 10
        except Exception:
            # Best-effort scrape: swallow network/parse errors and return
            # what we have rather than failing the whole tool call.
            break

    return results[:max_results]
class FetchService: class FetchService:
"""Page fetch service with content extraction support""" """Page fetch service with content extraction support"""
@ -91,11 +174,17 @@ class FetchService:
url = "https://" + url url = "https://" + url
try: try:
resp = httpx.get( resp = curl_requests.get(
url, url,
timeout=self.timeout, timeout=self.timeout,
follow_redirects=True, impersonate="chrome",
headers={"User-Agent": self.user_agent} headers={
"User-Agent": self.user_agent,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
}
) )
resp.raise_for_status() resp.raise_for_status()
except Exception as e: except Exception as e:

View File

@ -23,6 +23,7 @@ dependencies = [
"shortuuid>=1.0.11", "shortuuid>=1.0.11",
"sse-starlette>=2.0.0", "sse-starlette>=2.0.0",
"ddgs>=5.0.0", "ddgs>=5.0.0",
"curl-cffi>=0.6.0",
] ]
[project.optional-dependencies] [project.optional-dependencies]