feat: 优化爬虫选择
This commit is contained in:
parent
bad1a06ef1
commit
96f7518f43
|
|
@ -286,6 +286,7 @@ class ChatService:
|
|||
"completion_tokens": 0,
|
||||
"total_tokens": 0
|
||||
}
|
||||
actual_token_count = 0
|
||||
|
||||
# Streaming context for state management
|
||||
ctx = StreamContext()
|
||||
|
|
@ -482,6 +483,7 @@ class ChatService:
|
|||
yield _sse_event("error", {"content": "Exceeded maximum tool call iterations"})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Stream error: {e}")
|
||||
yield _sse_event("error", {"content": str(e)})
|
||||
|
||||
def _save_message(
|
||||
|
|
|
|||
|
|
@ -6,19 +6,18 @@ from luxx.tools.services import SearchService, FetchService
|
|||
_fetch_service = FetchService()
|
||||
_search_service = SearchService()
|
||||
|
||||
|
||||
@tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {"type": "string", "description": "Search keywords"},
|
||||
"max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5},
|
||||
"region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"}
|
||||
"region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"},
|
||||
},
|
||||
"required": ["query"]
|
||||
}, required_params=["query"], category="crawler")
|
||||
def web_search(arguments: dict):
|
||||
"""
|
||||
Search the web using DuckDuckGo
|
||||
Search the web using DuckDuckGo or Bing
|
||||
|
||||
Returns:
|
||||
{"query": str, "count": int, "results": list}
|
||||
|
|
@ -26,6 +25,8 @@ def web_search(arguments: dict):
|
|||
query = arguments["query"]
|
||||
max_results = arguments.get("max_results", 5)
|
||||
region = arguments.get("region", "cn-zh")
|
||||
|
||||
|
||||
results = _search_service.search(query, max_results, region)
|
||||
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -1,17 +1,19 @@
|
|||
"""Tool helper services"""
|
||||
import re
|
||||
import httpx
|
||||
from urllib.parse import parse_qs, urlparse, quote
|
||||
from urllib.parse import quote
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from bs4 import BeautifulSoup
|
||||
from ddgs import DDGS
|
||||
|
||||
from curl_cffi import requests as curl_requests
|
||||
|
||||
|
||||
class SearchService:
|
||||
"""Search service using DuckDuckGo"""
|
||||
"""Search service supporting multiple engines"""
|
||||
|
||||
def __init__(self, engine: str = "duckduckgo"):
|
||||
def __init__(self, engine: str = "bing"):
|
||||
self.engine = engine
|
||||
|
||||
def search(
|
||||
|
|
@ -33,6 +35,8 @@ class SearchService:
|
|||
"""
|
||||
if self.engine == "duckduckgo":
|
||||
return self._search_duckduckgo(query, max_results, region)
|
||||
elif self.engine == "bing":
|
||||
return self._search_bing(query, max_results, region)
|
||||
else:
|
||||
raise ValueError(f"Unsupported search engine: {self.engine}")
|
||||
|
||||
|
|
@ -60,6 +64,85 @@ class SearchService:
|
|||
for r in results
|
||||
]
|
||||
|
||||
def _search_bing(
    self,
    query: str,
    max_results: int,
    region: str
) -> List[dict]:
    """Search Bing by scraping its HTML results with curl-cffi browser impersonation.

    Args:
        query: Search keywords.
        max_results: Maximum number of results to return.
        region: Region code (e.g. "cn-zh", "us-en"); mapped to a Bing
            market ("mkt") code, falling back to "en-US" for unknown regions.

    Returns:
        A list of dicts shaped {"title": str, "url": str, "snippet": str}.
        Best-effort: any network or parse failure ends pagination early and
        whatever was collected so far is returned.
    """
    # Map our region codes to Bing market codes.
    market_map = {
        "cn-zh": "zh-CN",
        "us-en": "en-US",
    }
    market = market_map.get(region, "en-US")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": f"{market},en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }

    results: List[dict] = []
    offset = 0

    while len(results) < max_results:
        # Named page_url (not url) so it is not shadowed by per-result URLs below.
        page_url = f"https://www.bing.com/search?q={quote(query)}&first={offset}&mkt={market}"

        try:
            response = curl_requests.get(
                page_url,
                headers=headers,
                impersonate="chrome",
                timeout=15,
            )
            if response.status_code != 200:
                break

            soup = BeautifulSoup(response.text, "html.parser")

            # Count items contributed by this page so we can stop if Bing
            # returns a page with no parseable results — otherwise a stale
            # "next page" link could keep the loop running indefinitely.
            added = 0
            # NOTE(review): selectors (li.b_algo / div.b_paractl / a.sb_pagN)
            # depend on Bing's current markup — verify if results come back empty.
            for item in soup.select("li.b_algo"):
                title_elem = item.select_one("h2 a")
                snippet_elem = item.select_one("div.b_paractl")
                cite_elem = item.select_one("cite")

                if not title_elem:
                    continue

                # Prefer the description paragraph; fall back to the cite line.
                snippet = ""
                if snippet_elem:
                    snippet = snippet_elem.get_text(strip=True)
                elif cite_elem:
                    snippet = cite_elem.get_text(strip=True)

                results.append({
                    "title": title_elem.get_text(strip=True),
                    "url": title_elem.get("href", ""),
                    "snippet": snippet,
                })
                added += 1

                if len(results) >= max_results:
                    break

            # Stop when there is no next page, this page yielded nothing
            # new, or we already collected enough results.
            next_page = soup.select_one("a.sb_pagN")
            if not next_page or added == 0 or len(results) >= max_results:
                break

            offset += 10

        except Exception:
            # Best-effort scraping: on any network/parse error, return what
            # we have instead of failing the whole search.
            break

    return results[:max_results]
|
||||
|
||||
|
||||
class FetchService:
|
||||
"""Page fetch service with content extraction support"""
|
||||
|
|
@ -91,11 +174,17 @@ class FetchService:
|
|||
url = "https://" + url
|
||||
|
||||
try:
|
||||
resp = httpx.get(
|
||||
resp = curl_requests.get(
|
||||
url,
|
||||
timeout=self.timeout,
|
||||
follow_redirects=True,
|
||||
headers={"User-Agent": self.user_agent}
|
||||
impersonate="chrome",
|
||||
headers={
|
||||
"User-Agent": self.user_agent,
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"Connection": "keep-alive",
|
||||
}
|
||||
)
|
||||
resp.raise_for_status()
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ dependencies = [
|
|||
"shortuuid>=1.0.11",
|
||||
"sse-starlette>=2.0.0",
|
||||
"ddgs>=5.0.0",
|
||||
"curl-cffi>=0.6.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
|
|
|||
Loading…
Reference in New Issue