feat: 优化爬虫选择
This commit is contained in:
parent
bad1a06ef1
commit
96f7518f43
|
|
@ -286,6 +286,7 @@ class ChatService:
|
||||||
"completion_tokens": 0,
|
"completion_tokens": 0,
|
||||||
"total_tokens": 0
|
"total_tokens": 0
|
||||||
}
|
}
|
||||||
|
actual_token_count = 0
|
||||||
|
|
||||||
# Streaming context for state management
|
# Streaming context for state management
|
||||||
ctx = StreamContext()
|
ctx = StreamContext()
|
||||||
|
|
@ -482,6 +483,7 @@ class ChatService:
|
||||||
yield _sse_event("error", {"content": "Exceeded maximum tool call iterations"})
|
yield _sse_event("error", {"content": "Exceeded maximum tool call iterations"})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
logger.error(f"Stream error: {e}")
|
||||||
yield _sse_event("error", {"content": str(e)})
|
yield _sse_event("error", {"content": str(e)})
|
||||||
|
|
||||||
def _save_message(
|
def _save_message(
|
||||||
|
|
|
||||||
|
|
@ -6,19 +6,18 @@ from luxx.tools.services import SearchService, FetchService
|
||||||
_fetch_service = FetchService()
|
_fetch_service = FetchService()
|
||||||
_search_service = SearchService()
|
_search_service = SearchService()
|
||||||
|
|
||||||
|
|
||||||
@tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
|
@tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"query": {"type": "string", "description": "Search keywords"},
|
"query": {"type": "string", "description": "Search keywords"},
|
||||||
"max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5},
|
"max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5},
|
||||||
"region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"}
|
"region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"},
|
||||||
},
|
},
|
||||||
"required": ["query"]
|
"required": ["query"]
|
||||||
}, required_params=["query"], category="crawler")
|
}, required_params=["query"], category="crawler")
|
||||||
def web_search(arguments: dict):
|
def web_search(arguments: dict):
|
||||||
"""
|
"""
|
||||||
Search the web using DuckDuckGo
|
Search the web using DuckDuckGo or Bing
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
{"query": str, "count": int, "results": list}
|
{"query": str, "count": int, "results": list}
|
||||||
|
|
@ -26,6 +25,8 @@ def web_search(arguments: dict):
|
||||||
query = arguments["query"]
|
query = arguments["query"]
|
||||||
max_results = arguments.get("max_results", 5)
|
max_results = arguments.get("max_results", 5)
|
||||||
region = arguments.get("region", "cn-zh")
|
region = arguments.get("region", "cn-zh")
|
||||||
|
|
||||||
|
|
||||||
results = _search_service.search(query, max_results, region)
|
results = _search_service.search(query, max_results, region)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,19 @@
|
||||||
"""Tool helper services"""
|
"""Tool helper services"""
|
||||||
import re
|
import re
|
||||||
import httpx
|
import httpx
|
||||||
from urllib.parse import parse_qs, urlparse, quote
|
from urllib.parse import quote
|
||||||
from typing import List
|
from typing import List
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from ddgs import DDGS
|
from ddgs import DDGS
|
||||||
|
|
||||||
|
from curl_cffi import requests as curl_requests
|
||||||
|
|
||||||
|
|
||||||
class SearchService:
|
class SearchService:
|
||||||
"""Search service using DuckDuckGo"""
|
"""Search service supporting multiple engines"""
|
||||||
|
|
||||||
def __init__(self, engine: str = "duckduckgo"):
|
def __init__(self, engine: str = "bing"):
|
||||||
self.engine = engine
|
self.engine = engine
|
||||||
|
|
||||||
def search(
|
def search(
|
||||||
|
|
@ -33,6 +35,8 @@ class SearchService:
|
||||||
"""
|
"""
|
||||||
if self.engine == "duckduckgo":
|
if self.engine == "duckduckgo":
|
||||||
return self._search_duckduckgo(query, max_results, region)
|
return self._search_duckduckgo(query, max_results, region)
|
||||||
|
elif self.engine == "bing":
|
||||||
|
return self._search_bing(query, max_results, region)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported search engine: {self.engine}")
|
raise ValueError(f"Unsupported search engine: {self.engine}")
|
||||||
|
|
||||||
|
|
@ -60,6 +64,85 @@ class SearchService:
|
||||||
for r in results
|
for r in results
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def _search_bing(
|
||||||
|
self,
|
||||||
|
query: str,
|
||||||
|
max_results: int,
|
||||||
|
region: str
|
||||||
|
) -> List[dict]:
|
||||||
|
"""Bing search using curl-cffi to simulate browser"""
|
||||||
|
# Map region to Bing market code
|
||||||
|
market_map = {
|
||||||
|
"cn-zh": "zh-CN",
|
||||||
|
"us-en": "en-US",
|
||||||
|
}
|
||||||
|
market = market_map.get(region, "en-US")
|
||||||
|
|
||||||
|
results = []
|
||||||
|
offset = 0
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Language": f"{market},en;q=0.5",
|
||||||
|
"Accept-Encoding": "gzip, deflate, br",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
}
|
||||||
|
|
||||||
|
while len(results) < max_results:
|
||||||
|
url = f"https://www.bing.com/search?q={quote(query)}&first={offset}&mkt={market}"
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = curl_requests.get(
|
||||||
|
url,
|
||||||
|
headers=headers,
|
||||||
|
impersonate="chrome",
|
||||||
|
timeout=15
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
break
|
||||||
|
|
||||||
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
|
||||||
|
# Find search result items
|
||||||
|
for item in soup.select("li.b_algo"):
|
||||||
|
title_elem = item.select_one("h2 a")
|
||||||
|
snippet_elem = item.select_one("div.b_paractl")
|
||||||
|
cite_elem = item.select_one("cite")
|
||||||
|
|
||||||
|
if title_elem:
|
||||||
|
title = title_elem.get_text(strip=True)
|
||||||
|
url = title_elem.get("href", "")
|
||||||
|
|
||||||
|
# Get snippet
|
||||||
|
snippet = ""
|
||||||
|
if snippet_elem:
|
||||||
|
snippet = snippet_elem.get_text(strip=True)
|
||||||
|
elif cite_elem:
|
||||||
|
snippet = cite_elem.get_text(strip=True)
|
||||||
|
|
||||||
|
results.append({
|
||||||
|
"title": title,
|
||||||
|
"url": url,
|
||||||
|
"snippet": snippet
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(results) >= max_results:
|
||||||
|
break
|
||||||
|
|
||||||
|
# Check if there are more results
|
||||||
|
next_page = soup.select_one("a.sb_pagN")
|
||||||
|
if not next_page or len(results) >= max_results:
|
||||||
|
break
|
||||||
|
|
||||||
|
offset += 10
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
break
|
||||||
|
|
||||||
|
return results[:max_results]
|
||||||
|
|
||||||
|
|
||||||
class FetchService:
|
class FetchService:
|
||||||
"""Page fetch service with content extraction support"""
|
"""Page fetch service with content extraction support"""
|
||||||
|
|
@ -91,11 +174,17 @@ class FetchService:
|
||||||
url = "https://" + url
|
url = "https://" + url
|
||||||
|
|
||||||
try:
|
try:
|
||||||
resp = httpx.get(
|
resp = curl_requests.get(
|
||||||
url,
|
url,
|
||||||
timeout=self.timeout,
|
timeout=self.timeout,
|
||||||
follow_redirects=True,
|
impersonate="chrome",
|
||||||
headers={"User-Agent": self.user_agent}
|
headers={
|
||||||
|
"User-Agent": self.user_agent,
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Language": "en-US,en;q=0.5",
|
||||||
|
"Accept-Encoding": "gzip, deflate, br",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
}
|
||||||
)
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ dependencies = [
|
||||||
"shortuuid>=1.0.11",
|
"shortuuid>=1.0.11",
|
||||||
"sse-starlette>=2.0.0",
|
"sse-starlette>=2.0.0",
|
||||||
"ddgs>=5.0.0",
|
"ddgs>=5.0.0",
|
||||||
|
"curl-cffi>=0.6.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue