feat: 优化爬虫设置
This commit is contained in:
parent
dc08267c15
commit
bad1a06ef1
|
|
@ -2,15 +2,17 @@
|
||||||
from luxx.tools.factory import tool
|
from luxx.tools.factory import tool
|
||||||
from luxx.tools.services import SearchService, FetchService
|
from luxx.tools.services import SearchService, FetchService
|
||||||
|
|
||||||
# 服务实例(SearchService.search() 是静态方法风格,不需要实例化)
|
|
||||||
_fetch_service = FetchService()
|
_fetch_service = FetchService()
|
||||||
|
_search_service = SearchService()
|
||||||
|
|
||||||
|
|
||||||
@tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
|
@tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"query": {"type": "string", "description": "Search keywords"},
|
"query": {"type": "string", "description": "Search keywords"},
|
||||||
"max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5}
|
"max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5},
|
||||||
|
"region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"}
|
||||||
},
|
},
|
||||||
"required": ["query"]
|
"required": ["query"]
|
||||||
}, required_params=["query"], category="crawler")
|
}, required_params=["query"], category="crawler")
|
||||||
|
|
@ -23,8 +25,9 @@ def web_search(arguments: dict):
|
||||||
"""
|
"""
|
||||||
query = arguments["query"]
|
query = arguments["query"]
|
||||||
max_results = arguments.get("max_results", 5)
|
max_results = arguments.get("max_results", 5)
|
||||||
|
region = arguments.get("region", "cn-zh")
|
||||||
|
results = _search_service.search(query, max_results, region)
|
||||||
|
|
||||||
results = SearchService().search(query, max_results)
|
|
||||||
return {
|
return {
|
||||||
"query": query,
|
"query": query,
|
||||||
"count": len(results),
|
"count": len(results),
|
||||||
|
|
@ -35,7 +38,8 @@ def web_search(arguments: dict):
|
||||||
@tool(name="web_fetch", description="Fetch content from a webpage.", parameters={
|
@tool(name="web_fetch", description="Fetch content from a webpage.", parameters={
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"url": {"type": "string", "description": "URL to fetch"}
|
"url": {"type": "string", "description": "URL to fetch"},
|
||||||
|
"extract_type": {"type": "string", "description": "Extraction type: text, links, or structured", "enum": ["text", "links", "structured"], "default": "text"}
|
||||||
},
|
},
|
||||||
"required": ["url"]
|
"required": ["url"]
|
||||||
}, required_params=["url"], category="crawler")
|
}, required_params=["url"], category="crawler")
|
||||||
|
|
@ -44,17 +48,19 @@ def web_fetch(arguments: dict):
|
||||||
Fetch webpage content
|
Fetch webpage content
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
{"url": str, "title": str, "text": str}
|
{"url": str, "text/links/structured": ...}
|
||||||
"""
|
"""
|
||||||
url = arguments["url"]
|
url = arguments["url"]
|
||||||
|
extract_type = arguments.get("extract_type", "text")
|
||||||
|
|
||||||
return _fetch_service.fetch(url)
|
return _fetch_service.fetch(url, extract_type)
|
||||||
|
|
||||||
|
|
||||||
@tool(name="batch_fetch", description="Batch fetch multiple webpages.", parameters={
|
@tool(name="batch_fetch", description="Batch fetch multiple webpages.", parameters={
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"urls": {"type": "array", "items": {"type": "string"}, "description": "URLs to fetch"}
|
"urls": {"type": "array", "items": {"type": "string"}, "description": "URLs to fetch"},
|
||||||
|
"extract_type": {"type": "string", "description": "Extraction type: text, links, or structured", "enum": ["text", "links", "structured"], "default": "text"}
|
||||||
},
|
},
|
||||||
"required": ["urls"]
|
"required": ["urls"]
|
||||||
}, required_params=["urls"], category="crawler")
|
}, required_params=["urls"], category="crawler")
|
||||||
|
|
@ -66,8 +72,12 @@ def batch_fetch(arguments: dict):
|
||||||
{"count": int, "results": list}
|
{"count": int, "results": list}
|
||||||
"""
|
"""
|
||||||
urls = arguments.get("urls", [])
|
urls = arguments.get("urls", [])
|
||||||
|
extract_type = arguments.get("extract_type", "text")
|
||||||
|
|
||||||
results = _fetch_service.fetch_batch(urls)
|
if len(urls) > 10:
|
||||||
|
return {"error": "Maximum 10 pages can be fetched at once"}
|
||||||
|
|
||||||
|
results = _fetch_service.fetch_batch(urls, extract_type)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"count": len(results),
|
"count": len(results),
|
||||||
|
|
|
||||||
|
|
@ -9,62 +9,205 @@ from ddgs import DDGS
|
||||||
|
|
||||||
|
|
||||||
class SearchService:
    """Search service using DuckDuckGo.

    ``engine`` names the backend; ``"duckduckgo"`` is the only value that
    ``search()`` accepts. The engine name is checked on every call to
    ``search()``, not at construction time.
    """

    def __init__(self, engine: str = "duckduckgo"):
        # Backend name; validated lazily inside search().
        self.engine = engine

    def search(
        self,
        query: str,
        max_results: int = 5,
        region: str = "cn-zh"
    ) -> List[dict]:
        """
        Execute a search.

        Args:
            query: Search keywords.
            max_results: Max result count, forwarded to the backend.
            region: Region setting, forwarded to the backend.

        Returns:
            List of ``{"title", "url", "snippet"}`` dictionaries. An empty
            list is returned for an empty or whitespace-only query.

        Raises:
            ValueError: If ``self.engine`` is not a supported backend.
        """
        # Engine check comes first so a misconfigured service still fails
        # loudly, regardless of the query.
        if self.engine != "duckduckgo":
            raise ValueError(f"Unsupported search engine: {self.engine}")

        # Nothing to search for: skip the network round trip entirely.
        if not query or not query.strip():
            return []

        return self._search_duckduckgo(query, max_results, region)

    def _search_duckduckgo(
        self,
        query: str,
        max_results: int,
        region: str
    ) -> List[dict]:
        """Run a DuckDuckGo text search and normalise each hit's keys."""
        with DDGS() as ddgs:
            hits = ddgs.text(
                query,
                max_results=max_results,
                region=region
            )
            # Map the backend's "href"/"body" keys onto "url"/"snippet";
            # missing keys become empty strings.
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("href", ""),
                    "snippet": hit.get("body", "")
                }
                for hit in hits
            ]
|
||||||
|
|
||||||
|
|
||||||
class FetchService:
    """Page fetch service with content extraction support.

    Pages are downloaded with ``httpx.get`` and handed to
    ``ContentExtractor``. Download failures are reported as
    ``{"error": ..., "url": ...}`` dictionaries instead of being raised.
    """

    def __init__(self, timeout: float = 30.0, user_agent: str = None):
        # Timeout value passed to httpx.get() for every request.
        self.timeout = timeout
        # Fall back to a desktop Chrome User-Agent when none is supplied.
        self.user_agent = user_agent or (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )

    def fetch(
        self,
        url: str,
        extract_type: str = "text"
    ) -> dict:
        """
        Fetch a single page.

        Args:
            url: Page URL. ``https://`` is prepended when no http(s) scheme
                is present; surrounding whitespace is ignored.
            extract_type: ``"text"`` or ``"links"``; any other value yields
                the structured extraction.

        Returns:
            ``{"url", "text"}``, ``{"url", "links"}``, the structured
            dictionary produced by ``ContentExtractor.extract_structured``,
            or ``{"error", "url"}`` when the URL is blank or the request
            fails.
        """
        # A blank URL would otherwise turn into a request for "https://".
        if not url or not url.strip():
            return {"error": "Empty URL", "url": url}

        url = url.strip()
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        try:
            resp = httpx.get(
                url,
                timeout=self.timeout,
                follow_redirects=True,
                headers={"User-Agent": self.user_agent}
            )
            resp.raise_for_status()
        except Exception as e:
            # Deliberately broad: this is the tool boundary, and every
            # request failure is reported to the caller as data.
            return {"error": str(e), "url": url}

        extractor = ContentExtractor(resp.text)

        if extract_type == "text":
            return {
                "url": url,
                "text": extractor.extract_text()
            }
        if extract_type == "links":
            return {
                "url": url,
                "links": extractor.extract_links()
            }
        return extractor.extract_structured(url)

    def _fetch_safely(self, url: str, extract_type: str) -> dict:
        """Call ``fetch`` and convert any raised exception into an error record."""
        try:
            return self.fetch(url, extract_type)
        except Exception as e:
            # Same shape as fetch()'s own failure result, so callers can
            # tell which URL failed.
            return {"error": str(e), "url": url}

    def fetch_batch(
        self,
        urls: List[str],
        extract_type: str = "text",
        max_concurrent: int = 5
    ) -> List[dict]:
        """
        Batch fetch pages concurrently.

        Args:
            urls: URL list.
            extract_type: Extract type, forwarded to ``fetch`` for each URL.
            max_concurrent: Max concurrent requests; clamped to 1-5
                (default 5).

        Returns:
            Result list in the same order as the input URLs. A URL whose
            fetch raised is represented by ``{"error", "url"}``.
        """
        # Zero or one URL: no thread pool needed.
        if len(urls) <= 1:
            return [self._fetch_safely(url, extract_type) for url in urls]

        max_concurrent = min(max(max_concurrent, 1), 5)

        with ThreadPoolExecutor(max_workers=max_concurrent) as pool:
            # Executor.map yields results in input order, so no index
            # bookkeeping is required.
            return list(
                pool.map(
                    lambda target: self._fetch_safely(target, extract_type),
                    urls
                )
            )
|
||||||
|
|
||||||
|
|
||||||
|
class ContentExtractor:
    """Content extractor for HTML pages.

    The HTML is parsed lazily on first access to ``soup`` and that tree is
    reused by ``extract_links`` and ``extract_structured``. ``extract_text``
    prunes a separate parse so that it never alters the shared tree.
    """

    def __init__(self, html: str):
        self.html = html
        # Parsed lazily by the ``soup`` property.
        self._soup = None

    @property
    def soup(self):
        """Return the cached parse tree, building it on first access."""
        if self._soup is None:
            self._soup = BeautifulSoup(self.html, "html.parser")
        return self._soup

    def extract_text(self) -> str:
        """Extract plain text, ignoring script, style, nav, footer and header.

        Returns:
            The remaining text with fragments separated by newlines and
            runs of three or more newlines collapsed to two.
        """
        # Imported locally so this method does not depend on a module-level
        # ``import re`` that is outside the visible part of the file.
        import re

        # decompose() destroys nodes in place. Prune a private tree so the
        # shared ``soup`` keeps its nav/header/footer links for
        # extract_links(), whatever order the methods are called in.
        tree = BeautifulSoup(self.html, "html.parser")
        for tag in tree(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        text = tree.get_text(separator="\n", strip=True)
        # Clean extra whitespace
        return re.sub(r"\n{3,}", "\n\n", text)

    def extract_links(self) -> List[dict]:
        """Extract up to 50 links as ``{"text", "href"}`` dictionaries.

        Anchors without visible text, and hrefs starting with ``#`` or
        ``javascript:``, are skipped.
        """
        links = []
        for a in self.soup.find_all("a", href=True):
            text = a.get_text(strip=True)
            href = a["href"]
            if text and href and not href.startswith(("#", "javascript:")):
                links.append({"text": text, "href": href})
        return links[:50]  # Limit count

    def extract_structured(self, url: str = "") -> dict:
        """Extract structured content.

        Args:
            url: Echoed back in the result under ``"url"``.

        Returns:
            ``{"url", "title", "description", "text", "links"}`` where
            ``text`` is capped at 5000 characters and ``links`` at 20
            entries.
        """
        soup = self.soup

        # Extract title; ``.string`` may be None, hence the ``or ""``.
        title = ""
        if soup.title:
            title = soup.title.string or ""

        # Extract meta description
        description = ""
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            description = meta_desc.get("content", "")

        return {
            "url": url,
            "title": title.strip() if title else "",
            "description": description.strip(),
            "text": self.extract_text()[:5000],
            "links": self.extract_links()[:20]
        }
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue