"""网页爬虫工具"""
import requests
from typing import Dict, Any, List, Optional
from bs4 import BeautifulSoup

from alcor.tools.factory import tool

@tool(
    name="web_search",
    description="Search the internet for information using web search",
    parameters={
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search keywords"
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return",
                "default": 5
            }
        },
        "required": ["query"]
    },
    category="crawler"
)
def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Perform a web search for the given query.

    NOTE: this is a placeholder implementation. Wire it up to a real
    search API (Google Custom Search, DuckDuckGo, SerpAPI, ...) before
    production use.
    """
    query = arguments.get("query", "")
    max_results = arguments.get("max_results", 5)

    # A query is mandatory; fail fast with a structured error.
    if not query:
        return {"success": False, "error": "Query is required"}

    # Build mock results — a real implementation would call a search API
    # here. The mock never produces more than 5 entries.
    mock_results = []
    for idx in range(min(max_results, 5)):
        mock_results.append({
            "title": f"Result for '{query}' - Example {idx+1}",
            "url": f"https://example.com/result_{idx+1}",
            "snippet": f"This is a sample search result for the query '{query}'. " * 3
        })

    return {
        "success": True,
        "data": {
            "query": query,
            "results": mock_results
        }
    }


@tool(
    name="web_fetch",
    description="Fetch and parse content from a web page",
    parameters={
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL of the web page to fetch"
            },
            "extract_text": {
                "type": "boolean",
                "description": "Whether to extract text content only",
                "default": True
            }
        },
        "required": ["url"]
    },
    category="crawler"
)
def web_fetch(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Fetch a web page and return its parsed content.

    Arguments (in ``arguments``):
        url: Page URL; "https://" is prepended when no scheme is given.
        extract_text: When True (default), return the page title plus
            cleaned plain text (scripts/styles stripped, blank lines
            collapsed, capped at 10 000 chars). When False, return the
            raw HTML capped at 50 000 chars.

    Returns:
        ``{"success": True, "data": {...}}`` on success, or
        ``{"success": False, "error": msg}`` on a missing URL or any
        network/HTTP failure.
    """
    url = arguments.get("url", "")
    extract_text = arguments.get("extract_text", True)

    if not url:
        return {"success": False, "error": "URL is required"}

    # Default to HTTPS when the caller omitted the scheme.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        # Only the network calls can raise RequestException, so the try
        # block is kept to just these two statements.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}

    if not extract_text:
        return {
            "success": True,
            "data": {
                "url": url,
                "html": response.text[:50000]  # cap HTML size
            }
        }

    soup = BeautifulSoup(response.text, "html.parser")
    # Drop non-content tags before extracting text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    text = soup.get_text(separator="\n", strip=True)
    # Collapse runs of blank lines.
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    text = "\n".join(lines)

    # BUG FIX: soup.title.string is None when <title> is empty or contains
    # nested markup, which leaked None into the result; get_text() always
    # yields a string.
    title = soup.title.get_text(strip=True) if soup.title else ""

    return {
        "success": True,
        "data": {
            "url": url,
            "title": title,
            "content": text[:10000]  # cap content size
        }
    }


@tool(
    name="extract_links",
    description="Extract all links from a web page",
    parameters={
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL of the web page"
            },
            "max_links": {
                "type": "integer",
                "description": "Maximum number of links to extract",
                "default": 20
            }
        },
        "required": ["url"]
    },
    category="crawler"
)
def extract_links(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Extract up to ``max_links`` hyperlinks from a web page.

    Arguments (in ``arguments``):
        url: Page URL; "https://" is prepended when no scheme is given.
        max_links: Maximum number of <a href> links returned (default 20).

    Returns:
        ``{"success": True, "data": {"url": ..., "links": [{"text", "url"}]}}``
        on success, or ``{"success": False, "error": msg}`` on a missing URL
        or any network/HTTP failure. Relative hrefs are resolved against the
        page URL.
    """
    # Hoisted out of the loop body, where the original re-ran the import
    # statement for every link.
    from urllib.parse import urljoin

    url = arguments.get("url", "")
    max_links = arguments.get("max_links", 20)

    if not url:
        return {"success": False, "error": "URL is required"}

    # Default to HTTPS when the caller omitted the scheme.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}

    soup = BeautifulSoup(response.text, "html.parser")
    links: List[Dict[str, str]] = []

    for a_tag in soup.find_all("a", href=True)[:max_links]:
        # GENERALIZED: urljoin resolves every relative form ("/a", "a.html",
        # "../a", "//host/a") against the page URL and leaves absolute URLs
        # untouched — the original only handled hrefs starting with "/".
        href = urljoin(url, a_tag["href"])
        links.append({
            "text": a_tag.get_text(strip=True) or href,
            "url": href
        })

    return {
        "success": True,
        "data": {
            "url": url,
            "links": links
        }
    }