Luxx/alcor/tools/builtin/crawler.py

190 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""网页爬虫工具"""
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from alcor.tools.factory import tool
@tool(
name="web_search",
description="Search the internet for information using web search",
parameters={
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Search keywords"
},
"max_results": {
"type": "integer",
"description": "Maximum number of results to return",
"default": 5
}
},
"required": ["query"]
},
category="crawler"
)
def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Search the web for a query string.

    NOTE: placeholder implementation — returns deterministic mock results.
    Wire up a real search API (Google Custom Search, DuckDuckGo, SerpAPI,
    etc.) before production use.

    Args:
        arguments: Tool arguments. "query" (str, required) is the search
            text; "max_results" (int, default 5) caps the result count
            (this mock never returns more than 5).

    Returns:
        On success: {"success": True, "data": {"query": ..., "results": [...]}}
        where each result has "title", "url" and "snippet" keys.
        On missing query: {"success": False, "error": "Query is required"}.
    """
    query = arguments.get("query", "")
    max_results = arguments.get("max_results", 5)
    if not query:
        return {"success": False, "error": "Query is required"}
    # Build deterministic mock results; a real backend would replace this loop.
    results = []
    for idx in range(min(max_results, 5)):
        results.append({
            "title": f"Result for '{query}' - Example {idx+1}",
            "url": f"https://example.com/result_{idx+1}",
            "snippet": f"This is a sample search result for the query '{query}'. " * 3,
        })
    return {
        "success": True,
        "data": {
            "query": query,
            "results": results,
        },
    }
@tool(
name="web_fetch",
description="Fetch and parse content from a web page",
parameters={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "URL of the web page to fetch"
},
"extract_text": {
"type": "boolean",
"description": "Whether to extract text content only",
"default": True
}
},
"required": ["url"]
},
category="crawler"
)
def web_fetch(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Fetch a web page and return its text content or raw HTML.

    Args:
        arguments: Tool arguments. "url" (str, required) is the page to
            fetch; a scheme-less URL is assumed to be HTTPS.
            "extract_text" (bool, default True) selects plain-text
            extraction; when False the raw HTML is returned instead.

    Returns:
        On success with extract_text=True:
            {"success": True, "data": {"url", "title", "content"}}
            (content truncated to 10000 chars).
        On success with extract_text=False:
            {"success": True, "data": {"url", "html"}} (truncated to 50000 chars).
        On failure: {"success": False, "error": ...}.
    """
    url = arguments.get("url", "")
    extract_text = arguments.get("extract_text", True)
    if not url:
        return {"success": False, "error": "URL is required"}
    # Minimal URL normalization: assume HTTPS when no scheme is given.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    # Keep the try body to just the lines that can raise RequestException.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}
    if not extract_text:
        return {
            "success": True,
            "data": {
                "url": url,
                "html": response.text[:50000]  # cap HTML size
            }
        }
    soup = BeautifulSoup(response.text, "html.parser")
    # Drop non-content tags before extracting text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    text = soup.get_text(separator="\n", strip=True)
    # Collapse blank lines left over after tag removal.
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    text = "\n".join(lines)
    # Fix: soup.title.string is None for an empty <title></title>, which
    # previously leaked None into "title"; always return a string.
    title = (soup.title.string or "") if soup.title else ""
    return {
        "success": True,
        "data": {
            "url": url,
            "title": title,
            "content": text[:10000]  # cap content size
        }
    }
@tool(
name="extract_links",
description="Extract all links from a web page",
parameters={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "URL of the web page"
},
"max_links": {
"type": "integer",
"description": "Maximum number of links to extract",
"default": 20
}
},
"required": ["url"]
},
category="crawler"
)
def extract_links(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Extract hyperlinks from a web page.

    Args:
        arguments: Tool arguments. "url" (str, required) is the page to
            scan; a scheme-less URL is assumed to be HTTPS.
            "max_links" (int, default 20) caps how many links are returned.

    Returns:
        On success: {"success": True, "data": {"url": ..., "links": [...]}}
        where each link has "text" (anchor text, falling back to the URL)
        and "url" (resolved absolute URL) keys.
        On failure: {"success": False, "error": ...}.
    """
    url = arguments.get("url", "")
    max_links = arguments.get("max_links", 20)
    if not url:
        return {"success": False, "error": "URL is required"}
    # Minimal URL normalization: assume HTTPS when no scheme is given.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    # Keep the try body to just the lines that can raise RequestException.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}
    soup = BeautifulSoup(response.text, "html.parser")
    links = []
    for a_tag in soup.find_all("a", href=True)[:max_links]:
        # Fix: the old code only resolved root-relative hrefs ("/x");
        # plain relative ("page.html", "../x") and protocol-relative
        # ("//host/x") links were returned unresolved. urljoin handles
        # all of these and leaves absolute URLs unchanged.
        href = urljoin(url, a_tag["href"])
        links.append({
            "text": a_tag.get_text(strip=True) or href,
            "url": href
        })
    return {
        "success": True,
        "data": {
            "url": url,
            "links": links
        }
    }