Luxx/alcor/tools/builtin/crawler.py

190 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""网页爬虫工具"""
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from alcor.tools.factory import tool
@tool(
name="web_search",
description="Search the internet for information using web search",
parameters={
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Search keywords"
},
"max_results": {
"type": "integer",
"description": "Maximum number of results to return",
"default": 5
}
},
"required": ["query"]
},
category="crawler"
)
def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Search the web for a query string.

    NOTE: placeholder implementation — returns deterministic mock results.
    Wire up a real search API (Google Custom Search, DuckDuckGo, SerpAPI,
    etc.) before production use.

    Args:
        arguments: Tool arguments. "query" (str, required) is the search
            text; "max_results" (int, default 5) caps the result count
            (this mock never returns more than 5).

    Returns:
        On success: {"success": True, "data": {"query": ..., "results": [...]}}
        where each result has "title", "url" and "snippet" keys.
        On missing query: {"success": False, "error": "Query is required"}.
    """
    query = arguments.get("query", "")
    max_results = arguments.get("max_results", 5)
    if not query:
        return {"success": False, "error": "Query is required"}
    # Build deterministic mock results; a real backend would replace this loop.
    results = []
    for idx in range(min(max_results, 5)):
        results.append({
            "title": f"Result for '{query}' - Example {idx+1}",
            "url": f"https://example.com/result_{idx+1}",
            "snippet": f"This is a sample search result for the query '{query}'. " * 3,
        })
    return {
        "success": True,
        "data": {
            "query": query,
            "results": results,
        },
    }
@tool(
name="web_fetch",
description="Fetch and parse content from a web page",
parameters={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "URL of the web page to fetch"
},
"extract_text": {
"type": "boolean",
"description": "Whether to extract text content only",
"default": True
}
},
"required": ["url"]
},
category="crawler"
)
def web_fetch(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Fetch a web page and return its text content or raw HTML.

    Args:
        arguments: Tool arguments. "url" (str, required) is the page to
            fetch; a scheme-less URL is assumed to be HTTPS.
            "extract_text" (bool, default True) selects plain-text
            extraction; when False the raw HTML is returned instead.

    Returns:
        On success with extract_text=True:
            {"success": True, "data": {"url", "title", "content"}}
            (content truncated to 10000 chars).
        On success with extract_text=False:
            {"success": True, "data": {"url", "html"}} (truncated to 50000 chars).
        On failure: {"success": False, "error": ...}.
    """
    url = arguments.get("url", "")
    extract_text = arguments.get("extract_text", True)
    if not url:
        return {"success": False, "error": "URL is required"}
    # Minimal URL normalization: assume HTTPS when no scheme is given.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    # Keep the try body to just the lines that can raise RequestException.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}
    if not extract_text:
        return {
            "success": True,
            "data": {
                "url": url,
                "html": response.text[:50000]  # cap HTML size
            }
        }
    soup = BeautifulSoup(response.text, "html.parser")
    # Drop non-content tags before extracting text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    text = soup.get_text(separator="\n", strip=True)
    # Collapse blank lines left over after tag removal.
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    text = "\n".join(lines)
    # Fix: soup.title.string is None for an empty <title></title>, which
    # previously leaked None into "title"; always return a string.
    title = (soup.title.string or "") if soup.title else ""
    return {
        "success": True,
        "data": {
            "url": url,
            "title": title,
            "content": text[:10000]  # cap content size
        }
    }
@tool(
name="extract_links",
description="Extract all links from a web page",
parameters={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "URL of the web page"
},
"max_links": {
"type": "integer",
"description": "Maximum number of links to extract",
"default": 20
}
},
"required": ["url"]
},
category="crawler"
)
def extract_links(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Extract hyperlinks from a web page.

    Args:
        arguments: Tool arguments. "url" (str, required) is the page to
            scan; a scheme-less URL is assumed to be HTTPS.
            "max_links" (int, default 20) caps how many links are returned.

    Returns:
        On success: {"success": True, "data": {"url": ..., "links": [...]}}
        where each link has "text" (anchor text, falling back to the URL)
        and "url" (resolved absolute URL) keys.
        On failure: {"success": False, "error": ...}.
    """
    url = arguments.get("url", "")
    max_links = arguments.get("max_links", 20)
    if not url:
        return {"success": False, "error": "URL is required"}
    # Minimal URL normalization: assume HTTPS when no scheme is given.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    # Keep the try body to just the lines that can raise RequestException.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}
    soup = BeautifulSoup(response.text, "html.parser")
    links = []
    for a_tag in soup.find_all("a", href=True)[:max_links]:
        # Fix: the old code only resolved root-relative hrefs ("/x");
        # plain relative ("page.html", "../x") and protocol-relative
        # ("//host/x") links were returned unresolved. urljoin handles
        # all of these and leaves absolute URLs unchanged.
        href = urljoin(url, a_tag["href"])
        links.append({
            "text": a_tag.get_text(strip=True) or href,
            "url": href
        })
    return {
        "success": True,
        "data": {
            "url": url,
            "links": links
        }
    }