# Luxx/luxx/tools/builtin/crawler.py
"""Web crawler tools"""
import requests
from typing import Dict, Any, List, Optional
from bs4 import BeautifulSoup
from luxx.tools.factory import tool
@tool(
    name="web_search",
    description="Search the internet for information using web search",
    parameters={
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search keywords"
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return",
                "default": 5
            }
        },
        "required": ["query"]
    },
    category="crawler"
)
def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Run a (simulated) web search and return fabricated results.

    Placeholder implementation: real usage requires wiring in an actual
    search backend such as Google Custom Search, DuckDuckGo, or SerpAPI.

    Args:
        arguments: Tool-call payload with "query" (required) and an
            optional "max_results" (defaults to 5).

    Returns:
        {"success": True, "data": {...}} with synthetic results, or
        {"success": False, "error": ...} when "query" is missing.
    """
    query = arguments.get("query", "")
    if not query:
        return {"success": False, "error": "Query is required"}

    max_results = arguments.get("max_results", 5)
    # Fabricated entries stand in for a real search API; capped at 5
    # regardless of the requested count, matching the stub's behavior.
    results = []
    for idx in range(min(max_results, 5)):
        results.append({
            "title": f"Result for '{query}' - Example {idx+1}",
            "url": f"https://example.com/result_{idx+1}",
            "snippet": f"This is a sample search result for the query '{query}'. " * 3
        })
    return {
        "success": True,
        "data": {
            "query": query,
            "results": results
        }
    }
@tool(
    name="web_fetch",
    description="Fetch and parse content from a web page",
    parameters={
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL of the web page to fetch"
            },
            "extract_text": {
                "type": "boolean",
                "description": "Whether to extract text content only",
                "default": True
            }
        },
        "required": ["url"]
    },
    category="crawler"
)
def web_fetch(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Fetch a web page and return its parsed text (or raw HTML).

    Args:
        arguments: Tool-call payload with "url" (required) and an optional
            "extract_text" flag (defaults to True).

    Returns:
        On success, {"success": True, "data": {...}} where data contains
        "url" plus either "title"/"content" (extract_text=True, content
        capped at 10000 chars) or "html" (capped at 50000 chars). On
        failure, {"success": False, "error": ...}.
    """
    url = arguments.get("url", "")
    extract_text = arguments.get("extract_text", True)
    if not url:
        return {"success": False, "error": "URL is required"}
    # Default to HTTPS when the caller omitted the scheme.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        if not extract_text:
            return {
                "success": True,
                "data": {
                    "url": url,
                    "html": response.text[:50000]  # Limit HTML length
                }
            }

        soup = BeautifulSoup(response.text, "html.parser")
        # Drop non-content tags before extracting visible text.
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator="\n", strip=True)
        # Collapse runs of blank lines left behind by block elements.
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        text = "\n".join(lines)
        # BUG FIX: the original used `soup.title.string`, which is None
        # whenever <title> contains nested markup, leaking None into a
        # field documented as a string. get_text() always returns str.
        title = soup.title.get_text(strip=True) if soup.title else ""
        return {
            "success": True,
            "data": {
                "url": url,
                "title": title,
                "content": text[:10000]  # Limit content length
            }
        }
    except requests.RequestException as e:
        return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}
@tool(
    name="extract_links",
    description="Extract all links from a web page",
    parameters={
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL of the web page"
            },
            "max_links": {
                "type": "integer",
                "description": "Maximum number of links to extract",
                "default": 20
            }
        },
        "required": ["url"]
    },
    category="crawler"
)
def extract_links(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Extract anchor links from a web page.

    Args:
        arguments: Tool-call payload with "url" (required) and an optional
            "max_links" cap (defaults to 20).

    Returns:
        On success, {"success": True, "data": {"url": ..., "links": [...]}}
        where each link is {"text": ..., "url": ...} with the URL resolved
        to absolute form. On failure, {"success": False, "error": ...}.
    """
    from urllib.parse import urljoin  # stdlib; hoisted out of the loop below

    url = arguments.get("url", "")
    max_links = arguments.get("max_links", 20)
    if not url:
        return {"success": False, "error": "URL is required"}
    # Default to HTTPS when the caller omitted the scheme.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        links = []
        for a_tag in soup.find_all("a", href=True)[:max_links]:
            # BUG FIX: the original only resolved hrefs beginning with "/",
            # leaving forms like "page.html", "../x", and "?q=1" unusable.
            # urljoin resolves every relative form against the page URL and
            # returns absolute URLs unchanged.
            href = urljoin(url, a_tag["href"])
            links.append({
                "text": a_tag.get_text(strip=True) or href,
                "url": href
            })
        return {
            "success": True,
            "data": {
                "url": url,
                "links": links
            }
        }
    except requests.RequestException as e:
        return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}