"""Web crawler tools"""
|
|
import requests
|
|
from typing import Dict, Any, List, Optional
|
|
from bs4 import BeautifulSoup
|
|
|
|
from luxx.tools.factory import tool
|
|
|
|
|
|
@tool(
    name="web_search",
    description="Search the internet for information using web search",
    parameters={
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search keywords"
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return",
                "default": 5
            }
        },
        "required": ["query"]
    },
    category="crawler"
)
def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """
    Execute web search.

    Note: This is a placeholder implementation; real usage requires
    integrating with an actual search API such as Google Custom Search,
    DuckDuckGo, SerpAPI, etc.
    """
    search_query = arguments.get("query", "")
    result_cap = arguments.get("max_results", 5)

    # Guard clause: a query is mandatory.
    if not search_query:
        return {"success": False, "error": "Query is required"}

    # Build simulated results (capped at 5); a real implementation
    # would call out to an actual search API here.
    fake_results = []
    for index in range(min(result_cap, 5)):
        fake_results.append({
            "title": f"Result for '{search_query}' - Example {index+1}",
            "url": f"https://example.com/result_{index+1}",
            "snippet": f"This is a sample search result for the query '{search_query}'. " * 3
        })

    return {
        "success": True,
        "data": {
            "query": search_query,
            "results": fake_results
        }
    }


@tool(
    name="web_fetch",
    description="Fetch and parse content from a web page",
    parameters={
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL of the web page to fetch"
            },
            "extract_text": {
                "type": "boolean",
                "description": "Whether to extract text content only",
                "default": True
            }
        },
        "required": ["url"]
    },
    category="crawler"
)
def web_fetch(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """
    Fetch and parse web page content.

    Args:
        arguments: Tool arguments:
            - url (str, required): URL of the page to fetch; "https://" is
              prepended when no scheme is given.
            - extract_text (bool, default True): if True, return cleaned plain
              text and the page title; if False, return raw HTML.

    Returns:
        {"success": True, "data": {...}} where "data" holds "url" plus either
        "title"/"content" (text mode, content capped at 10000 chars) or
        "html" (raw mode, capped at 50000 chars).
        {"success": False, "error": ...} when the URL is missing or the
        request fails.
    """
    url = arguments.get("url", "")
    extract_text = arguments.get("extract_text", True)

    if not url:
        return {"success": False, "error": "URL is required"}

    # Simple URL validation: default to HTTPS when no scheme was supplied.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Raw-HTML mode: early return keeps the text-extraction path flat.
        if not extract_text:
            return {
                "success": True,
                "data": {
                    "url": url,
                    "html": response.text[:50000]  # Limit HTML length
                }
            }

        soup = BeautifulSoup(response.text, "html.parser")
        # Remove script and style tags so they don't pollute the text.
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator="\n", strip=True)
        # Clean up extra blank lines
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        text = "\n".join(lines)

        # Fix: soup.title.string is None for an empty <title> tag, which
        # previously leaked None into "title"; always return a str.
        title = soup.title.string if soup.title and soup.title.string else ""

        return {
            "success": True,
            "data": {
                "url": url,
                "title": title,
                "content": text[:10000]  # Limit content length
            }
        }
    except requests.RequestException as e:
        return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}


@tool(
    name="extract_links",
    description="Extract all links from a web page",
    parameters={
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL of the web page"
            },
            "max_links": {
                "type": "integer",
                "description": "Maximum number of links to extract",
                "default": 20
            }
        },
        "required": ["url"]
    },
    category="crawler"
)
def extract_links(arguments: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract all links from a web page.

    Args:
        arguments: Tool arguments:
            - url (str, required): URL of the page; "https://" is prepended
              when no scheme is given.
            - max_links (int, default 20): maximum number of links to return.

    Returns:
        {"success": True, "data": {"url": ..., "links": [{"text", "url"}, ...]}}
        with relative hrefs resolved against the page URL, or
        {"success": False, "error": ...} when the URL is missing or the
        request fails.
    """
    url = arguments.get("url", "")
    max_links = arguments.get("max_links", 20)

    if not url:
        return {"success": False, "error": "URL is required"}

    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        links = []

        for a_tag in soup.find_all("a", href=True)[:max_links]:
            # Fix: resolve ALL relative hrefs (e.g. "page.html", "../x"),
            # not only those starting with "/". urljoin leaves absolute
            # URLs (and schemes like mailto:) untouched, and the import is
            # hoisted to module level instead of running inside the loop.
            href = urljoin(url, a_tag["href"])
            links.append({
                "text": a_tag.get_text(strip=True) or href,
                "url": href
            })

        return {
            "success": True,
            "data": {
                "url": url,
                "links": links
            }
        }
    except requests.RequestException as e:
        return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}