129 lines
3.5 KiB
Python
129 lines
3.5 KiB
Python
"""Crawler related tools"""
|
|
from luxx.tools.factory import tool
|
|
from luxx.tools.services import SearchService, FetchService
|
|
|
|
|
|
@tool(
    name="web_search",
    description="Search the internet for information. Use when you need to find latest news or answer questions.",
    parameters={
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search keywords"
            },
            "max_results": {
                "type": "integer",
                "description": "Number of results to return, default 5",
                "default": 5
            }
        },
        "required": ["query"]
    },
    category="crawler"
)
def web_search(arguments: dict) -> dict:
    """
    Web search tool using DuckDuckGo.

    Args:
        arguments: Tool-call arguments. Expected keys:
            query (str): search keywords (required).
            max_results (int): number of results to return, defaults to 5.

    Returns:
        dict: ``{"success": True, "data": {"query": ..., "results": [...]}}``
        on success (plus a "message" key when nothing was found), or
        ``{"success": False, "error": ...}`` when the query is missing.
    """
    # Use .get so a missing or empty query yields a structured error
    # instead of a KeyError — consistent with web_fetch / batch_fetch.
    query = arguments.get("query", "")
    max_results = arguments.get("max_results", 5)

    if not query:
        return {"success": False, "error": "Query is required"}

    service = SearchService()
    results = service.search(query, max_results)

    if not results:
        return {"success": True, "data": {"query": query, "results": []}, "message": "No results found"}

    return {"success": True, "data": {"query": query, "results": results}}
|
|
|
|
|
|
@tool(
    name="web_fetch",
    description="Fetch content from a webpage. Use when user needs detailed information from a page.",
    parameters={
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL of the webpage to fetch"
            },
            "extract_type": {
                "type": "string",
                "description": "Extraction type: text, links, or structured",
                "enum": ["text", "links", "structured"],
                "default": "text"
            }
        },
        "required": ["url"]
    },
    category="crawler"
)
def web_fetch(arguments: dict) -> dict:
    """
    Page fetch tool.

    Args:
        arguments: Tool-call arguments. Expected keys:
            url (str): URL of the webpage to fetch (required).
            extract_type (str): "text", "links", or "structured";
                defaults to "text".

    Returns:
        dict: ``{"success": True, "data": ...}`` on success, or
        ``{"success": False, "error": ...}`` when the URL is missing
        or the fetch fails.
    """
    # .get instead of ["url"]: previously a missing key raised KeyError
    # before the `if not url` guard could return the structured error.
    url = arguments.get("url", "")
    extract_type = arguments.get("extract_type", "text")

    if not url:
        return {"success": False, "error": "URL is required"}

    service = FetchService(timeout=15)
    result = service.fetch(url, extract_type)

    # FetchService signals failure via an "error" key in its result dict.
    if "error" in result:
        return {"success": False, "error": result["error"]}

    return {"success": True, "data": result}
|
|
|
|
|
|
@tool(
    name="batch_fetch",
    description="Batch fetch multiple webpages. Use when you need to get content from multiple pages.",
    parameters={
        "type": "object",
        "properties": {
            "urls": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of URLs to fetch"
            },
            "extract_type": {
                "type": "string",
                "enum": ["text", "links", "structured"],
                "default": "text"
            }
        },
        "required": ["urls"]
    },
    category="crawler"
)
def batch_fetch(arguments: dict) -> dict:
    """
    Batch fetch tool.

    Args:
        arguments: Tool-call arguments. Expected keys:
            urls (list[str]): URLs to fetch (required, at most 10).
            extract_type (str): "text", "links", or "structured";
                defaults to "text".

    Returns:
        dict: ``{"success": True, "data": {"results": [...], "total": N,
        "successful": M}}`` on success, or ``{"success": False,
        "error": ...}`` when the list is missing, empty, or too long.
    """
    # .get instead of ["urls"]: previously a missing key raised KeyError
    # before the `if not urls` guard could return the structured error.
    urls = arguments.get("urls", [])
    extract_type = arguments.get("extract_type", "text")

    if not urls:
        return {"success": False, "error": "URLs list is required"}

    # Hard cap to keep a single tool call bounded.
    if len(urls) > 10:
        return {"success": False, "error": "Maximum 10 pages allowed"}

    service = FetchService(timeout=10)
    results = service.fetch_batch(urls, extract_type)

    # Per-page failures are reported via an "error" key in each result.
    successful = sum(1 for r in results if "error" not in r)

    return {
        "success": True,
        "data": {
            "results": results,
            "total": len(results),
            "successful": successful
        }
    }
|