87 lines
2.9 KiB
Python
87 lines
2.9 KiB
Python
"""Crawler tools - all exception handling in decorator"""
|
|
from luxx.tools.factory import tool
|
|
from luxx.tools.services import SearchService, FetchService
|
|
|
|
|
|
# Module-level singletons shared by every tool invocation in this module.
# NOTE(review): assumes FetchService/SearchService are safe to reuse across
# calls (stateless or internally synchronized) — confirm in their definitions.
_fetch_service = FetchService()
_search_service = SearchService()
|
|
|
|
@tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
    "type": "object",
    "properties": {
        "query": {"type": "string", "description": "Search keywords"},
        "max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5},
        "region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"},
    },
    "required": ["query"]
}, required_params=["query"], category="crawler")
def web_search(arguments: dict):
    """Run a web search through the shared SearchService.

    Args:
        arguments: Tool arguments. ``query`` is required; ``max_results``
            (default 5) and ``region`` (default "cn-zh") are optional.

    Returns:
        {"query": str, "count": int, "results": list}
    """
    search_query = arguments["query"]
    # Exception handling is intentionally left to the @tool decorator
    # (see module docstring), so the service call is not wrapped here.
    hits = _search_service.search(
        search_query,
        arguments.get("max_results", 5),
        arguments.get("region", "cn-zh"),
    )
    return {"query": search_query, "count": len(hits), "results": hits}
|
|
|
|
|
|
@tool(name="web_fetch", description="Fetch content from a webpage.", parameters={
    "type": "object",
    "properties": {
        "url": {"type": "string", "description": "URL to fetch"},
        "extract_type": {"type": "string", "description": "Extraction type: text, links, or structured", "enum": ["text", "links", "structured"], "default": "text"}
    },
    "required": ["url"]
}, required_params=["url"], category="crawler")
def web_fetch(arguments: dict):
    """Fetch a single webpage through the shared FetchService.

    Args:
        arguments: Tool arguments. ``url`` is required; ``extract_type``
            is one of "text", "links", or "structured" (default "text").

    Returns:
        {"url": str, "text/links/structured": ...}
    """
    # Errors propagate to the @tool decorator, which owns exception handling.
    mode = arguments.get("extract_type", "text")
    return _fetch_service.fetch(arguments["url"], mode)
|
|
|
|
|
|
@tool(name="batch_fetch", description="Batch fetch multiple webpages.", parameters={
    "type": "object",
    "properties": {
        "urls": {"type": "array", "items": {"type": "string"}, "description": "URLs to fetch"},
        "extract_type": {"type": "string", "description": "Extraction type: text, links, or structured", "enum": ["text", "links", "structured"], "default": "text"}
    },
    "required": ["urls"]
}, required_params=["urls"], category="crawler")
def batch_fetch(arguments: dict):
    """Fetch several webpages in one call via the shared FetchService.

    Args:
        arguments: Tool arguments. ``urls`` (list of URL strings) is
            required; ``extract_type`` defaults to "text".

    Returns:
        {"count": int, "results": list} on success, or {"error": str}
        when more than 10 URLs are requested.
    """
    target_urls = arguments.get("urls", [])
    mode = arguments.get("extract_type", "text")

    # Hard cap so one tool call cannot fan out into an unbounded crawl.
    if len(target_urls) > 10:
        return {"error": "Maximum 10 pages can be fetched at once"}

    fetched = _fetch_service.fetch_batch(target_urls, mode)
    return {"count": len(fetched), "results": fetched}
|