"""Web crawler tools""" import requests from typing import Dict, Any, List, Optional from bs4 import BeautifulSoup from luxx.tools.factory import tool @tool( name="web_search", description="Search the internet for information using web search", parameters={ "type": "object", "properties": { "query": { "type": "string", "description": "Search keywords" }, "max_results": { "type": "integer", "description": "Maximum number of results to return", "default": 5 } }, "required": ["query"] }, category="crawler" ) def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]: """ Execute web search Note: This is a placeholder implementation, real usage requires integrating with actual search APIs such as: Google Custom Search, DuckDuckGo, SerpAPI, etc. """ query = arguments.get("query", "") max_results = arguments.get("max_results", 5) if not query: return {"success": False, "error": "Query is required"} # Simulated search results # Real implementation should integrate with actual search API return { "success": True, "data": { "query": query, "results": [ { "title": f"Result for '{query}' - Example {i+1}", "url": f"https://example.com/result_{i+1}", "snippet": f"This is a sample search result for the query '{query}'. " * 3 } for i in range(min(max_results, 5)) ] } } @tool( name="web_fetch", description="Fetch and parse content from a web page", parameters={ "type": "object", "properties": { "url": { "type": "string", "description": "URL of the web page to fetch" }, "extract_text": { "type": "boolean", "description": "Whether to extract text content only", "default": True } }, "required": ["url"] }, category="crawler" ) def web_fetch(arguments: Dict[str, Any]) -> Dict[str, Any]: """Fetch and parse web page content""" url = arguments.get("url", "") extract_text = arguments.get("extract_text", True) if not url: return {"success": False, "error": "URL is required"} # Simple URL validation if not url.startswith(("http://", "https://")): url = "https://" + url try: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" } response = requests.get(url, headers=headers, timeout=10) response.raise_for_status() if extract_text: soup = BeautifulSoup(response.text, "html.parser") # Remove script and style tags for tag in soup(["script", "style"]): tag.decompose() text = soup.get_text(separator="\n", strip=True) # Clean up extra blank lines lines = [line.strip() for line in text.split("\n") if line.strip()] text = "\n".join(lines) return { "success": True, "data": { "url": url, "title": soup.title.string if soup.title else "", "content": text[:10000] # Limit content length } } else: return { "success": True, "data": { "url": url, "html": response.text[:50000] # Limit HTML length } } except requests.RequestException as e: return {"success": False, "error": f"Failed to fetch URL: {str(e)}"} @tool( name="extract_links", description="Extract all links from a web page", parameters={ "type": "object", "properties": { "url": { "type": "string", "description": "URL of the web page" }, "max_links": { "type": "integer", "description": "Maximum number of links to extract", "default": 20 } }, "required": ["url"] }, category="crawler" ) def extract_links(arguments: Dict[str, Any]) -> Dict[str, Any]: """Extract all links from a web page""" url = arguments.get("url", "") max_links = arguments.get("max_links", 20) if not url: return {"success": False, "error": "URL is required"} if not url.startswith(("http://", "https://")): url = "https://" + url try: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" } response = requests.get(url, headers=headers, timeout=10) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") links = [] for a_tag in soup.find_all("a", href=True)[:max_links]: href = a_tag["href"] # Handle relative URLs if href.startswith("/"): from urllib.parse import urljoin href = urljoin(url, href) links.append({ "text": a_tag.get_text(strip=True) or href, "url": href }) return { "success": True, "data": { "url": url, "links": links } } except requests.RequestException as e: return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}