diff --git a/luxx/tools/builtin/crawler.py b/luxx/tools/builtin/crawler.py
index 4eb548f..55bcc16 100644
--- a/luxx/tools/builtin/crawler.py
+++ b/luxx/tools/builtin/crawler.py
@@ -2,15 +2,17 @@
 from luxx.tools.factory import tool
 from luxx.tools.services import SearchService, FetchService
 
-# 服务实例(SearchService.search() 是静态方法风格,不需要实例化)
+
 _fetch_service = FetchService()
+_search_service = SearchService()
 
 @tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
     "type": "object",
     "properties": {
         "query": {"type": "string", "description": "Search keywords"},
-        "max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5}
+        "max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5},
+        "region": {"type": "string", "description": "Search region (e.g. cn-zh for China, us-en for US)", "default": "cn-zh"}
     },
     "required": ["query"]
 }, required_params=["query"], category="crawler")
@@ -23,8 +25,9 @@ def web_search(arguments: dict):
     """
     query = arguments["query"]
     max_results = arguments.get("max_results", 5)
 
+    region = arguments.get("region", "cn-zh")
+    results = _search_service.search(query, max_results, region)
-    results = SearchService().search(query, max_results)
     return {
         "query": query,
         "count": len(results),
@@ -35,7 +38,8 @@ def web_search(arguments: dict):
 @tool(name="web_fetch", description="Fetch content from a webpage.", parameters={
     "type": "object",
     "properties": {
-        "url": {"type": "string", "description": "URL to fetch"}
+        "url": {"type": "string", "description": "URL to fetch"},
+        "extract_type": {"type": "string", "description": "Extraction type: text, links, or structured", "enum": ["text", "links", "structured"], "default": "text"}
     },
     "required": ["url"]
 }, required_params=["url"], category="crawler")
@@ -44,17 +48,19 @@ def web_fetch(arguments: dict):
     """
     Fetch webpage content
 
     Returns:
-        {"url": str, "title": str, "text": str}
+        {"url": str, "text/links/structured": ...}
     """
     url = arguments["url"]
+    extract_type = arguments.get("extract_type", "text")
-    return _fetch_service.fetch(url)
+    return _fetch_service.fetch(url, extract_type)
 
 
 @tool(name="batch_fetch", description="Batch fetch multiple webpages.", parameters={
     "type": "object",
     "properties": {
-        "urls": {"type": "array", "items": {"type": "string"}, "description": "URLs to fetch"}
+        "urls": {"type": "array", "items": {"type": "string"}, "description": "URLs to fetch"},
+        "extract_type": {"type": "string", "description": "Extraction type: text, links, or structured", "enum": ["text", "links", "structured"], "default": "text"}
     },
     "required": ["urls"]
 }, required_params=["urls"], category="crawler")
@@ -66,8 +72,12 @@ def batch_fetch(arguments: dict):
         {"count": int, "results": list}
     """
     urls = arguments.get("urls", [])
+    extract_type = arguments.get("extract_type", "text")
 
-    results = _fetch_service.fetch_batch(urls)
+    if len(urls) > 10:
+        return {"error": "Maximum 10 pages can be fetched at once"}
+
+    results = _fetch_service.fetch_batch(urls, extract_type)
 
     return {
         "count": len(results),
diff --git a/luxx/tools/services.py b/luxx/tools/services.py
index 3877896..c350893 100644
--- a/luxx/tools/services.py
+++ b/luxx/tools/services.py
@@ -9,62 +9,205 @@ from ddgs import DDGS
+import re
 
 
 class SearchService:
-    """DuckDuckGo search using ddgs library"""
+    """Search service using DuckDuckGo"""
 
-    def search(self, query: str, max_results: int = 5) -> List[dict]:
-        results = []
-        with DDGS() as client:
-            for result in client.text(query, max_results=max_results):
-                results.append({
-                    "title": result.get("title", ""),
-                    "url": result.get("href", ""),
-                    "snippet": result.get("body", "")
-                })
-        return results
+    def __init__(self, engine: str = "duckduckgo"):
+        self.engine = engine
+
+    def search(
+        self,
+        query: str,
+        max_results: int = 5,
+        region: str = "cn-zh"
+    ) -> List[dict]:
+        """
+        Execute search
+
+        Args:
+            query: Search keywords
+            max_results: Max result count
+            region: Region setting
+
+        Returns:
+            Search result list
+        """
+        if self.engine == "duckduckgo":
+            return self._search_duckduckgo(query, max_results, region)
+        else:
+            raise ValueError(f"Unsupported search engine: {self.engine}")
+
+    def _search_duckduckgo(
+        self,
+        query: str,
+        max_results: int,
+        region: str
+    ) -> List[dict]:
+        """DuckDuckGo search with region support"""
+
+        with DDGS() as ddgs:
+            results = list(ddgs.text(
+                query,
+                max_results=max_results,
+                region=region
+            ))
+
+        return [
+            {
+                "title": r.get("title", ""),
+                "url": r.get("href", ""),
+                "snippet": r.get("body", "")
+            }
+            for r in results
+        ]
 
 
 class FetchService:
-    """Page fetch using httpx with concurrent support"""
+    """Page fetch service with content extraction support"""
 
-    def __init__(self, timeout: float = 15.0):
+    def __init__(self, timeout: float = 30.0, user_agent: str = None):
         self.timeout = timeout
+        self.user_agent = user_agent or (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/120.0.0.0 Safari/537.36"
+        )
 
-    def fetch(self, url: str, extract_type: str = "text") -> dict:
+    def fetch(
+        self,
+        url: str,
+        extract_type: str = "text"
+    ) -> dict:
+        """
+        Fetch a single page
+
+        Args:
+            url: Page URL
+            extract_type: Extract type (text, links, structured)
+
+        Returns:
+            Fetch result
+        """
         if not url.startswith(("http://", "https://")):
             url = "https://" + url
 
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-        }
-        with httpx.Client(timeout=self.timeout, follow_redirects=True) as client:
-            response = client.get(url, headers=headers)
-            response.raise_for_status()
-
-            soup = BeautifulSoup(response.text, "html.parser")
-
-            # Remove script and style elements
-            for script in soup(["script", "style"]):
-                script.decompose()
-
-            title = soup.title.string if soup.title else ""
-            text = soup.get_text(separator="\n", strip=True)
-
+        try:
+            resp = httpx.get(
+                url,
+                timeout=self.timeout,
+                follow_redirects=True,
+                headers={"User-Agent": self.user_agent}
+            )
+            resp.raise_for_status()
+        except Exception as e:
+            return {"error": str(e), "url": url}
+
+        html = resp.text
+        extractor = ContentExtractor(html)
+
+        if extract_type == "text":
             return {
                 "url": url,
-                "title": title[:500] if title else "",
-                "text": text[:15000]
+                "text": extractor.extract_text()
             }
+        elif extract_type == "links":
+            return {
+                "url": url,
+                "links": extractor.extract_links()
+            }
+        else:
+            return extractor.extract_structured(url)
 
-    def fetch_batch(self, urls: List[str], extract_type: str = "text", max_concurrent: int = 5) -> List[dict]:
+    def fetch_batch(
+        self,
+        urls: List[str],
+        extract_type: str = "text",
+        max_concurrent: int = 5
+    ) -> List[dict]:
+        """
+        Batch fetch pages concurrently.
+
+        Args:
+            urls: URL list
+            extract_type: Extract type
+            max_concurrent: Max concurrent requests (1-5, default 5)
+
+        Returns:
+            Result list (same order as input URLs)
+        """
         if len(urls) <= 1:
             return [self.fetch(url, extract_type) for url in urls]
 
-        results = [None] * len(urls)
         max_concurrent = min(max(max_concurrent, 1), 5)
+        results = [None] * len(urls)
 
         with ThreadPoolExecutor(max_workers=max_concurrent) as pool:
-            futures = {pool.submit(self.fetch, url, extract_type): i for i, url in enumerate(urls)}
+            futures = {
+                pool.submit(self.fetch, url, extract_type): i
+                for i, url in enumerate(urls)
+            }
             for future in as_completed(futures):
-                results[futures[future]] = future.result()
+                idx = futures[future]
+                try:
+                    results[idx] = future.result()
+                except Exception as e:
+                    results[idx] = {"error": str(e)}
 
         return results
+
+
+class ContentExtractor:
+    """Content extractor for HTML pages"""
+
+    def __init__(self, html: str):
+        self.html = html
+        self._soup = None
+
+    @property
+    def soup(self):
+        if self._soup is None:
+            self._soup = BeautifulSoup(self.html, "html.parser")
+        return self._soup
+
+    def extract_text(self) -> str:
+        """Extract plain text"""
+        # Remove script, style, nav, footer, header
+        for tag in self.soup(["script", "style", "nav", "footer", "header"]):
+            tag.decompose()
+
+        text = self.soup.get_text(separator="\n", strip=True)
+        # Clean extra whitespace
+        text = re.sub(r"\n{3,}", "\n\n", text)
+        return text
+
+    def extract_links(self) -> List[dict]:
+        """Extract links"""
+        links = []
+        for a in self.soup.find_all("a", href=True):
+            text = a.get_text(strip=True)
+            href = a["href"]
+            if text and href and not href.startswith(("#", "javascript:")):
+                links.append({"text": text, "href": href})
+        return links[:50]  # Limit count
+
+    def extract_structured(self, url: str = "") -> dict:
+        """Extract structured content"""
+        soup = self.soup
+
+        # Extract title
+        title = ""
+        if soup.title:
+            title = soup.title.string or ""
+
+        # Extract meta description
+        description = ""
+        meta_desc = soup.find("meta", attrs={"name": "description"})
+        if meta_desc:
+            description = meta_desc.get("content", "")
+
+        return {
+            "url": url,
+            "title": title.strip() if title else "",
+            "description": description.strip(),
+            "text": self.extract_text()[:5000],
+            "links": self.extract_links()[:20]
+        }