fix: 修复爬虫功能

This commit is contained in:
ViperEkura 2026-04-13 15:19:18 +08:00
parent f10c5de950
commit 61bf235a15
2 changed files with 33 additions and 54 deletions

View File

@ -5,84 +5,62 @@ from urllib.parse import parse_qs, urlparse, quote
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List

from bs4 import BeautifulSoup
from ddgs import DDGS
class SearchService:
    """DuckDuckGo web search backed by the ddgs library.

    Results are normalized to plain dicts with "title", "url" and
    "snippet" keys so callers do not depend on the ddgs result schema.
    """

    def search(self, query: str, max_results: int = 5) -> List[dict]:
        """Run a text search and return up to ``max_results`` result dicts.

        Best-effort: any ddgs/network failure returns whatever results were
        collected so far (possibly an empty list) instead of raising.
        """
        results: List[dict] = []
        try:
            for item in DDGS().text(query, max_results=max_results):
                results.append({
                    "title": item.get("title", ""),
                    "url": item.get("href", ""),
                    "snippet": item.get("body", "")
                })
        except Exception:
            # NOTE(review): swallowing every exception hides rate limits and
            # network faults from callers -- consider logging here.
            pass
        return results
class FetchService:
    """Fetch web pages with httpx and extract readable content."""

    def __init__(self, timeout: float = 15.0):
        # Per-request timeout in seconds, passed to the httpx client.
        self.timeout = timeout

    def fetch(self, url: str, extract_type: str = "text") -> dict:
        """Fetch ``url`` and extract content according to ``extract_type``.

        extract_type:
            "text" (default) -- page title plus visible text (truncated).
            "links"          -- anchor texts and hrefs (at most 50).
            "structured"     -- title, meta description and a text excerpt.

        Returns an ``{"error": ...}`` dict on any failure instead of raising.
        """
        # Scheme-less input is assumed to be https.
        if not url.startswith(("http://", "https://")):
            url = "https://" + url
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        try:
            with httpx.Client(timeout=self.timeout, follow_redirects=True) as client:
                response = client.get(url, headers=headers)
                response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            # Capture the title before stripping markup.
            title = soup.title.string if soup.title else ""
            # Drop non-content elements so extracted text stays readable.
            for tag in soup(["script", "style"]):
                tag.decompose()
            if extract_type == "links":
                links = [
                    {"text": a.get_text(strip=True), "url": a["href"]}
                    for a in soup.find_all("a", href=True)
                    if a.get_text(strip=True) and not a["href"].startswith(("#", "javascript:"))
                ]
                return {"url": url, "links": links[:50]}
            text = soup.get_text(separator="\n", strip=True)
            if extract_type == "structured":
                meta = soup.find("meta", attrs={"name": "description"})
                return {
                    "url": url,
                    "title": title[:500] if title else "",
                    "description": meta.get("content", "") if meta else "",
                    "text": text[:5000]
                }
            return {
                "url": url,
                "title": title[:500] if title else "",
                "text": text[:15000]
            }
        except Exception as e:
            return {"error": str(e)}
def fetch_batch(self, urls: List[str], extract_type: str = "text", max_concurrent: int = 5) -> List[dict]: def fetch_batch(self, urls: List[str], extract_type: str = "text", max_concurrent: int = 5) -> List[dict]:
if len(urls) <= 1: if len(urls) <= 1:

View File

@ -22,6 +22,7 @@ dependencies = [
    "email-validator>=2.1.0",
    "shortuuid>=1.0.11",
    "sse-starlette>=2.0.0",
    "ddgs>=5.0.0",
]

[project.optional-dependencies]