fix: 修复爬虫功能
This commit is contained in:
parent
f10c5de950
commit
2e64c69776
|
|
@ -5,42 +5,23 @@ from urllib.parse import parse_qs, urlparse, quote
|
||||||
from typing import List
|
from typing import List
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from ddgs import DDGS
|
||||||
|
|
||||||
|
|
||||||
class SearchService:
|
class SearchService:
|
||||||
"""DuckDuckGo search"""
|
"""DuckDuckGo search using ddgs library"""
|
||||||
|
|
||||||
def search(self, query: str, max_results: int = 5) -> List[dict]:
|
def search(self, query: str, max_results: int = 5) -> List[dict]:
|
||||||
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
|
|
||||||
url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
|
|
||||||
|
|
||||||
try:
|
|
||||||
resp = httpx.get(url, headers=headers, timeout=15, follow_redirects=True)
|
|
||||||
resp.raise_for_status()
|
|
||||||
except Exception:
|
|
||||||
return []
|
|
||||||
|
|
||||||
soup = BeautifulSoup(resp.text, "html.parser")
|
|
||||||
results = []
|
results = []
|
||||||
|
try:
|
||||||
for result in soup.select(".result")[:max_results]:
|
for result in DDGS().text(query, max_results=max_results):
|
||||||
title_elem = result.select_one(".result__title a")
|
|
||||||
snippet_elem = result.select_one(".result__snippet")
|
|
||||||
|
|
||||||
if title_elem:
|
|
||||||
raw_url = title_elem.get("href", "")
|
|
||||||
if "uddg=" in raw_url:
|
|
||||||
params = parse_qs(urlparse(raw_url).query)
|
|
||||||
clean_url = params.get("uddg", [raw_url])[0]
|
|
||||||
else:
|
|
||||||
clean_url = raw_url
|
|
||||||
|
|
||||||
results.append({
|
results.append({
|
||||||
"title": title_elem.get_text(strip=True),
|
"title": result.get("title", ""),
|
||||||
"url": clean_url,
|
"url": result.get("href", ""),
|
||||||
"snippet": snippet_elem.get_text(strip=True) if snippet_elem else ""
|
"snippet": result.get("body", "")
|
||||||
})
|
})
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,7 @@ dependencies = [
|
||||||
"email-validator>=2.1.0",
|
"email-validator>=2.1.0",
|
||||||
"shortuuid>=1.0.11",
|
"shortuuid>=1.0.11",
|
||||||
"sse-starlette>=2.0.0",
|
"sse-starlette>=2.0.0",
|
||||||
|
"ddgs>=5.0.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue