fix: 修复爬虫功能 (fix crawler functionality: switch search/fetch to the ddgs library)
This commit is contained in:
parent
f10c5de950
commit
7c998d6de3
|
|
@ -5,84 +5,49 @@ from urllib.parse import parse_qs, urlparse, quote
|
|||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from bs4 import BeautifulSoup
|
||||
from ddgs import DDGS
|
||||
|
||||
|
||||
class SearchService:
    """DuckDuckGo search using the ddgs library."""

    def search(self, query: str, max_results: int = 5) -> List[dict]:
        """Run a DuckDuckGo text search.

        Args:
            query: Search query string.
            max_results: Maximum number of results to return.

        Returns:
            A list of dicts with ``"title"``, ``"url"`` and ``"snippet"``
            keys. Returns whatever was collected so far (possibly an empty
            list) if the underlying search raises — this is a deliberate
            best-effort contract, matching the previous implementation
            which returned ``[]`` on HTTP failure.
        """
        results: List[dict] = []
        try:
            # ddgs result dicts use "title" / "href" / "body" keys;
            # normalize them to this service's "title" / "url" / "snippet".
            for item in DDGS().text(query, max_results=max_results):
                results.append({
                    "title": item.get("title", ""),
                    "url": item.get("href", ""),
                    "snippet": item.get("body", "")
                })
        except Exception:
            # Best-effort: swallow search errors and return partial results.
            pass
        return results
|
||||
|
||||
|
||||
class FetchService:
    """Page fetch using ddgs, with concurrent support (see ``fetch_batch``)."""

    def __init__(self, timeout: float = 15.0):
        # timeout / user_agent are kept for interface compatibility with
        # callers and with fetch_batch; the ddgs fetch path below does not
        # consume them directly.
        self.timeout = timeout
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    def fetch(self, url: str, extract_type: str = "text") -> dict:
        """Fetch a single page via the ddgs library.

        Args:
            url: Target URL; an ``https://`` scheme is prepended when missing.
            extract_type: Retained for backward compatibility with callers
                ("text" / "links" / "structured"); the ddgs fetch path
                returns plain text regardless. TODO confirm whether
                structured extraction should be reinstated.

        Returns:
            On success: ``{"url", "title", "text"}`` with text capped at
            15000 chars. On failure: ``{"error": <message>}``. When ddgs
            returns no results: empty ``title``/``text``.
        """
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        try:
            # NOTE(review): relies on ddgs exposing a fetch() that returns
            # {"results": [{"title": ..., "text": ...}, ...]} — verify
            # against the installed ddgs version.
            result = DDGS().fetch(url)
            if result and result.get("results"):
                page_data = result["results"][0]
                return {
                    "url": url,
                    "title": page_data.get("title", ""),
                    # Cap page text to keep downstream payloads bounded.
                    "text": page_data.get("text", "")[:15000]
                }
        except Exception as e:
            return {"error": str(e)}
        # No results returned: keep the dict shape stable for callers.
        return {"url": url, "title": "", "text": ""}
||||
def fetch_batch(self, urls: List[str], extract_type: str = "text", max_concurrent: int = 5) -> List[dict]:
|
||||
if len(urls) <= 1:
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ dependencies = [
|
|||
"email-validator>=2.1.0",
|
||||
"shortuuid>=1.0.11",
|
||||
"sse-starlette>=2.0.0",
|
||||
"ddgs>=5.0.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
|
|
|||
Loading…
Reference in New Issue