feat: 完善爬虫功能

This commit is contained in:
ViperEkura 2026-04-13 08:38:12 +08:00
parent 805f8c86da
commit a84b8617a6
17 changed files with 422 additions and 172 deletions

2
.gitignore vendored
View File

@ -9,7 +9,7 @@
!README.md
!.gitignore
!luxx/**/*.py
!*.py
!asserts/**/*.md
# Dashboard

View File

@ -5,9 +5,13 @@
- **框架**: FastAPI 0.109+
- **数据库**: SQLAlchemy 2.0+
- **认证**: JWT (PyJWT)
- **HTTP客户端**: httpx
- **HTTP客户端**: httpx, requests
- **配置**: YAML (PyYAML)
- **代码执行**: Python 原生执行
- **网页爬虫**:
- `httpx` - HTTP 客户端
- `beautifulsoup4` - HTML 解析
- `lxml` - XML/HTML 解析器
## 目录结构
@ -36,6 +40,7 @@ luxx/
│ ├── crawler.py # 网页爬虫
│ ├── data.py # 数据处理
│ └── weather.py # 天气查询
│ └── services.py # 工具服务层
└── utils/ # 工具函数
└── helpers.py
```
@ -205,7 +210,9 @@ classDiagram
|------|------|------|
| `python_execute` | 执行 Python 代码 | 支持 print 输出、变量访问 |
| `python_eval` | 计算表达式 | 快速求值 |
| `web_crawl` | 网页抓取 | BeautifulSoup + httpx |
| `web_search` | DuckDuckGo HTML | DuckDuckGo HTML 搜索 |
| `web_fetch` | 网页抓取 | httpx + BeautifulSoup，支持 text/links/structured |
| `batch_fetch` | 批量抓取 | 并发获取多个页面 |
| `get_weather` | 天气查询 | 支持城市名查询 |
| `process_data` | 数据处理 | JSON 转换、格式化等 |

View File

@ -7,7 +7,7 @@ app:
database:
type: sqlite
url: sqlite:///../chat.db
url: sqlite:///./chat.db
llm:
provider: deepseek

View File

@ -66,7 +66,9 @@ const renderedContent = computed(() => {
function formatTime(time) {
if (!time) return ''
return new Date(time).toLocaleTimeString('zh-CN', { hour: '2-digit', minute: '2-digit' })
const date = new Date(time)
// 使
return date.toLocaleTimeString('zh-CN', { hour: '2-digit', minute: '2-digit' })
}
function copyContent() {

View File

@ -9,10 +9,11 @@
<span class="step-label">思考中</span>
<span class="step-brief">{{ item.brief || '正在思考...' }}</span>
<span v-if="streaming && item.key === lastThinkingKey" class="loading-dots">...</span>
<span v-else-if="item.content && item.content.length > 1024" class="truncate-hint">已截断</span>
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
</div>
<div v-if="expandedKeys.has(item.key)" class="step-content">
<div class="thinking-text">{{ item.content }}</div>
<div class="thinking-text">{{ item.displayContent }}</div>
</div>
</div>
@ -25,6 +26,7 @@
<span v-if="item.loading" class="loading-dots">...</span>
<span v-else-if="item.isSuccess === true" class="step-badge success">成功</span>
<span v-else-if="item.isSuccess === false" class="step-badge error">失败</span>
<span v-if="item.fullResult && item.fullResult.length > 1024" class="truncate-hint">已截断</span>
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
</div>
<div v-if="expandedKeys.has(item.key)" class="step-content">
@ -34,7 +36,7 @@
</div>
<div v-if="item.resultSummary || item.fullResult" class="tool-detail" style="margin-top: 8px;">
<span class="detail-label">结果</span>
<pre>{{ item.fullResult || item.resultSummary }}</pre>
<pre>{{ item.displayResult }}</pre>
</div>
</div>
</div>
@ -71,12 +73,14 @@ const allItems = computed(() => {
if (props.processSteps && props.processSteps.length > 0) {
for (const step of props.processSteps) {
if (step.type === 'thinking') {
const content = step.content || ''
items.push({
key: step.id || `thinking-${step.index}`,
type: 'thinking',
index: step.index,
content: step.content || '',
brief: step.content ? step.content.slice(0, 50) + (step.content.length > 50 ? '...' : '') : '',
content: content,
displayContent: content.length > 1024 ? content.slice(0, 1024) + '\n\n[... 内容已截断 ...]' : content,
brief: content.slice(0, 50) + (content.length > 50 ? '...' : ''),
})
} else if (step.type === 'tool_call') {
items.push({
@ -97,12 +101,15 @@ const allItems = computed(() => {
const toolId = step.id_ref || step.id
const match = items.findLast(it => it.type === 'tool_call' && it.id === toolId)
if (match) {
match.resultSummary = step.content ? step.content.slice(0, 200) : ''
match.fullResult = step.content || ''
const resultContent = step.content || ''
match.resultSummary = resultContent.slice(0, 200)
match.fullResult = resultContent
match.displayResult = resultContent.length > 1024 ? resultContent.slice(0, 1024) + '\n\n[... 结果已截断 ...]' : resultContent
match.isSuccess = step.success !== false
match.loading = false
} else {
// tool_call
const placeholderContent = step.content || ''
items.push({
key: `result-${step.id || step.index}`,
type: 'tool_call',
@ -113,8 +120,9 @@ const allItems = computed(() => {
brief: step.name || '工具结果',
loading: false,
isSuccess: true,
resultSummary: step.content ? step.content.slice(0, 200) : '',
fullResult: step.content || ''
resultSummary: placeholderContent.slice(0, 200),
fullResult: placeholderContent,
displayResult: placeholderContent.length > 1024 ? placeholderContent.slice(0, 1024) + '\n\n[... 结果已截断 ...]' : placeholderContent
})
}
} else if (step.type === 'text') {
@ -280,6 +288,15 @@ const sparkleIcon = `<svg viewBox="0 0 24 24" width="14" height="14" fill="none"
color: var(--success-color);
}
.truncate-hint {
font-size: 10px;
padding: 2px 6px;
background: var(--warning-bg);
color: var(--warning-color);
border-radius: 4px;
margin-left: 4px;
}
.step-badge.error {
background: var(--danger-bg);
color: var(--danger-color);

View File

@ -41,6 +41,8 @@
/* 状态颜色 */
--success-color: #059669;
--success-bg: rgba(16, 185, 129, 0.1);
--warning-color: #d97706;
--warning-bg: rgba(217, 119, 6, 0.1);
--danger-color: #ef4444;
--danger-bg: rgba(239, 68, 68, 0.08);
@ -112,6 +114,8 @@
--success-color: #34d399;
--success-bg: rgba(52, 211, 153, 0.15);
--warning-color: #fbbf24;
--warning-bg: rgba(251, 191, 36, 0.15);
--danger-color: #f87171;
--danger-bg: rgba(248, 113, 113, 0.15);

View File

@ -52,9 +52,9 @@ const blockMathExtension = {
}
marked.use({
extensions: [blockMathExtension, mathExtension],
gfm: true,
breaks: true,
gfm: true
extensions: [blockMathExtension, mathExtension]
})
export function renderMarkdown(text) {

View File

@ -24,7 +24,7 @@
</div>
</div>
<div ref="messagesContainer" class="messages-container">
<div ref="messagesContainer" class="messages-container" @scroll="handleScroll">
<div v-if="loading" class="load-more-top">
<span>加载中...</span>
</div>
@ -106,6 +106,7 @@ const sending = ref(false)
const streamingMessage = ref(null)
const messagesContainer = ref(null)
const textareaRef = ref(null)
const autoScroll = ref(true)
const conversationId = ref(route.params.id)
const conversationTitle = ref('')
@ -128,6 +129,7 @@ function onKeydown(e) {
}
const loadMessages = async () => {
autoScroll.value = true
loading.value = true
try {
const res = await messagesAPI.list(conversationId.value)
@ -191,6 +193,7 @@ const sendMessage = async () => {
{ conversation_id: conversationId.value, content },
{
onProcessStep: (step) => {
autoScroll.value = true //
if (!streamingMessage.value) return
// id
const idx = streamingMessage.value.process_steps.findIndex(s => s.id === step.id)
@ -202,6 +205,7 @@ const sendMessage = async () => {
},
onDone: () => {
//
autoScroll.value = true
if (streamingMessage.value) {
messages.value.push({
...streamingMessage.value,
@ -230,6 +234,7 @@ const sendMessage = async () => {
}
const scrollToBottom = () => {
if (!autoScroll.value) return
nextTick(() => {
if (messagesContainer.value) {
messagesContainer.value.scrollTo({
@ -240,6 +245,15 @@ const scrollToBottom = () => {
})
}
//
const handleScroll = () => {
if (!messagesContainer.value) return
const { scrollTop, scrollHeight, clientHeight } = messagesContainer.value
const distanceToBottom = scrollHeight - scrollTop - clientHeight
// 50px
autoScroll.value = distanceToBottom < 50
}
//
watch(() => streamingMessage.value?.process_steps?.length, () => {
if (streamingMessage.value) {

View File

@ -102,6 +102,11 @@
<label>模型名称</label>
<input v-model="form.default_model" placeholder="deepseek-chat / gpt-4" required />
</div>
<div class="form-group">
<label>最大 Tokens</label>
<input v-model.number="form.max_tokens" type="number" placeholder="8192" min="1" />
<span class="hint">单次回复最大 token 默认 8192</span>
</div>
<div class="form-group">
<label class="switch-card" :class="{ active: form.is_default }">
<div class="switch-content">
@ -201,7 +206,7 @@ const testResult = ref(null)
const formError = ref('')
const form = ref({
name: '', base_url: '', api_key: '', default_model: '', is_default: false
name: '', base_url: '', api_key: '', default_model: '', max_tokens: 8192, is_default: false
})
const fetchProviders = async () => {
@ -218,7 +223,7 @@ const fetchProviders = async () => {
const closeModal = () => {
showModal.value = false
editing.value = null
form.value = { name: '', base_url: '', api_key: '', default_model: '', is_default: false }
form.value = { name: '', base_url: '', api_key: '', default_model: '', max_tokens: 8192, is_default: false }
formError.value = ''
}
@ -232,6 +237,7 @@ const editProvider = async (p) => {
base_url: res.data.base_url,
api_key: res.data.api_key || '',
default_model: res.data.default_model,
max_tokens: res.data.max_tokens || 8192,
is_default: res.data.is_default
}
}
@ -381,6 +387,7 @@ input:checked + .slider:before { transform: translateX(22px); }
.switch-card input:checked + .slider { background-color: var(--accent); }
.switch-card input:checked + .slider:before { transform: translateX(22px); }
.modal-actions { display: flex; justify-content: flex-end; gap: 1rem; margin-top: 1.5rem; }
.form-group .hint { font-size: 0.85rem; color: var(--text); margin-top: 4px; display: block; }
.spinner { width: 48px; height: 48px; border: 4px solid var(--border); border-top-color: var(--accent); border-radius: 50%; animation: spin 1s linear infinite; margin: 0 auto 1rem; }
@keyframes spin { to { transform: rotate(360deg); } }
</style>

View File

@ -7,6 +7,10 @@ from sqlalchemy.orm import Mapped, mapped_column, relationship
from luxx.database import Base
def local_now():
return datetime.now()
class LLMProvider(Base):
"""LLM Provider configuration model"""
__tablename__ = "llm_providers"
@ -18,10 +22,11 @@ class LLMProvider(Base):
base_url: Mapped[str] = mapped_column(String(500), nullable=False)
api_key: Mapped[str] = mapped_column(String(500), nullable=False)
default_model: Mapped[str] = mapped_column(String(100), nullable=False, default="gpt-4")
max_tokens: Mapped[int] = mapped_column(Integer, default=8192) # 默认 8192
is_default: Mapped[bool] = mapped_column(Boolean, default=False)
enabled: Mapped[bool] = mapped_column(Boolean, default=True)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
# Relationships
user: Mapped["User"] = relationship("User", backref="llm_providers")
@ -35,6 +40,7 @@ class LLMProvider(Base):
"provider_type": self.provider_type,
"base_url": self.base_url,
"default_model": self.default_model,
"max_tokens": self.max_tokens,
"is_default": self.is_default,
"enabled": self.enabled,
"created_at": self.created_at.isoformat() if self.created_at else None,
@ -53,8 +59,8 @@ class Project(Base):
user_id: Mapped[int] = mapped_column(Integer, ForeignKey("users.id"), nullable=False)
name: Mapped[str] = mapped_column(String(255), nullable=False)
description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
# Relationships
user: Mapped["User"] = relationship("User", backref="projects")
@ -70,7 +76,7 @@ class User(Base):
password_hash: Mapped[Optional[str]] = mapped_column(String(255), nullable=True)
role: Mapped[str] = mapped_column(String(20), default="user")
is_active: Mapped[bool] = mapped_column(Boolean, default=True)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
# Relationships
conversations: Mapped[List["Conversation"]] = relationship(
@ -102,8 +108,8 @@ class Conversation(Base):
temperature: Mapped[float] = mapped_column(Float, default=0.7)
max_tokens: Mapped[int] = mapped_column(Integer, default=2000)
thinking_enabled: Mapped[bool] = mapped_column(Boolean, default=False)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
# Relationships
user: Mapped["User"] = relationship("User", back_populates="conversations")
@ -161,7 +167,7 @@ class Message(Base):
role: Mapped[str] = mapped_column(String(16), nullable=False) # user, assistant, system, tool
content: Mapped[str] = mapped_column(Text, nullable=False, default="")
token_count: Mapped[int] = mapped_column(Integer, default=0)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
# Relationships
conversation: Mapped["Conversation"] = relationship("Conversation", back_populates="messages")

View File

@ -79,8 +79,8 @@ def send_message(
)
db.add(user_message)
from datetime import datetime
conversation.updated_at = datetime.utcnow()
from datetime import datetime, timezone, timedelta
conversation.updated_at = datetime.now(timezone(timedelta(hours=8)))
response = chat_service.non_stream_response(
conversation=conversation,
@ -133,7 +133,7 @@ async def stream_message(
token_count=len(data.content) // 4
)
db.add(user_message)
conversation.updated_at = datetime.utcnow()
conversation.updated_at = datetime.now()
db.commit()
async def event_generator():

View File

@ -20,7 +20,8 @@ def _sse_event(event: str, data: dict) -> str:
def get_llm_client(conversation: Conversation = None):
"""Get LLM client, optionally using conversation's provider"""
"""Get LLM client, optionally using conversation's provider. Returns (client, max_tokens)"""
max_tokens = None
if conversation and conversation.provider_id:
from luxx.models import LLMProvider
from luxx.database import SessionLocal
@ -28,18 +29,19 @@ def get_llm_client(conversation: Conversation = None):
try:
provider = db.query(LLMProvider).filter(LLMProvider.id == conversation.provider_id).first()
if provider:
max_tokens = provider.max_tokens
client = LLMClient(
api_key=provider.api_key,
api_url=provider.base_url,
model=provider.default_model
)
return client
return client, max_tokens
finally:
db.close()
# Fallback to global config
client = LLMClient()
return client
return client, max_tokens
class ChatService:
@ -112,8 +114,10 @@ class ChatService:
tools = registry.list_all() if tools_enabled else None
llm = get_llm_client(conversation)
llm, provider_max_tokens = get_llm_client(conversation)
model = conversation.model or llm.default_model or "gpt-4"
    # 使用 provider 的 max_tokens；如果 conversation 有自己的 max_tokens 则覆盖
max_tokens = conversation.max_tokens if hasattr(conversation, 'max_tokens') and conversation.max_tokens else provider_max_tokens
# State tracking
all_steps = []
@ -146,7 +150,7 @@ class ChatService:
messages=messages,
tools=tools,
temperature=conversation.temperature,
max_tokens=conversation.max_tokens
max_tokens=max_tokens or 8192
):
# Parse SSE line
# Format: "event: xxx\ndata: {...}\n\n"

View File

@ -1,14 +1,11 @@
"""Web crawler tools"""
import requests
from typing import Dict, Any, List, Optional
from bs4 import BeautifulSoup
"""Crawler related tools"""
from luxx.tools.factory import tool
from luxx.tools.services import SearchService, FetchService
@tool(
name="web_search",
description="Search the internet for information using web search",
description="Search the internet for information. Use when you need to find latest news or answer questions.",
parameters={
"type": "object",
"properties": {
@ -18,7 +15,7 @@ from luxx.tools.factory import tool
},
"max_results": {
"type": "integer",
"description": "Maximum number of results to return",
"description": "Number of results to return, default 5",
"default": 5
}
},
@ -26,40 +23,25 @@ from luxx.tools.factory import tool
},
category="crawler"
)
def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]:
def web_search(arguments: dict) -> dict:
"""
Execute web search
Note: This is a placeholder implementation, real usage requires integrating with actual search APIs
such as: Google Custom Search, DuckDuckGo, SerpAPI, etc.
Web search tool using DuckDuckGo
"""
query = arguments.get("query", "")
query = arguments["query"]
max_results = arguments.get("max_results", 5)
if not query:
return {"success": False, "error": "Query is required"}
service = SearchService()
results = service.search(query, max_results)
# Simulated search results
# Real implementation should integrate with actual search API
return {
"success": True,
"data": {
"query": query,
"results": [
{
"title": f"Result for '{query}' - Example {i+1}",
"url": f"https://example.com/result_{i+1}",
"snippet": f"This is a sample search result for the query '{query}'. " * 3
}
for i in range(min(max_results, 5))
]
}
}
if not results:
return {"success": True, "data": {"query": query, "results": []}, "message": "No results found"}
return {"success": True, "data": {"query": query, "results": results}}
@tool(
    name="web_fetch",
    description="Fetch content from a webpage. Use when user needs detailed information from a page.",
    parameters={
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": "URL of the webpage to fetch"
            },
            "extract_type": {
                "type": "string",
                "description": "Extraction type: text, links, or structured",
                "enum": ["text", "links", "structured"],
                "default": "text"
            }
        },
        "required": ["url"]
    },
    category="crawler"
)
def web_fetch(arguments: dict) -> dict:
    """
    Page fetch tool.

    Args:
        arguments: Tool arguments:
            - url (str, required): page to fetch
            - extract_type (str): "text" (default), "links", or "structured"

    Returns:
        {"success": True, "data": <extraction result>} on success,
        {"success": False, "error": <message>} on failure.
    """
    url = arguments["url"]
    extract_type = arguments.get("extract_type", "text")

    service = FetchService(timeout=15)
    result = service.fetch(url, extract_type)

    # FetchService reports failures in-band via an "error" key rather
    # than raising, so translate that into the tool error envelope.
    if "error" in result:
        return {"success": False, "error": result["error"]}

    return {"success": True, "data": result}
@tool(
    name="batch_fetch",
    description="Batch fetch multiple webpages. Use when you need to get content from multiple pages.",
    parameters={
        "type": "object",
        "properties": {
            "urls": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of URLs to fetch"
            },
            "extract_type": {
                "type": "string",
                "enum": ["text", "links", "structured"],
                "default": "text"
            }
        },
        "required": ["urls"]
    },
    category="crawler"
)
def batch_fetch(arguments: dict) -> dict:
    """
    Batch fetch tool.

    Args:
        arguments: Tool arguments:
            - urls (list[str], required): pages to fetch, at most 10
            - extract_type (str): "text" (default), "links", or "structured"

    Returns:
        {"success": True, "data": {"results", "total", "successful"}} on
        success; {"success": False, "error": <message>} for bad input.
    """
    urls = arguments["urls"]
    extract_type = arguments.get("extract_type", "text")

    if not urls:
        return {"success": False, "error": "URLs list is required"}

    if len(urls) > 10:
        # Hard cap so a single tool call stays bounded in time and output size.
        return {"success": False, "error": "Maximum 10 pages allowed"}

    service = FetchService(timeout=10)
    results = service.fetch_batch(urls, extract_type)

    # Per-page failures come back as dicts containing an "error" key.
    successful = sum(1 for r in results if "error" not in r)
    return {
        "success": True,
        "data": {
            "results": results,
            "total": len(results),
            "successful": successful
        }
    }

View File

@ -156,7 +156,7 @@ class ToolExecutor:
"tool_call_id": call_id,
"role": "tool",
"name": name,
"content": json.dumps(result)
"content": json.dumps(result, ensure_ascii=False)
}
def _create_error_result(self, call_id: str, name: str, error: str) -> Dict[str, Any]:
@ -165,7 +165,7 @@ class ToolExecutor:
"tool_call_id": call_id,
"role": "tool",
"name": name,
"content": json.dumps({"success": False, "error": error})
"content": json.dumps({"success": False, "error": error}, ensure_ascii=False)
}
def clear_cache(self) -> None:

247
luxx/tools/services.py Normal file
View File

@ -0,0 +1,247 @@
"""Tool helper services"""
import re
import httpx
from urllib.parse import parse_qs, urlparse
from typing import List
from concurrent.futures import ThreadPoolExecutor, as_completed
class SearchService:
    """Search service backed by DuckDuckGo's HTML endpoint (no API key)."""

    def __init__(self, engine: str = "duckduckgo"):
        # Only "duckduckgo" is implemented; kept as a parameter so other
        # engines can be plugged in later.
        self.engine = engine

    def search(
        self,
        query: str,
        max_results: int = 5,
        region: str = "cn-zh"
    ) -> List[dict]:
        """
        Execute a search.

        Args:
            query: Search keywords
            max_results: Max result count
            region: Region/locale code forwarded to the engine (e.g. "cn-zh")

        Returns:
            List of {"title", "url", "snippet"} dicts; empty list on
            network failure.

        Raises:
            ValueError: If the configured engine is not supported.
        """
        if self.engine == "duckduckgo":
            return self._search_duckduckgo(query, max_results, region)
        raise ValueError(f"Unsupported search engine: {self.engine}")

    def _search_duckduckgo(
        self,
        query: str,
        max_results: int,
        region: str
    ) -> List[dict]:
        """Scrape DuckDuckGo's HTML search page and parse result entries."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "text/html,application/xhtml+xml",
        }
        from urllib.parse import quote
        encoded_query = quote(query)
        url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
        # Fix: `region` was previously accepted but never used. DuckDuckGo's
        # HTML endpoint takes the region/locale via the "kl" query parameter.
        if region:
            url += f"&kl={quote(region)}"

        try:
            resp = httpx.get(url, headers=headers, timeout=15, follow_redirects=True)
            resp.raise_for_status()
        except Exception:
            # Best-effort: network errors yield an empty result list rather
            # than propagating to the tool caller.
            return []

        from bs4 import BeautifulSoup
        soup = BeautifulSoup(resp.text, "html.parser")

        results = []
        for result in soup.select(".result")[:max_results]:
            title_elem = result.select_one(".result__title a")
            snippet_elem = result.select_one(".result__snippet")
            if title_elem:
                raw_url = title_elem.get("href", "")
                # DuckDuckGo wraps result links in a redirect; the real
                # target is carried in the "uddg" query parameter.
                if "uddg=" in raw_url:
                    parsed = urlparse(raw_url)
                    params = parse_qs(parsed.query)
                    clean_url = params.get("uddg", [raw_url])[0]
                else:
                    clean_url = raw_url
                results.append({
                    "title": title_elem.get_text(strip=True),
                    "url": clean_url,
                    "snippet": snippet_elem.get_text(strip=True) if snippet_elem else ""
                })
        return results
class FetchService:
    """Page fetch service built on httpx, with optional batch concurrency."""

    def __init__(self, timeout: float = 15.0):
        # Per-request timeout in seconds.
        self.timeout = timeout
        # Browser-like UA: some sites reject default HTTP-client UAs.
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )

    def fetch(
        self,
        url: str,
        extract_type: str = "text"
    ) -> dict:
        """
        Fetch a single page.

        Args:
            url: Page URL ("https://" is prepended when the scheme is missing)
            extract_type: Extract type (text, links, structured)

        Returns:
            Extraction result dict, or {"error": ..., "url": ...} on failure.
        """
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        try:
            resp = httpx.get(
                url,
                timeout=self.timeout,
                follow_redirects=True,
                headers={"User-Agent": self.user_agent}
            )
            resp.raise_for_status()
        except httpx.TimeoutException:
            return {"error": "Request timeout", "url": url}
        except Exception as e:
            return {"error": str(e), "url": url}

        html = resp.text
        extractor = ContentExtractor(html)

        if extract_type == "text":
            return {
                "url": url,
                "title": extractor.extract_title(),
                "text": extractor.extract_text()[:15000]  # cap payload size
            }
        elif extract_type == "links":
            return {
                "url": url,
                "links": extractor.extract_links()
            }
        else:
            return extractor.extract_structured(url)

    def fetch_batch(
        self,
        urls: List[str],
        extract_type: str = "text",
        max_concurrent: int = 5
    ) -> List[dict]:
        """
        Batch fetch pages concurrently.

        Args:
            urls: URL list
            extract_type: Extract type
            max_concurrent: Max concurrent requests (clamped to 1-5, default 5)

        Returns:
            Result list (same order as input URLs)
        """
        # Zero or one URL needs no thread pool.
        if len(urls) <= 1:
            return [self.fetch(url, extract_type) for url in urls]

        max_concurrent = min(max(max_concurrent, 1), 5)
        results = [None] * len(urls)

        with ThreadPoolExecutor(max_workers=max_concurrent) as pool:
            futures = {
                pool.submit(self.fetch, url, extract_type): i
                for i, url in enumerate(urls)
            }
            for future in as_completed(futures):
                idx = futures[future]
                try:
                    results[idx] = future.result()
                except Exception as e:
                    # Fix: include the URL so unexpected-failure entries match
                    # the {"error", "url"} shape fetch() returns.
                    results[idx] = {"error": str(e), "url": urls[idx]}
        return results
class ContentExtractor:
    """HTML content extractor; the BeautifulSoup parse happens lazily."""

    def __init__(self, html: str):
        self.html = html
        self._soup = None  # populated on first access of `soup`

    @property
    def soup(self):
        # Defer both the bs4 import and the parse until actually needed.
        if self._soup is None:
            from bs4 import BeautifulSoup
            self._soup = BeautifulSoup(self.html, "html.parser")
        return self._soup

    def extract_title(self) -> str:
        """Return the page <title> text, or "" when absent."""
        title_tag = self.soup.title
        if not title_tag:
            return ""
        return title_tag.string or ""

    def extract_text(self) -> str:
        """Return the page's plain text with boilerplate elements removed."""
        # Strip non-content elements before flattening to text.
        for noise in self.soup(["script", "style", "nav", "footer", "header", "aside"]):
            noise.decompose()
        flattened = self.soup.get_text(separator="\n", strip=True)
        # Collapse runs of 3+ newlines down to a single blank line.
        return re.sub(r"\n{3,}", "\n\n", flattened)

    def extract_links(self, max_count: int = 50) -> List[dict]:
        """Return up to max_count {"text", "url"} anchors, skipping non-navigational hrefs."""
        collected = []
        for anchor in self.soup.find_all("a", href=True):
            label = anchor.get_text(strip=True)
            target = anchor["href"]
            if not label or not target:
                continue
            if target.startswith(("#", "javascript:", "mailto:", "tel:")):
                continue
            collected.append({"text": label, "url": target})
            if len(collected) >= max_count:
                break
        return collected

    def extract_structured(self, url: str = "") -> dict:
        """Return title, meta description, truncated text, and top links."""
        doc = self.soup

        page_title = ""
        if doc.title:
            page_title = doc.title.string or ""

        meta_desc = doc.find("meta", attrs={"name": "description"})
        description = meta_desc.get("content", "") if meta_desc else ""

        return {
            "url": url,
            "title": page_title.strip(),
            "description": description.strip(),
            "text": self.extract_text()[:5000],
            "links": self.extract_links(20)
        }

View File

@ -2,7 +2,6 @@
name = "luxx"
version = "1.0.0"
description = "luxx - FastAPI + SQLAlchemy"
readme = "docs/README.md"
requires-python = ">=3.10"
dependencies = [
@ -19,6 +18,7 @@ dependencies = [
"requests>=2.31.0",
"beautifulsoup4>=4.12.3",
"lxml>=5.1.0",
"httpx>=0.26.0",
"pyyaml>=6.0.1",
"shortuuid>=1.0.11",
"pydantic>=2.5.0",
@ -34,3 +34,6 @@ dev = [
"black>=24.0.0",
"ruff>=0.1.0",
]
[tool.setuptools]
packages = ["luxx"]

View File