feat: 完善爬虫功能

This commit is contained in:
ViperEkura 2026-04-13 08:38:12 +08:00
parent 805f8c86da
commit a84b8617a6
17 changed files with 422 additions and 172 deletions

2
.gitignore vendored
View File

@ -9,7 +9,7 @@
!README.md !README.md
!.gitignore !.gitignore
!luxx/**/*.py !*.py
!asserts/**/*.md !asserts/**/*.md
# Dashboard # Dashboard

View File

@ -5,9 +5,13 @@
- **框架**: FastAPI 0.109+ - **框架**: FastAPI 0.109+
- **数据库**: SQLAlchemy 2.0+ - **数据库**: SQLAlchemy 2.0+
- **认证**: JWT (PyJWT) - **认证**: JWT (PyJWT)
- **HTTP客户端**: httpx - **HTTP客户端**: httpx, requests
- **配置**: YAML (PyYAML) - **配置**: YAML (PyYAML)
- **代码执行**: Python 原生执行 - **代码执行**: Python 原生执行
- **网页爬虫**:
- `httpx` - HTTP 客户端
- `beautifulsoup4` - HTML 解析
- `lxml` - XML/HTML 解析器
## 目录结构 ## 目录结构
@ -36,6 +40,7 @@ luxx/
│ ├── crawler.py # 网页爬虫 │ ├── crawler.py # 网页爬虫
│ ├── data.py # 数据处理 │ ├── data.py # 数据处理
│ └── weather.py # 天气查询 │ └── weather.py # 天气查询
│ └── services.py # 工具服务层
└── utils/ # 工具函数 └── utils/ # 工具函数
└── helpers.py └── helpers.py
``` ```
@ -205,7 +210,9 @@ classDiagram
|------|------|------| |------|------|------|
| `python_execute` | 执行 Python 代码 | 支持 print 输出、变量访问 | | `python_execute` | 执行 Python 代码 | 支持 print 输出、变量访问 |
| `python_eval` | 计算表达式 | 快速求值 | | `python_eval` | 计算表达式 | 快速求值 |
| `web_crawl` | 网页抓取 | BeautifulSoup + httpx | | `web_search` | DuckDuckGo HTML | DuckDuckGo HTML 搜索 |
| `web_fetch` | 网页抓取 | httpx + BeautifulSoup，支持 text/links/structured |
| `batch_fetch` | 批量抓取 | 并发获取多个页面 |
| `get_weather` | 天气查询 | 支持城市名查询 | | `get_weather` | 天气查询 | 支持城市名查询 |
| `process_data` | 数据处理 | JSON 转换、格式化等 | | `process_data` | 数据处理 | JSON 转换、格式化等 |

View File

@ -7,7 +7,7 @@ app:
database: database:
type: sqlite type: sqlite
url: sqlite:///../chat.db url: sqlite:///./chat.db
llm: llm:
provider: deepseek provider: deepseek

View File

@ -66,7 +66,9 @@ const renderedContent = computed(() => {
function formatTime(time) { function formatTime(time) {
if (!time) return '' if (!time) return ''
return new Date(time).toLocaleTimeString('zh-CN', { hour: '2-digit', minute: '2-digit' }) const date = new Date(time)
// 使
return date.toLocaleTimeString('zh-CN', { hour: '2-digit', minute: '2-digit' })
} }
function copyContent() { function copyContent() {

View File

@ -9,10 +9,11 @@
<span class="step-label">思考中</span> <span class="step-label">思考中</span>
<span class="step-brief">{{ item.brief || '正在思考...' }}</span> <span class="step-brief">{{ item.brief || '正在思考...' }}</span>
<span v-if="streaming && item.key === lastThinkingKey" class="loading-dots">...</span> <span v-if="streaming && item.key === lastThinkingKey" class="loading-dots">...</span>
<span v-else-if="item.content && item.content.length > 1024" class="truncate-hint">已截断</span>
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span> <span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
</div> </div>
<div v-if="expandedKeys.has(item.key)" class="step-content"> <div v-if="expandedKeys.has(item.key)" class="step-content">
<div class="thinking-text">{{ item.content }}</div> <div class="thinking-text">{{ item.displayContent }}</div>
</div> </div>
</div> </div>
@ -25,6 +26,7 @@
<span v-if="item.loading" class="loading-dots">...</span> <span v-if="item.loading" class="loading-dots">...</span>
<span v-else-if="item.isSuccess === true" class="step-badge success">成功</span> <span v-else-if="item.isSuccess === true" class="step-badge success">成功</span>
<span v-else-if="item.isSuccess === false" class="step-badge error">失败</span> <span v-else-if="item.isSuccess === false" class="step-badge error">失败</span>
<span v-if="item.fullResult && item.fullResult.length > 1024" class="truncate-hint">已截断</span>
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span> <span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
</div> </div>
<div v-if="expandedKeys.has(item.key)" class="step-content"> <div v-if="expandedKeys.has(item.key)" class="step-content">
@ -34,7 +36,7 @@
</div> </div>
<div v-if="item.resultSummary || item.fullResult" class="tool-detail" style="margin-top: 8px;"> <div v-if="item.resultSummary || item.fullResult" class="tool-detail" style="margin-top: 8px;">
<span class="detail-label">结果</span> <span class="detail-label">结果</span>
<pre>{{ item.fullResult || item.resultSummary }}</pre> <pre>{{ item.displayResult }}</pre>
</div> </div>
</div> </div>
</div> </div>
@ -71,12 +73,14 @@ const allItems = computed(() => {
if (props.processSteps && props.processSteps.length > 0) { if (props.processSteps && props.processSteps.length > 0) {
for (const step of props.processSteps) { for (const step of props.processSteps) {
if (step.type === 'thinking') { if (step.type === 'thinking') {
const content = step.content || ''
items.push({ items.push({
key: step.id || `thinking-${step.index}`, key: step.id || `thinking-${step.index}`,
type: 'thinking', type: 'thinking',
index: step.index, index: step.index,
content: step.content || '', content: content,
brief: step.content ? step.content.slice(0, 50) + (step.content.length > 50 ? '...' : '') : '', displayContent: content.length > 1024 ? content.slice(0, 1024) + '\n\n[... 内容已截断 ...]' : content,
brief: content.slice(0, 50) + (content.length > 50 ? '...' : ''),
}) })
} else if (step.type === 'tool_call') { } else if (step.type === 'tool_call') {
items.push({ items.push({
@ -97,12 +101,15 @@ const allItems = computed(() => {
const toolId = step.id_ref || step.id const toolId = step.id_ref || step.id
const match = items.findLast(it => it.type === 'tool_call' && it.id === toolId) const match = items.findLast(it => it.type === 'tool_call' && it.id === toolId)
if (match) { if (match) {
match.resultSummary = step.content ? step.content.slice(0, 200) : '' const resultContent = step.content || ''
match.fullResult = step.content || '' match.resultSummary = resultContent.slice(0, 200)
match.fullResult = resultContent
match.displayResult = resultContent.length > 1024 ? resultContent.slice(0, 1024) + '\n\n[... 结果已截断 ...]' : resultContent
match.isSuccess = step.success !== false match.isSuccess = step.success !== false
match.loading = false match.loading = false
} else { } else {
// tool_call // tool_call
const placeholderContent = step.content || ''
items.push({ items.push({
key: `result-${step.id || step.index}`, key: `result-${step.id || step.index}`,
type: 'tool_call', type: 'tool_call',
@ -113,8 +120,9 @@ const allItems = computed(() => {
brief: step.name || '工具结果', brief: step.name || '工具结果',
loading: false, loading: false,
isSuccess: true, isSuccess: true,
resultSummary: step.content ? step.content.slice(0, 200) : '', resultSummary: placeholderContent.slice(0, 200),
fullResult: step.content || '' fullResult: placeholderContent,
displayResult: placeholderContent.length > 1024 ? placeholderContent.slice(0, 1024) + '\n\n[... 结果已截断 ...]' : placeholderContent
}) })
} }
} else if (step.type === 'text') { } else if (step.type === 'text') {
@ -280,6 +288,15 @@ const sparkleIcon = `<svg viewBox="0 0 24 24" width="14" height="14" fill="none"
color: var(--success-color); color: var(--success-color);
} }
.truncate-hint {
font-size: 10px;
padding: 2px 6px;
background: var(--warning-bg);
color: var(--warning-color);
border-radius: 4px;
margin-left: 4px;
}
.step-badge.error { .step-badge.error {
background: var(--danger-bg); background: var(--danger-bg);
color: var(--danger-color); color: var(--danger-color);

View File

@ -41,6 +41,8 @@
/* 状态颜色 */ /* 状态颜色 */
--success-color: #059669; --success-color: #059669;
--success-bg: rgba(16, 185, 129, 0.1); --success-bg: rgba(16, 185, 129, 0.1);
--warning-color: #d97706;
--warning-bg: rgba(217, 119, 6, 0.1);
--danger-color: #ef4444; --danger-color: #ef4444;
--danger-bg: rgba(239, 68, 68, 0.08); --danger-bg: rgba(239, 68, 68, 0.08);
@ -112,6 +114,8 @@
--success-color: #34d399; --success-color: #34d399;
--success-bg: rgba(52, 211, 153, 0.15); --success-bg: rgba(52, 211, 153, 0.15);
--warning-color: #fbbf24;
--warning-bg: rgba(251, 191, 36, 0.15);
--danger-color: #f87171; --danger-color: #f87171;
--danger-bg: rgba(248, 113, 113, 0.15); --danger-bg: rgba(248, 113, 113, 0.15);

View File

@ -52,9 +52,9 @@ const blockMathExtension = {
} }
marked.use({ marked.use({
extensions: [blockMathExtension, mathExtension], gfm: true,
breaks: true, breaks: true,
gfm: true extensions: [blockMathExtension, mathExtension]
}) })
export function renderMarkdown(text) { export function renderMarkdown(text) {

View File

@ -24,7 +24,7 @@
</div> </div>
</div> </div>
<div ref="messagesContainer" class="messages-container"> <div ref="messagesContainer" class="messages-container" @scroll="handleScroll">
<div v-if="loading" class="load-more-top"> <div v-if="loading" class="load-more-top">
<span>加载中...</span> <span>加载中...</span>
</div> </div>
@ -106,6 +106,7 @@ const sending = ref(false)
const streamingMessage = ref(null) const streamingMessage = ref(null)
const messagesContainer = ref(null) const messagesContainer = ref(null)
const textareaRef = ref(null) const textareaRef = ref(null)
const autoScroll = ref(true)
const conversationId = ref(route.params.id) const conversationId = ref(route.params.id)
const conversationTitle = ref('') const conversationTitle = ref('')
@ -128,6 +129,7 @@ function onKeydown(e) {
} }
const loadMessages = async () => { const loadMessages = async () => {
autoScroll.value = true
loading.value = true loading.value = true
try { try {
const res = await messagesAPI.list(conversationId.value) const res = await messagesAPI.list(conversationId.value)
@ -191,6 +193,7 @@ const sendMessage = async () => {
{ conversation_id: conversationId.value, content }, { conversation_id: conversationId.value, content },
{ {
onProcessStep: (step) => { onProcessStep: (step) => {
autoScroll.value = true //
if (!streamingMessage.value) return if (!streamingMessage.value) return
// id // id
const idx = streamingMessage.value.process_steps.findIndex(s => s.id === step.id) const idx = streamingMessage.value.process_steps.findIndex(s => s.id === step.id)
@ -202,6 +205,7 @@ const sendMessage = async () => {
}, },
onDone: () => { onDone: () => {
// //
autoScroll.value = true
if (streamingMessage.value) { if (streamingMessage.value) {
messages.value.push({ messages.value.push({
...streamingMessage.value, ...streamingMessage.value,
@ -230,6 +234,7 @@ const sendMessage = async () => {
} }
const scrollToBottom = () => { const scrollToBottom = () => {
if (!autoScroll.value) return
nextTick(() => { nextTick(() => {
if (messagesContainer.value) { if (messagesContainer.value) {
messagesContainer.value.scrollTo({ messagesContainer.value.scrollTo({
@ -240,6 +245,15 @@ const scrollToBottom = () => {
}) })
} }
//
const handleScroll = () => {
if (!messagesContainer.value) return
const { scrollTop, scrollHeight, clientHeight } = messagesContainer.value
const distanceToBottom = scrollHeight - scrollTop - clientHeight
// 50px
autoScroll.value = distanceToBottom < 50
}
// //
watch(() => streamingMessage.value?.process_steps?.length, () => { watch(() => streamingMessage.value?.process_steps?.length, () => {
if (streamingMessage.value) { if (streamingMessage.value) {

View File

@ -102,6 +102,11 @@
<label>模型名称</label> <label>模型名称</label>
<input v-model="form.default_model" placeholder="deepseek-chat / gpt-4" required /> <input v-model="form.default_model" placeholder="deepseek-chat / gpt-4" required />
</div> </div>
<div class="form-group">
<label>最大 Tokens</label>
<input v-model.number="form.max_tokens" type="number" placeholder="8192" min="1" />
<span class="hint">单次回复最大 token 默认 8192</span>
</div>
<div class="form-group"> <div class="form-group">
<label class="switch-card" :class="{ active: form.is_default }"> <label class="switch-card" :class="{ active: form.is_default }">
<div class="switch-content"> <div class="switch-content">
@ -201,7 +206,7 @@ const testResult = ref(null)
const formError = ref('') const formError = ref('')
const form = ref({ const form = ref({
name: '', base_url: '', api_key: '', default_model: '', is_default: false name: '', base_url: '', api_key: '', default_model: '', max_tokens: 8192, is_default: false
}) })
const fetchProviders = async () => { const fetchProviders = async () => {
@ -218,7 +223,7 @@ const fetchProviders = async () => {
const closeModal = () => { const closeModal = () => {
showModal.value = false showModal.value = false
editing.value = null editing.value = null
form.value = { name: '', base_url: '', api_key: '', default_model: '', is_default: false } form.value = { name: '', base_url: '', api_key: '', default_model: '', max_tokens: 8192, is_default: false }
formError.value = '' formError.value = ''
} }
@ -232,6 +237,7 @@ const editProvider = async (p) => {
base_url: res.data.base_url, base_url: res.data.base_url,
api_key: res.data.api_key || '', api_key: res.data.api_key || '',
default_model: res.data.default_model, default_model: res.data.default_model,
max_tokens: res.data.max_tokens || 8192,
is_default: res.data.is_default is_default: res.data.is_default
} }
} }
@ -381,6 +387,7 @@ input:checked + .slider:before { transform: translateX(22px); }
.switch-card input:checked + .slider { background-color: var(--accent); } .switch-card input:checked + .slider { background-color: var(--accent); }
.switch-card input:checked + .slider:before { transform: translateX(22px); } .switch-card input:checked + .slider:before { transform: translateX(22px); }
.modal-actions { display: flex; justify-content: flex-end; gap: 1rem; margin-top: 1.5rem; } .modal-actions { display: flex; justify-content: flex-end; gap: 1rem; margin-top: 1.5rem; }
.form-group .hint { font-size: 0.85rem; color: var(--text); margin-top: 4px; display: block; }
.spinner { width: 48px; height: 48px; border: 4px solid var(--border); border-top-color: var(--accent); border-radius: 50%; animation: spin 1s linear infinite; margin: 0 auto 1rem; } .spinner { width: 48px; height: 48px; border: 4px solid var(--border); border-top-color: var(--accent); border-radius: 50%; animation: spin 1s linear infinite; margin: 0 auto 1rem; }
@keyframes spin { to { transform: rotate(360deg); } } @keyframes spin { to { transform: rotate(360deg); } }
</style> </style>

View File

@ -7,6 +7,10 @@ from sqlalchemy.orm import Mapped, mapped_column, relationship
from luxx.database import Base from luxx.database import Base
def local_now():
    """Return the current naive local datetime.

    Used as the SQLAlchemy ``default``/``onupdate`` callable for timestamp
    columns (replacing ``datetime.utcnow``), so stored times follow the
    server's local clock. NOTE(review): the value is timezone-naive —
    confirm all readers assume the same local zone.
    """
    return datetime.now()
class LLMProvider(Base): class LLMProvider(Base):
"""LLM Provider configuration model""" """LLM Provider configuration model"""
__tablename__ = "llm_providers" __tablename__ = "llm_providers"
@ -18,10 +22,11 @@ class LLMProvider(Base):
base_url: Mapped[str] = mapped_column(String(500), nullable=False) base_url: Mapped[str] = mapped_column(String(500), nullable=False)
api_key: Mapped[str] = mapped_column(String(500), nullable=False) api_key: Mapped[str] = mapped_column(String(500), nullable=False)
default_model: Mapped[str] = mapped_column(String(100), nullable=False, default="gpt-4") default_model: Mapped[str] = mapped_column(String(100), nullable=False, default="gpt-4")
max_tokens: Mapped[int] = mapped_column(Integer, default=8192) # 默认 8192
is_default: Mapped[bool] = mapped_column(Boolean, default=False) is_default: Mapped[bool] = mapped_column(Boolean, default=False)
enabled: Mapped[bool] = mapped_column(Boolean, default=True) enabled: Mapped[bool] = mapped_column(Boolean, default=True)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
# Relationships # Relationships
user: Mapped["User"] = relationship("User", backref="llm_providers") user: Mapped["User"] = relationship("User", backref="llm_providers")
@ -35,6 +40,7 @@ class LLMProvider(Base):
"provider_type": self.provider_type, "provider_type": self.provider_type,
"base_url": self.base_url, "base_url": self.base_url,
"default_model": self.default_model, "default_model": self.default_model,
"max_tokens": self.max_tokens,
"is_default": self.is_default, "is_default": self.is_default,
"enabled": self.enabled, "enabled": self.enabled,
"created_at": self.created_at.isoformat() if self.created_at else None, "created_at": self.created_at.isoformat() if self.created_at else None,
@ -53,8 +59,8 @@ class Project(Base):
user_id: Mapped[int] = mapped_column(Integer, ForeignKey("users.id"), nullable=False) user_id: Mapped[int] = mapped_column(Integer, ForeignKey("users.id"), nullable=False)
name: Mapped[str] = mapped_column(String(255), nullable=False) name: Mapped[str] = mapped_column(String(255), nullable=False)
description: Mapped[Optional[str]] = mapped_column(Text, nullable=True) description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
# Relationships # Relationships
user: Mapped["User"] = relationship("User", backref="projects") user: Mapped["User"] = relationship("User", backref="projects")
@ -70,7 +76,7 @@ class User(Base):
password_hash: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) password_hash: Mapped[Optional[str]] = mapped_column(String(255), nullable=True)
role: Mapped[str] = mapped_column(String(20), default="user") role: Mapped[str] = mapped_column(String(20), default="user")
is_active: Mapped[bool] = mapped_column(Boolean, default=True) is_active: Mapped[bool] = mapped_column(Boolean, default=True)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
# Relationships # Relationships
conversations: Mapped[List["Conversation"]] = relationship( conversations: Mapped[List["Conversation"]] = relationship(
@ -102,8 +108,8 @@ class Conversation(Base):
temperature: Mapped[float] = mapped_column(Float, default=0.7) temperature: Mapped[float] = mapped_column(Float, default=0.7)
max_tokens: Mapped[int] = mapped_column(Integer, default=2000) max_tokens: Mapped[int] = mapped_column(Integer, default=2000)
thinking_enabled: Mapped[bool] = mapped_column(Boolean, default=False) thinking_enabled: Mapped[bool] = mapped_column(Boolean, default=False)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
# Relationships # Relationships
user: Mapped["User"] = relationship("User", back_populates="conversations") user: Mapped["User"] = relationship("User", back_populates="conversations")
@ -161,7 +167,7 @@ class Message(Base):
role: Mapped[str] = mapped_column(String(16), nullable=False) # user, assistant, system, tool role: Mapped[str] = mapped_column(String(16), nullable=False) # user, assistant, system, tool
content: Mapped[str] = mapped_column(Text, nullable=False, default="") content: Mapped[str] = mapped_column(Text, nullable=False, default="")
token_count: Mapped[int] = mapped_column(Integer, default=0) token_count: Mapped[int] = mapped_column(Integer, default=0)
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
# Relationships # Relationships
conversation: Mapped["Conversation"] = relationship("Conversation", back_populates="messages") conversation: Mapped["Conversation"] = relationship("Conversation", back_populates="messages")

View File

@ -79,8 +79,8 @@ def send_message(
) )
db.add(user_message) db.add(user_message)
from datetime import datetime from datetime import datetime, timezone, timedelta
conversation.updated_at = datetime.utcnow() conversation.updated_at = datetime.now(timezone(timedelta(hours=8)))
response = chat_service.non_stream_response( response = chat_service.non_stream_response(
conversation=conversation, conversation=conversation,
@ -133,7 +133,7 @@ async def stream_message(
token_count=len(data.content) // 4 token_count=len(data.content) // 4
) )
db.add(user_message) db.add(user_message)
conversation.updated_at = datetime.utcnow() conversation.updated_at = datetime.now()
db.commit() db.commit()
async def event_generator(): async def event_generator():

View File

@ -20,7 +20,8 @@ def _sse_event(event: str, data: dict) -> str:
def get_llm_client(conversation: Conversation = None): def get_llm_client(conversation: Conversation = None):
"""Get LLM client, optionally using conversation's provider""" """Get LLM client, optionally using conversation's provider. Returns (client, max_tokens)"""
max_tokens = None
if conversation and conversation.provider_id: if conversation and conversation.provider_id:
from luxx.models import LLMProvider from luxx.models import LLMProvider
from luxx.database import SessionLocal from luxx.database import SessionLocal
@ -28,18 +29,19 @@ def get_llm_client(conversation: Conversation = None):
try: try:
provider = db.query(LLMProvider).filter(LLMProvider.id == conversation.provider_id).first() provider = db.query(LLMProvider).filter(LLMProvider.id == conversation.provider_id).first()
if provider: if provider:
max_tokens = provider.max_tokens
client = LLMClient( client = LLMClient(
api_key=provider.api_key, api_key=provider.api_key,
api_url=provider.base_url, api_url=provider.base_url,
model=provider.default_model model=provider.default_model
) )
return client return client, max_tokens
finally: finally:
db.close() db.close()
# Fallback to global config # Fallback to global config
client = LLMClient() client = LLMClient()
return client return client, max_tokens
class ChatService: class ChatService:
@ -112,8 +114,10 @@ class ChatService:
tools = registry.list_all() if tools_enabled else None tools = registry.list_all() if tools_enabled else None
llm = get_llm_client(conversation) llm, provider_max_tokens = get_llm_client(conversation)
model = conversation.model or llm.default_model or "gpt-4" model = conversation.model or llm.default_model or "gpt-4"
# 使用 provider 的 max_tokens如果 conversation 有自己的 max_tokens 则覆盖
max_tokens = conversation.max_tokens if hasattr(conversation, 'max_tokens') and conversation.max_tokens else provider_max_tokens
# State tracking # State tracking
all_steps = [] all_steps = []
@ -146,7 +150,7 @@ class ChatService:
messages=messages, messages=messages,
tools=tools, tools=tools,
temperature=conversation.temperature, temperature=conversation.temperature,
max_tokens=conversation.max_tokens max_tokens=max_tokens or 8192
): ):
# Parse SSE line # Parse SSE line
# Format: "event: xxx\ndata: {...}\n\n" # Format: "event: xxx\ndata: {...}\n\n"

View File

@ -1,14 +1,11 @@
"""Web crawler tools""" """Crawler related tools"""
import requests
from typing import Dict, Any, List, Optional
from bs4 import BeautifulSoup
from luxx.tools.factory import tool from luxx.tools.factory import tool
from luxx.tools.services import SearchService, FetchService
@tool( @tool(
name="web_search", name="web_search",
description="Search the internet for information using web search", description="Search the internet for information. Use when you need to find latest news or answer questions.",
parameters={ parameters={
"type": "object", "type": "object",
"properties": { "properties": {
@ -18,7 +15,7 @@ from luxx.tools.factory import tool
}, },
"max_results": { "max_results": {
"type": "integer", "type": "integer",
"description": "Maximum number of results to return", "description": "Number of results to return, default 5",
"default": 5 "default": 5
} }
}, },
@ -26,164 +23,106 @@ from luxx.tools.factory import tool
}, },
category="crawler" category="crawler"
) )
def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]: def web_search(arguments: dict) -> dict:
""" """
Execute web search Web search tool using DuckDuckGo
Note: This is a placeholder implementation, real usage requires integrating with actual search APIs
such as: Google Custom Search, DuckDuckGo, SerpAPI, etc.
""" """
query = arguments.get("query", "") query = arguments["query"]
max_results = arguments.get("max_results", 5) max_results = arguments.get("max_results", 5)
if not query: service = SearchService()
return {"success": False, "error": "Query is required"} results = service.search(query, max_results)
# Simulated search results if not results:
# Real implementation should integrate with actual search API return {"success": True, "data": {"query": query, "results": []}, "message": "No results found"}
return {
"success": True, return {"success": True, "data": {"query": query, "results": results}}
"data": {
"query": query,
"results": [
{
"title": f"Result for '{query}' - Example {i+1}",
"url": f"https://example.com/result_{i+1}",
"snippet": f"This is a sample search result for the query '{query}'. " * 3
}
for i in range(min(max_results, 5))
]
}
}
@tool( @tool(
name="web_fetch", name="web_fetch",
description="Fetch and parse content from a web page", description="Fetch content from a webpage. Use when user needs detailed information from a page.",
parameters={ parameters={
"type": "object", "type": "object",
"properties": { "properties": {
"url": { "url": {
"type": "string", "type": "string",
"description": "URL of the web page to fetch" "description": "URL of the webpage to fetch"
}, },
"extract_text": { "extract_type": {
"type": "boolean", "type": "string",
"description": "Whether to extract text content only", "description": "Extraction type: text, links, or structured",
"default": True "enum": ["text", "links", "structured"],
"default": "text"
} }
}, },
"required": ["url"] "required": ["url"]
}, },
category="crawler" category="crawler"
) )
def web_fetch(arguments: Dict[str, Any]) -> Dict[str, Any]: def web_fetch(arguments: dict) -> dict:
"""Fetch and parse web page content""" """
url = arguments.get("url", "") Page fetch tool
extract_text = arguments.get("extract_text", True) """
url = arguments["url"]
extract_type = arguments.get("extract_type", "text")
if not url: if not url:
return {"success": False, "error": "URL is required"} return {"success": False, "error": "URL is required"}
# Simple URL validation service = FetchService(timeout=15)
if not url.startswith(("http://", "https://")): result = service.fetch(url, extract_type)
url = "https://" + url
try: if "error" in result:
headers = { return {"success": False, "error": result["error"]}
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
if extract_text: return {"success": True, "data": result}
soup = BeautifulSoup(response.text, "html.parser")
# Remove script and style tags
for tag in soup(["script", "style"]):
tag.decompose()
text = soup.get_text(separator="\n", strip=True)
# Clean up extra blank lines
lines = [line.strip() for line in text.split("\n") if line.strip()]
text = "\n".join(lines)
return {
"success": True,
"data": {
"url": url,
"title": soup.title.string if soup.title else "",
"content": text[:10000] # Limit content length
}
}
else:
return {
"success": True,
"data": {
"url": url,
"html": response.text[:50000] # Limit HTML length
}
}
except requests.RequestException as e:
return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}
@tool( @tool(
name="extract_links", name="batch_fetch",
description="Extract all links from a web page", description="Batch fetch multiple webpages. Use when you need to get content from multiple pages.",
parameters={ parameters={
"type": "object", "type": "object",
"properties": { "properties": {
"url": { "urls": {
"type": "string", "type": "array",
"description": "URL of the web page" "items": {"type": "string"},
"description": "List of URLs to fetch"
}, },
"max_links": { "extract_type": {
"type": "integer", "type": "string",
"description": "Maximum number of links to extract", "enum": ["text", "links", "structured"],
"default": 20 "default": "text"
} }
}, },
"required": ["url"] "required": ["urls"]
}, },
category="crawler" category="crawler"
) )
def extract_links(arguments: Dict[str, Any]) -> Dict[str, Any]: def batch_fetch(arguments: dict) -> dict:
"""Extract all links from a web page""" """
url = arguments.get("url", "") Batch fetch tool
max_links = arguments.get("max_links", 20) """
urls = arguments["urls"]
extract_type = arguments.get("extract_type", "text")
if not url: if not urls:
return {"success": False, "error": "URL is required"} return {"success": False, "error": "URLs list is required"}
if not url.startswith(("http://", "https://")): if len(urls) > 10:
url = "https://" + url return {"success": False, "error": "Maximum 10 pages allowed"}
try: service = FetchService(timeout=10)
headers = { results = service.fetch_batch(urls, extract_type)
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
successful = sum(1 for r in results if "error" not in r)
return {
"success": True,
"data": {
"results": results,
"total": len(results),
"successful": successful
} }
response = requests.get(url, headers=headers, timeout=10) }
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
links = []
for a_tag in soup.find_all("a", href=True)[:max_links]:
href = a_tag["href"]
# Handle relative URLs
if href.startswith("/"):
from urllib.parse import urljoin
href = urljoin(url, href)
links.append({
"text": a_tag.get_text(strip=True) or href,
"url": href
})
return {
"success": True,
"data": {
"url": url,
"links": links
}
}
except requests.RequestException as e:
return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}

View File

@ -156,7 +156,7 @@ class ToolExecutor:
"tool_call_id": call_id, "tool_call_id": call_id,
"role": "tool", "role": "tool",
"name": name, "name": name,
"content": json.dumps(result) "content": json.dumps(result, ensure_ascii=False)
} }
def _create_error_result(self, call_id: str, name: str, error: str) -> Dict[str, Any]: def _create_error_result(self, call_id: str, name: str, error: str) -> Dict[str, Any]:
@ -165,7 +165,7 @@ class ToolExecutor:
"tool_call_id": call_id, "tool_call_id": call_id,
"role": "tool", "role": "tool",
"name": name, "name": name,
"content": json.dumps({"success": False, "error": error}) "content": json.dumps({"success": False, "error": error}, ensure_ascii=False)
} }
def clear_cache(self) -> None: def clear_cache(self) -> None:

247
luxx/tools/services.py Normal file
View File

@ -0,0 +1,247 @@
"""Tool helper services"""
import re
import httpx
from urllib.parse import parse_qs, urlparse
from typing import List
from concurrent.futures import ThreadPoolExecutor, as_completed
class SearchService:
    """Search service backed by DuckDuckGo's HTML endpoint (no API key)."""

    def __init__(self, engine: str = "duckduckgo"):
        # Only "duckduckgo" is implemented; kept as a parameter so other
        # engines can be plugged in later without changing callers.
        self.engine = engine

    def search(
        self,
        query: str,
        max_results: int = 5,
        region: str = "cn-zh"
    ) -> List[dict]:
        """
        Execute search

        Args:
            query: Search keywords
            max_results: Max result count
            region: Region/locale code forwarded to the engine
                (e.g. "cn-zh", "us-en")

        Returns:
            Search result list of ``{"title", "url", "snippet"}`` dicts;
            empty list on any network/HTTP failure.

        Raises:
            ValueError: If the configured engine is unsupported.
        """
        if self.engine == "duckduckgo":
            return self._search_duckduckgo(query, max_results, region)
        else:
            raise ValueError(f"Unsupported search engine: {self.engine}")

    def _search_duckduckgo(
        self,
        query: str,
        max_results: int,
        region: str
    ) -> List[dict]:
        """DuckDuckGo search via HTML"""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "text/html,application/xhtml+xml",
        }
        from urllib.parse import quote
        encoded_query = quote(query)
        # FIX: `region` was previously accepted but silently ignored; forward
        # it through DuckDuckGo's `kl` locale parameter so results respect it.
        url = f"https://html.duckduckgo.com/html/?q={encoded_query}&kl={quote(region)}"

        try:
            resp = httpx.get(url, headers=headers, timeout=15, follow_redirects=True)
            resp.raise_for_status()
        except Exception:
            # Best-effort contract: any transport/HTTP error yields no results.
            return []

        from bs4 import BeautifulSoup
        soup = BeautifulSoup(resp.text, "html.parser")

        results = []
        for result in soup.select(".result")[:max_results]:
            title_elem = result.select_one(".result__title a")
            snippet_elem = result.select_one(".result__snippet")
            if not title_elem:
                continue
            raw_url = title_elem.get("href", "")
            # DuckDuckGo wraps outbound links in a redirect URL carrying the
            # real target in the `uddg` query parameter — unwrap it.
            if "uddg=" in raw_url:
                parsed = urlparse(raw_url)
                params = parse_qs(parsed.query)
                clean_url = params.get("uddg", [raw_url])[0]
            else:
                clean_url = raw_url
            results.append({
                "title": title_elem.get_text(strip=True),
                "url": clean_url,
                "snippet": snippet_elem.get_text(strip=True) if snippet_elem else ""
            })
        return results
class FetchService:
    """Downloads web pages and extracts their content."""

    def __init__(self, timeout: float = 15.0):
        # Per-request timeout in seconds.
        self.timeout = timeout
        # Desktop Chrome UA string; some sites reject requests without one.
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )

    def fetch(
        self,
        url: str,
        extract_type: str = "text"
    ) -> dict:
        """
        Fetch a single page.

        Args:
            url: Page URL; "https://" is prepended when no scheme is given
            extract_type: Extract type (text, links, structured); any other
                value falls through to structured extraction

        Returns:
            Extraction result dict, or {"error": ..., "url": ...} on failure
        """
        target = url if url.startswith(("http://", "https://")) else "https://" + url
        try:
            response = httpx.get(
                target,
                timeout=self.timeout,
                follow_redirects=True,
                headers={"User-Agent": self.user_agent}
            )
            response.raise_for_status()
        except httpx.TimeoutException:
            return {"error": "Request timeout", "url": target}
        except Exception as exc:
            return {"error": str(exc), "url": target}

        extractor = ContentExtractor(response.text)
        if extract_type == "text":
            return {
                "url": target,
                "title": extractor.extract_title(),
                "text": extractor.extract_text()[:15000]
            }
        if extract_type == "links":
            return {"url": target, "links": extractor.extract_links()}
        return extractor.extract_structured(target)

    def fetch_batch(
        self,
        urls: List[str],
        extract_type: str = "text",
        max_concurrent: int = 5
    ) -> List[dict]:
        """
        Batch fetch pages concurrently.

        Args:
            urls: URL list
            extract_type: Extract type passed through to fetch()
            max_concurrent: Worker-thread cap, clamped to the range 1-5

        Returns:
            One result dict per input URL, in the same order
        """
        if len(urls) <= 1:
            # Nothing to parallelize for zero or one URL.
            return [self.fetch(u, extract_type) for u in urls]

        workers = max(1, min(max_concurrent, 5))
        ordered: List[dict] = [None] * len(urls)
        with ThreadPoolExecutor(max_workers=workers) as executor:
            pending = {
                executor.submit(self.fetch, u, extract_type): position
                for position, u in enumerate(urls)
            }
            for done in as_completed(pending):
                position = pending[done]
                try:
                    ordered[position] = done.result()
                except Exception as exc:
                    ordered[position] = {"error": str(exc)}
        return ordered
class ContentExtractor:
    """Extracts title, text and links from an HTML document via BeautifulSoup."""

    def __init__(self, html: str):
        # Raw HTML source; parsed lazily on first access to `soup`.
        self.html = html
        self._soup = None  # cached parse tree

    @property
    def soup(self):
        """Lazily parsed BeautifulSoup tree of the original HTML."""
        if self._soup is None:
            from bs4 import BeautifulSoup
            self._soup = BeautifulSoup(self.html, "html.parser")
        return self._soup

    def extract_title(self) -> str:
        """Extract page title ("" if absent or not a plain string)."""
        if self.soup.title:
            return self.soup.title.string or ""
        return ""

    def extract_text(self) -> str:
        """Extract plain text with boilerplate sections removed.

        FIX: previously this decomposed tags directly on the shared cached
        soup, so a later extract_links()/extract_title() on the same instance
        silently lost content inside nav/header/footer/etc. It now strips
        boilerplate on a fresh parse, leaving the cached tree untouched.
        """
        from bs4 import BeautifulSoup

        working = BeautifulSoup(self.html, "html.parser")
        # Remove script, style and boilerplate chrome before text extraction.
        for tag in working(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        text = working.get_text(separator="\n", strip=True)
        # Collapse runs of 3+ newlines down to a single blank line.
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text

    def extract_links(self, max_count: int = 50) -> List[dict]:
        """Extract up to max_count {"text", "url"} link dicts."""
        links = []
        for a in self.soup.find_all("a", href=True):
            text = a.get_text(strip=True)
            href = a["href"]
            # Skip anchors, JS pseudo-links and contact schemes.
            if text and href and not href.startswith(("#", "javascript:", "mailto:", "tel:")):
                links.append({"text": text, "url": href})
                if len(links) >= max_count:
                    break
        return links

    def extract_structured(self, url: str = "") -> dict:
        """Extract structured content: title, meta description, text, links."""
        soup = self.soup
        # Extract title
        title = ""
        if soup.title:
            title = soup.title.string or ""
        # Extract meta description
        description = ""
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            description = meta_desc.get("content", "")
        return {
            "url": url,
            "title": title.strip(),
            "description": description.strip(),
            "text": self.extract_text()[:5000],
            "links": self.extract_links(20)
        }

View File

@ -2,7 +2,6 @@
name = "luxx" name = "luxx"
version = "1.0.0" version = "1.0.0"
description = "luxx - FastAPI + SQLAlchemy" description = "luxx - FastAPI + SQLAlchemy"
readme = "docs/README.md"
requires-python = ">=3.10" requires-python = ">=3.10"
dependencies = [ dependencies = [
@ -19,6 +18,7 @@ dependencies = [
"requests>=2.31.0", "requests>=2.31.0",
"beautifulsoup4>=4.12.3", "beautifulsoup4>=4.12.3",
"lxml>=5.1.0", "lxml>=5.1.0",
"httpx>=0.26.0",
"pyyaml>=6.0.1", "pyyaml>=6.0.1",
"shortuuid>=1.0.11", "shortuuid>=1.0.11",
"pydantic>=2.5.0", "pydantic>=2.5.0",
@ -34,3 +34,6 @@ dev = [
"black>=24.0.0", "black>=24.0.0",
"ruff>=0.1.0", "ruff>=0.1.0",
] ]
[tool.setuptools]
packages = ["luxx"]

View File