feat: 完善爬虫功能
This commit is contained in:
parent
805f8c86da
commit
a84b8617a6
|
|
@ -9,7 +9,7 @@
|
|||
!README.md
|
||||
!.gitignore
|
||||
|
||||
!luxx/**/*.py
|
||||
!*.py
|
||||
!asserts/**/*.md
|
||||
|
||||
# Dashboard
|
||||
|
|
|
|||
|
|
@ -5,9 +5,13 @@
|
|||
- **框架**: FastAPI 0.109+
|
||||
- **数据库**: SQLAlchemy 2.0+
|
||||
- **认证**: JWT (PyJWT)
|
||||
- **HTTP客户端**: httpx
|
||||
- **HTTP客户端**: httpx, requests
|
||||
- **配置**: YAML (PyYAML)
|
||||
- **代码执行**: Python 原生执行
|
||||
- **网页爬虫**:
|
||||
- `httpx` - HTTP 客户端
|
||||
- `beautifulsoup4` - HTML 解析
|
||||
- `lxml` - XML/HTML 解析器
|
||||
|
||||
## 目录结构
|
||||
|
||||
|
|
@ -36,6 +40,7 @@ luxx/
|
|||
│ ├── crawler.py # 网页爬虫
|
||||
│ ├── data.py # 数据处理
|
||||
│ ├── weather.py # 天气查询
|
||||
│ └── services.py # 工具服务层
|
||||
└── utils/ # 工具函数
|
||||
└── helpers.py
|
||||
```
|
||||
|
|
@ -205,7 +210,9 @@ classDiagram
|
|||
|------|------|------|
|
||||
| `python_execute` | 执行 Python 代码 | 支持 print 输出、变量访问 |
|
||||
| `python_eval` | 计算表达式 | 快速求值 |
|
||||
| `web_crawl` | 网页抓取 | BeautifulSoup + httpx |
|
||||
| `web_search` | 网络搜索 | DuckDuckGo HTML 搜索 |
|
||||
| `web_fetch` | 网页抓取 | httpx + BeautifulSoup,支持 text/links/structured |
|
||||
| `batch_fetch` | 批量抓取 | 并发获取多个页面 |
|
||||
| `get_weather` | 天气查询 | 支持城市名查询 |
|
||||
| `process_data` | 数据处理 | JSON 转换、格式化等 |
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ app:
|
|||
|
||||
database:
|
||||
type: sqlite
|
||||
url: sqlite:///../chat.db
|
||||
url: sqlite:///./chat.db
|
||||
|
||||
llm:
|
||||
provider: deepseek
|
||||
|
|
|
|||
|
|
@ -66,7 +66,9 @@ const renderedContent = computed(() => {
|
|||
|
||||
function formatTime(time) {
|
||||
if (!time) return ''
|
||||
return new Date(time).toLocaleTimeString('zh-CN', { hour: '2-digit', minute: '2-digit' })
|
||||
const date = new Date(time)
|
||||
// 使用本地时区显示
|
||||
return date.toLocaleTimeString('zh-CN', { hour: '2-digit', minute: '2-digit' })
|
||||
}
|
||||
|
||||
function copyContent() {
|
||||
|
|
|
|||
|
|
@ -9,10 +9,11 @@
|
|||
<span class="step-label">思考中</span>
|
||||
<span class="step-brief">{{ item.brief || '正在思考...' }}</span>
|
||||
<span v-if="streaming && item.key === lastThinkingKey" class="loading-dots">...</span>
|
||||
<span v-else-if="item.content && item.content.length > 1024" class="truncate-hint">已截断</span>
|
||||
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
|
||||
</div>
|
||||
<div v-if="expandedKeys.has(item.key)" class="step-content">
|
||||
<div class="thinking-text">{{ item.content }}</div>
|
||||
<div class="thinking-text">{{ item.displayContent }}</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
@ -25,6 +26,7 @@
|
|||
<span v-if="item.loading" class="loading-dots">...</span>
|
||||
<span v-else-if="item.isSuccess === true" class="step-badge success">成功</span>
|
||||
<span v-else-if="item.isSuccess === false" class="step-badge error">失败</span>
|
||||
<span v-if="item.fullResult && item.fullResult.length > 1024" class="truncate-hint">已截断</span>
|
||||
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
|
||||
</div>
|
||||
<div v-if="expandedKeys.has(item.key)" class="step-content">
|
||||
|
|
@ -34,7 +36,7 @@
|
|||
</div>
|
||||
<div v-if="item.resultSummary || item.fullResult" class="tool-detail" style="margin-top: 8px;">
|
||||
<span class="detail-label">结果</span>
|
||||
<pre>{{ item.fullResult || item.resultSummary }}</pre>
|
||||
<pre>{{ item.displayResult }}</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
|
@ -71,12 +73,14 @@ const allItems = computed(() => {
|
|||
if (props.processSteps && props.processSteps.length > 0) {
|
||||
for (const step of props.processSteps) {
|
||||
if (step.type === 'thinking') {
|
||||
const content = step.content || ''
|
||||
items.push({
|
||||
key: step.id || `thinking-${step.index}`,
|
||||
type: 'thinking',
|
||||
index: step.index,
|
||||
content: step.content || '',
|
||||
brief: step.content ? step.content.slice(0, 50) + (step.content.length > 50 ? '...' : '') : '',
|
||||
content: content,
|
||||
displayContent: content.length > 1024 ? content.slice(0, 1024) + '\n\n[... 内容已截断 ...]' : content,
|
||||
brief: content.slice(0, 50) + (content.length > 50 ? '...' : ''),
|
||||
})
|
||||
} else if (step.type === 'tool_call') {
|
||||
items.push({
|
||||
|
|
@ -97,12 +101,15 @@ const allItems = computed(() => {
|
|||
const toolId = step.id_ref || step.id
|
||||
const match = items.findLast(it => it.type === 'tool_call' && it.id === toolId)
|
||||
if (match) {
|
||||
match.resultSummary = step.content ? step.content.slice(0, 200) : ''
|
||||
match.fullResult = step.content || ''
|
||||
const resultContent = step.content || ''
|
||||
match.resultSummary = resultContent.slice(0, 200)
|
||||
match.fullResult = resultContent
|
||||
match.displayResult = resultContent.length > 1024 ? resultContent.slice(0, 1024) + '\n\n[... 结果已截断 ...]' : resultContent
|
||||
match.isSuccess = step.success !== false
|
||||
match.loading = false
|
||||
} else {
|
||||
// 如果没有找到对应的 tool_call,创建一个占位符
|
||||
const placeholderContent = step.content || ''
|
||||
items.push({
|
||||
key: `result-${step.id || step.index}`,
|
||||
type: 'tool_call',
|
||||
|
|
@ -113,8 +120,9 @@ const allItems = computed(() => {
|
|||
brief: step.name || '工具结果',
|
||||
loading: false,
|
||||
isSuccess: true,
|
||||
resultSummary: step.content ? step.content.slice(0, 200) : '',
|
||||
fullResult: step.content || ''
|
||||
resultSummary: placeholderContent.slice(0, 200),
|
||||
fullResult: placeholderContent,
|
||||
displayResult: placeholderContent.length > 1024 ? placeholderContent.slice(0, 1024) + '\n\n[... 结果已截断 ...]' : placeholderContent
|
||||
})
|
||||
}
|
||||
} else if (step.type === 'text') {
|
||||
|
|
@ -280,6 +288,15 @@ const sparkleIcon = `<svg viewBox="0 0 24 24" width="14" height="14" fill="none"
|
|||
color: var(--success-color);
|
||||
}
|
||||
|
||||
.truncate-hint {
|
||||
font-size: 10px;
|
||||
padding: 2px 6px;
|
||||
background: var(--warning-bg);
|
||||
color: var(--warning-color);
|
||||
border-radius: 4px;
|
||||
margin-left: 4px;
|
||||
}
|
||||
|
||||
.step-badge.error {
|
||||
background: var(--danger-bg);
|
||||
color: var(--danger-color);
|
||||
|
|
|
|||
|
|
@ -41,6 +41,8 @@
|
|||
/* 状态颜色 */
|
||||
--success-color: #059669;
|
||||
--success-bg: rgba(16, 185, 129, 0.1);
|
||||
--warning-color: #d97706;
|
||||
--warning-bg: rgba(217, 119, 6, 0.1);
|
||||
--danger-color: #ef4444;
|
||||
--danger-bg: rgba(239, 68, 68, 0.08);
|
||||
|
||||
|
|
@ -112,6 +114,8 @@
|
|||
|
||||
--success-color: #34d399;
|
||||
--success-bg: rgba(52, 211, 153, 0.15);
|
||||
--warning-color: #fbbf24;
|
||||
--warning-bg: rgba(251, 191, 36, 0.15);
|
||||
--danger-color: #f87171;
|
||||
--danger-bg: rgba(248, 113, 113, 0.15);
|
||||
|
||||
|
|
|
|||
|
|
@ -52,9 +52,9 @@ const blockMathExtension = {
|
|||
}
|
||||
|
||||
marked.use({
|
||||
extensions: [blockMathExtension, mathExtension],
|
||||
gfm: true,
|
||||
breaks: true,
|
||||
gfm: true
|
||||
extensions: [blockMathExtension, mathExtension]
|
||||
})
|
||||
|
||||
export function renderMarkdown(text) {
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@
|
|||
</div>
|
||||
</div>
|
||||
|
||||
<div ref="messagesContainer" class="messages-container">
|
||||
<div ref="messagesContainer" class="messages-container" @scroll="handleScroll">
|
||||
<div v-if="loading" class="load-more-top">
|
||||
<span>加载中...</span>
|
||||
</div>
|
||||
|
|
@ -106,6 +106,7 @@ const sending = ref(false)
|
|||
const streamingMessage = ref(null)
|
||||
const messagesContainer = ref(null)
|
||||
const textareaRef = ref(null)
|
||||
const autoScroll = ref(true)
|
||||
const conversationId = ref(route.params.id)
|
||||
const conversationTitle = ref('')
|
||||
|
||||
|
|
@ -128,6 +129,7 @@ function onKeydown(e) {
|
|||
}
|
||||
|
||||
const loadMessages = async () => {
|
||||
autoScroll.value = true
|
||||
loading.value = true
|
||||
try {
|
||||
const res = await messagesAPI.list(conversationId.value)
|
||||
|
|
@ -191,6 +193,7 @@ const sendMessage = async () => {
|
|||
{ conversation_id: conversationId.value, content },
|
||||
{
|
||||
onProcessStep: (step) => {
|
||||
autoScroll.value = true // 流式开始时启用自动滚动
|
||||
if (!streamingMessage.value) return
|
||||
// 按 id 更新或追加步骤
|
||||
const idx = streamingMessage.value.process_steps.findIndex(s => s.id === step.id)
|
||||
|
|
@ -202,6 +205,7 @@ const sendMessage = async () => {
|
|||
},
|
||||
onDone: () => {
|
||||
// 完成,添加到消息列表
|
||||
autoScroll.value = true
|
||||
if (streamingMessage.value) {
|
||||
messages.value.push({
|
||||
...streamingMessage.value,
|
||||
|
|
@ -230,6 +234,7 @@ const sendMessage = async () => {
|
|||
}
|
||||
|
||||
const scrollToBottom = () => {
|
||||
if (!autoScroll.value) return
|
||||
nextTick(() => {
|
||||
if (messagesContainer.value) {
|
||||
messagesContainer.value.scrollTo({
|
||||
|
|
@ -240,6 +245,15 @@ const scrollToBottom = () => {
|
|||
})
|
||||
}
|
||||
|
||||
// 处理滚动事件,检测用户是否手动滚动
|
||||
const handleScroll = () => {
|
||||
if (!messagesContainer.value) return
|
||||
const { scrollTop, scrollHeight, clientHeight } = messagesContainer.value
|
||||
const distanceToBottom = scrollHeight - scrollTop - clientHeight
|
||||
// 距离底部超过50px时停止自动跟随
|
||||
autoScroll.value = distanceToBottom < 50
|
||||
}
|
||||
|
||||
// 监听流式消息变化,自动滚动
|
||||
watch(() => streamingMessage.value?.process_steps?.length, () => {
|
||||
if (streamingMessage.value) {
|
||||
|
|
|
|||
|
|
@ -102,6 +102,11 @@
|
|||
<label>模型名称</label>
|
||||
<input v-model="form.default_model" placeholder="deepseek-chat / gpt-4" required />
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>最大 Tokens</label>
|
||||
<input v-model.number="form.max_tokens" type="number" placeholder="8192" min="1" />
|
||||
<span class="hint">单次回复最大 token 数,默认 8192</span>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="switch-card" :class="{ active: form.is_default }">
|
||||
<div class="switch-content">
|
||||
|
|
@ -201,7 +206,7 @@ const testResult = ref(null)
|
|||
const formError = ref('')
|
||||
|
||||
const form = ref({
|
||||
name: '', base_url: '', api_key: '', default_model: '', is_default: false
|
||||
name: '', base_url: '', api_key: '', default_model: '', max_tokens: 8192, is_default: false
|
||||
})
|
||||
|
||||
const fetchProviders = async () => {
|
||||
|
|
@ -218,7 +223,7 @@ const fetchProviders = async () => {
|
|||
const closeModal = () => {
|
||||
showModal.value = false
|
||||
editing.value = null
|
||||
form.value = { name: '', base_url: '', api_key: '', default_model: '', is_default: false }
|
||||
form.value = { name: '', base_url: '', api_key: '', default_model: '', max_tokens: 8192, is_default: false }
|
||||
formError.value = ''
|
||||
}
|
||||
|
||||
|
|
@ -232,6 +237,7 @@ const editProvider = async (p) => {
|
|||
base_url: res.data.base_url,
|
||||
api_key: res.data.api_key || '',
|
||||
default_model: res.data.default_model,
|
||||
max_tokens: res.data.max_tokens || 8192,
|
||||
is_default: res.data.is_default
|
||||
}
|
||||
}
|
||||
|
|
@ -381,6 +387,7 @@ input:checked + .slider:before { transform: translateX(22px); }
|
|||
.switch-card input:checked + .slider { background-color: var(--accent); }
|
||||
.switch-card input:checked + .slider:before { transform: translateX(22px); }
|
||||
.modal-actions { display: flex; justify-content: flex-end; gap: 1rem; margin-top: 1.5rem; }
|
||||
.form-group .hint { font-size: 0.85rem; color: var(--text); margin-top: 4px; display: block; }
|
||||
.spinner { width: 48px; height: 48px; border: 4px solid var(--border); border-top-color: var(--accent); border-radius: 50%; animation: spin 1s linear infinite; margin: 0 auto 1rem; }
|
||||
@keyframes spin { to { transform: rotate(360deg); } }
|
||||
</style>
|
||||
|
|
|
|||
|
|
@ -7,6 +7,10 @@ from sqlalchemy.orm import Mapped, mapped_column, relationship
|
|||
from luxx.database import Base
|
||||
|
||||
|
||||
def local_now():
|
||||
return datetime.now()
|
||||
|
||||
|
||||
class LLMProvider(Base):
|
||||
"""LLM Provider configuration model"""
|
||||
__tablename__ = "llm_providers"
|
||||
|
|
@ -18,10 +22,11 @@ class LLMProvider(Base):
|
|||
base_url: Mapped[str] = mapped_column(String(500), nullable=False)
|
||||
api_key: Mapped[str] = mapped_column(String(500), nullable=False)
|
||||
default_model: Mapped[str] = mapped_column(String(100), nullable=False, default="gpt-4")
|
||||
max_tokens: Mapped[int] = mapped_column(Integer, default=8192) # 默认 8192
|
||||
is_default: Mapped[bool] = mapped_column(Boolean, default=False)
|
||||
enabled: Mapped[bool] = mapped_column(Boolean, default=True)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
||||
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
|
||||
updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
|
||||
|
||||
# Relationships
|
||||
user: Mapped["User"] = relationship("User", backref="llm_providers")
|
||||
|
|
@ -35,6 +40,7 @@ class LLMProvider(Base):
|
|||
"provider_type": self.provider_type,
|
||||
"base_url": self.base_url,
|
||||
"default_model": self.default_model,
|
||||
"max_tokens": self.max_tokens,
|
||||
"is_default": self.is_default,
|
||||
"enabled": self.enabled,
|
||||
"created_at": self.created_at.isoformat() if self.created_at else None,
|
||||
|
|
@ -53,8 +59,8 @@ class Project(Base):
|
|||
user_id: Mapped[int] = mapped_column(Integer, ForeignKey("users.id"), nullable=False)
|
||||
name: Mapped[str] = mapped_column(String(255), nullable=False)
|
||||
description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
||||
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
|
||||
updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
|
||||
|
||||
# Relationships
|
||||
user: Mapped["User"] = relationship("User", backref="projects")
|
||||
|
|
@ -70,7 +76,7 @@ class User(Base):
|
|||
password_hash: Mapped[Optional[str]] = mapped_column(String(255), nullable=True)
|
||||
role: Mapped[str] = mapped_column(String(20), default="user")
|
||||
is_active: Mapped[bool] = mapped_column(Boolean, default=True)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
|
||||
|
||||
# Relationships
|
||||
conversations: Mapped[List["Conversation"]] = relationship(
|
||||
|
|
@ -102,8 +108,8 @@ class Conversation(Base):
|
|||
temperature: Mapped[float] = mapped_column(Float, default=0.7)
|
||||
max_tokens: Mapped[int] = mapped_column(Integer, default=2000)
|
||||
thinking_enabled: Mapped[bool] = mapped_column(Boolean, default=False)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
||||
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
|
||||
updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
|
||||
|
||||
# Relationships
|
||||
user: Mapped["User"] = relationship("User", back_populates="conversations")
|
||||
|
|
@ -161,7 +167,7 @@ class Message(Base):
|
|||
role: Mapped[str] = mapped_column(String(16), nullable=False) # user, assistant, system, tool
|
||||
content: Mapped[str] = mapped_column(Text, nullable=False, default="")
|
||||
token_count: Mapped[int] = mapped_column(Integer, default=0)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
|
||||
|
||||
# Relationships
|
||||
conversation: Mapped["Conversation"] = relationship("Conversation", back_populates="messages")
|
||||
|
|
|
|||
|
|
@ -79,8 +79,8 @@ def send_message(
|
|||
)
|
||||
db.add(user_message)
|
||||
|
||||
from datetime import datetime
|
||||
conversation.updated_at = datetime.utcnow()
|
||||
from datetime import datetime, timezone, timedelta
|
||||
conversation.updated_at = datetime.now(timezone(timedelta(hours=8)))
|
||||
|
||||
response = chat_service.non_stream_response(
|
||||
conversation=conversation,
|
||||
|
|
@ -133,7 +133,7 @@ async def stream_message(
|
|||
token_count=len(data.content) // 4
|
||||
)
|
||||
db.add(user_message)
|
||||
conversation.updated_at = datetime.utcnow()
|
||||
conversation.updated_at = datetime.now()
|
||||
db.commit()
|
||||
|
||||
async def event_generator():
|
||||
|
|
|
|||
|
|
@ -20,7 +20,8 @@ def _sse_event(event: str, data: dict) -> str:
|
|||
|
||||
|
||||
def get_llm_client(conversation: Conversation = None):
|
||||
"""Get LLM client, optionally using conversation's provider"""
|
||||
"""Get LLM client, optionally using conversation's provider. Returns (client, max_tokens)"""
|
||||
max_tokens = None
|
||||
if conversation and conversation.provider_id:
|
||||
from luxx.models import LLMProvider
|
||||
from luxx.database import SessionLocal
|
||||
|
|
@ -28,18 +29,19 @@ def get_llm_client(conversation: Conversation = None):
|
|||
try:
|
||||
provider = db.query(LLMProvider).filter(LLMProvider.id == conversation.provider_id).first()
|
||||
if provider:
|
||||
max_tokens = provider.max_tokens
|
||||
client = LLMClient(
|
||||
api_key=provider.api_key,
|
||||
api_url=provider.base_url,
|
||||
model=provider.default_model
|
||||
)
|
||||
return client
|
||||
return client, max_tokens
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
# Fallback to global config
|
||||
client = LLMClient()
|
||||
return client
|
||||
return client, max_tokens
|
||||
|
||||
|
||||
class ChatService:
|
||||
|
|
@ -112,8 +114,10 @@ class ChatService:
|
|||
|
||||
tools = registry.list_all() if tools_enabled else None
|
||||
|
||||
llm = get_llm_client(conversation)
|
||||
llm, provider_max_tokens = get_llm_client(conversation)
|
||||
model = conversation.model or llm.default_model or "gpt-4"
|
||||
# 使用 provider 的 max_tokens,如果 conversation 有自己的 max_tokens 则覆盖
|
||||
max_tokens = conversation.max_tokens if hasattr(conversation, 'max_tokens') and conversation.max_tokens else provider_max_tokens
|
||||
|
||||
# State tracking
|
||||
all_steps = []
|
||||
|
|
@ -146,7 +150,7 @@ class ChatService:
|
|||
messages=messages,
|
||||
tools=tools,
|
||||
temperature=conversation.temperature,
|
||||
max_tokens=conversation.max_tokens
|
||||
max_tokens=max_tokens or 8192
|
||||
):
|
||||
# Parse SSE line
|
||||
# Format: "event: xxx\ndata: {...}\n\n"
|
||||
|
|
|
|||
|
|
@ -1,14 +1,11 @@
|
|||
"""Web crawler tools"""
|
||||
import requests
|
||||
from typing import Dict, Any, List, Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
"""Crawler related tools"""
|
||||
from luxx.tools.factory import tool
|
||||
from luxx.tools.services import SearchService, FetchService
|
||||
|
||||
|
||||
@tool(
|
||||
name="web_search",
|
||||
description="Search the internet for information using web search",
|
||||
description="Search the internet for information. Use when you need to find latest news or answer questions.",
|
||||
parameters={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -18,7 +15,7 @@ from luxx.tools.factory import tool
|
|||
},
|
||||
"max_results": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of results to return",
|
||||
"description": "Number of results to return, default 5",
|
||||
"default": 5
|
||||
}
|
||||
},
|
||||
|
|
@ -26,40 +23,25 @@ from luxx.tools.factory import tool
|
|||
},
|
||||
category="crawler"
|
||||
)
|
||||
def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def web_search(arguments: dict) -> dict:
|
||||
"""
|
||||
Execute web search
|
||||
|
||||
Note: This is a placeholder implementation, real usage requires integrating with actual search APIs
|
||||
such as: Google Custom Search, DuckDuckGo, SerpAPI, etc.
|
||||
Web search tool using DuckDuckGo
|
||||
"""
|
||||
query = arguments.get("query", "")
|
||||
query = arguments["query"]
|
||||
max_results = arguments.get("max_results", 5)
|
||||
|
||||
if not query:
|
||||
return {"success": False, "error": "Query is required"}
|
||||
service = SearchService()
|
||||
results = service.search(query, max_results)
|
||||
|
||||
# Simulated search results
|
||||
# Real implementation should integrate with actual search API
|
||||
return {
|
||||
"success": True,
|
||||
"data": {
|
||||
"query": query,
|
||||
"results": [
|
||||
{
|
||||
"title": f"Result for '{query}' - Example {i+1}",
|
||||
"url": f"https://example.com/result_{i+1}",
|
||||
"snippet": f"This is a sample search result for the query '{query}'. " * 3
|
||||
}
|
||||
for i in range(min(max_results, 5))
|
||||
]
|
||||
}
|
||||
}
|
||||
if not results:
|
||||
return {"success": True, "data": {"query": query, "results": []}, "message": "No results found"}
|
||||
|
||||
return {"success": True, "data": {"query": query, "results": results}}
|
||||
|
||||
|
||||
@tool(
|
||||
name="web_fetch",
|
||||
description="Fetch and parse content from a web page",
|
||||
description="Fetch content from a webpage. Use when user needs detailed information from a page.",
|
||||
parameters={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -67,123 +49,80 @@ def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]:
|
|||
"type": "string",
|
||||
"description": "URL of the webpage to fetch"
|
||||
},
|
||||
"extract_text": {
|
||||
"type": "boolean",
|
||||
"description": "Whether to extract text content only",
|
||||
"default": True
|
||||
"extract_type": {
|
||||
"type": "string",
|
||||
"description": "Extraction type: text, links, or structured",
|
||||
"enum": ["text", "links", "structured"],
|
||||
"default": "text"
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
},
|
||||
category="crawler"
|
||||
)
|
||||
def web_fetch(arguments: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Fetch and parse web page content"""
|
||||
url = arguments.get("url", "")
|
||||
extract_text = arguments.get("extract_text", True)
|
||||
def web_fetch(arguments: dict) -> dict:
|
||||
"""
|
||||
Page fetch tool
|
||||
"""
|
||||
url = arguments["url"]
|
||||
extract_type = arguments.get("extract_type", "text")
|
||||
|
||||
if not url:
|
||||
return {"success": False, "error": "URL is required"}
|
||||
|
||||
# Simple URL validation
|
||||
if not url.startswith(("http://", "https://")):
|
||||
url = "https://" + url
|
||||
service = FetchService(timeout=15)
|
||||
result = service.fetch(url, extract_type)
|
||||
|
||||
try:
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=10)
|
||||
response.raise_for_status()
|
||||
if "error" in result:
|
||||
return {"success": False, "error": result["error"]}
|
||||
|
||||
if extract_text:
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
# Remove script and style tags
|
||||
for tag in soup(["script", "style"]):
|
||||
tag.decompose()
|
||||
text = soup.get_text(separator="\n", strip=True)
|
||||
# Clean up extra blank lines
|
||||
lines = [line.strip() for line in text.split("\n") if line.strip()]
|
||||
text = "\n".join(lines)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"data": {
|
||||
"url": url,
|
||||
"title": soup.title.string if soup.title else "",
|
||||
"content": text[:10000] # Limit content length
|
||||
}
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"success": True,
|
||||
"data": {
|
||||
"url": url,
|
||||
"html": response.text[:50000] # Limit HTML length
|
||||
}
|
||||
}
|
||||
except requests.RequestException as e:
|
||||
return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}
|
||||
return {"success": True, "data": result}
|
||||
|
||||
|
||||
@tool(
|
||||
name="extract_links",
|
||||
description="Extract all links from a web page",
|
||||
name="batch_fetch",
|
||||
description="Batch fetch multiple webpages. Use when you need to get content from multiple pages.",
|
||||
parameters={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"description": "URL of the web page"
|
||||
"urls": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "List of URLs to fetch"
|
||||
},
|
||||
"max_links": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of links to extract",
|
||||
"default": 20
|
||||
"extract_type": {
|
||||
"type": "string",
|
||||
"enum": ["text", "links", "structured"],
|
||||
"default": "text"
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
"required": ["urls"]
|
||||
},
|
||||
category="crawler"
|
||||
)
|
||||
def extract_links(arguments: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Extract all links from a web page"""
|
||||
url = arguments.get("url", "")
|
||||
max_links = arguments.get("max_links", 20)
|
||||
def batch_fetch(arguments: dict) -> dict:
|
||||
"""
|
||||
Batch fetch tool
|
||||
"""
|
||||
urls = arguments["urls"]
|
||||
extract_type = arguments.get("extract_type", "text")
|
||||
|
||||
if not url:
|
||||
return {"success": False, "error": "URL is required"}
|
||||
if not urls:
|
||||
return {"success": False, "error": "URLs list is required"}
|
||||
|
||||
if not url.startswith(("http://", "https://")):
|
||||
url = "https://" + url
|
||||
if len(urls) > 10:
|
||||
return {"success": False, "error": "Maximum 10 pages allowed"}
|
||||
|
||||
try:
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=10)
|
||||
response.raise_for_status()
|
||||
service = FetchService(timeout=10)
|
||||
results = service.fetch_batch(urls, extract_type)
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
links = []
|
||||
|
||||
for a_tag in soup.find_all("a", href=True)[:max_links]:
|
||||
href = a_tag["href"]
|
||||
# Handle relative URLs
|
||||
if href.startswith("/"):
|
||||
from urllib.parse import urljoin
|
||||
href = urljoin(url, href)
|
||||
links.append({
|
||||
"text": a_tag.get_text(strip=True) or href,
|
||||
"url": href
|
||||
})
|
||||
successful = sum(1 for r in results if "error" not in r)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"data": {
|
||||
"url": url,
|
||||
"links": links
|
||||
"results": results,
|
||||
"total": len(results),
|
||||
"successful": successful
|
||||
}
|
||||
}
|
||||
except requests.RequestException as e:
|
||||
return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}
|
||||
|
|
|
|||
|
|
@ -156,7 +156,7 @@ class ToolExecutor:
|
|||
"tool_call_id": call_id,
|
||||
"role": "tool",
|
||||
"name": name,
|
||||
"content": json.dumps(result)
|
||||
"content": json.dumps(result, ensure_ascii=False)
|
||||
}
|
||||
|
||||
def _create_error_result(self, call_id: str, name: str, error: str) -> Dict[str, Any]:
|
||||
|
|
@ -165,7 +165,7 @@ class ToolExecutor:
|
|||
"tool_call_id": call_id,
|
||||
"role": "tool",
|
||||
"name": name,
|
||||
"content": json.dumps({"success": False, "error": error})
|
||||
"content": json.dumps({"success": False, "error": error}, ensure_ascii=False)
|
||||
}
|
||||
|
||||
def clear_cache(self) -> None:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,247 @@
|
|||
"""Tool helper services"""
|
||||
import re
|
||||
import httpx
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
|
||||
class SearchService:
    """Web search service backed by DuckDuckGo's HTML endpoint.

    Only the ``"duckduckgo"`` engine is implemented; :meth:`search` raises
    ``ValueError`` for any other engine name.
    """

    _DUCKDUCKGO_HTML_URL = "https://html.duckduckgo.com/html/"

    def __init__(self, engine: str = "duckduckgo"):
        self.engine = engine

    def search(
        self,
        query: str,
        max_results: int = 5,
        region: str = "cn-zh"
    ) -> List[dict]:
        """Run a search with the configured engine.

        Args:
            query: Search keywords.
            max_results: Maximum number of results to return.
            region: Region code forwarded to the search engine.

        Returns:
            A list of ``{"title", "url", "snippet"}`` dicts (possibly empty).

        Raises:
            ValueError: If ``self.engine`` is not a supported engine.
        """
        if self.engine == "duckduckgo":
            return self._search_duckduckgo(query, max_results, region)
        raise ValueError(f"Unsupported search engine: {self.engine}")

    def _search_duckduckgo(
        self,
        query: str,
        max_results: int,
        region: str
    ) -> List[dict]:
        """Query DuckDuckGo's HTML endpoint and scrape the result list.

        Returns an empty list when ``max_results`` is not positive or when
        the HTTP request fails (search is best-effort).
        """
        # Nothing to collect: avoid a pointless network round trip, and avoid
        # the from-the-end slicing a negative limit would otherwise cause.
        if max_results <= 0:
            return []

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "text/html,application/xhtml+xml",
        }

        # Let httpx URL-encode the query; "kl" is DuckDuckGo's region parameter.
        params = {"q": query}
        if region:
            params["kl"] = region

        try:
            resp = httpx.get(
                self._DUCKDUCKGO_HTML_URL,
                params=params,
                headers=headers,
                timeout=15,
                follow_redirects=True,
            )
            resp.raise_for_status()
        except Exception:
            # Deliberately best-effort: any request failure yields "no results".
            return []

        from bs4 import BeautifulSoup
        soup = BeautifulSoup(resp.text, "html.parser")
        results = []

        for result in soup.select(".result"):
            title_elem = result.select_one(".result__title a")
            if title_elem is None:
                # Entries without a title link do not count toward the limit.
                continue

            snippet_elem = result.select_one(".result__snippet")
            raw_url = title_elem.get("href", "")

            # Result links go through a redirect whose "uddg" query parameter
            # holds the real target URL; unwrap it when present.
            if "uddg=" in raw_url:
                redirect_params = parse_qs(urlparse(raw_url).query)
                clean_url = redirect_params.get("uddg", [raw_url])[0]
            else:
                clean_url = raw_url

            results.append({
                "title": title_elem.get_text(strip=True),
                "url": clean_url,
                "snippet": snippet_elem.get_text(strip=True) if snippet_elem else ""
            })

            if len(results) >= max_results:
                break

        return results
|
||||
|
||||
|
||||
class FetchService:
    """Page fetch service: downloads pages with httpx and extracts content."""

    def __init__(self, timeout: float = 15.0):
        self.timeout = timeout
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )

    def fetch(
        self,
        url: str,
        extract_type: str = "text"
    ) -> dict:
        """Fetch a single page and extract its content.

        Args:
            url: Page URL; ``https://`` is prepended when no scheme is given.
            extract_type: ``"text"``, ``"links"``; any other value yields the
                structured extraction.

        Returns:
            The extraction result, or ``{"error": ..., "url": ...}`` when the
            request times out or fails.
        """
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        try:
            resp = httpx.get(
                url,
                timeout=self.timeout,
                follow_redirects=True,
                headers={"User-Agent": self.user_agent}
            )
            resp.raise_for_status()
        except httpx.TimeoutException:
            return {"error": "Request timeout", "url": url}
        except Exception as e:
            return {"error": str(e), "url": url}

        extractor = ContentExtractor(resp.text)

        if extract_type == "text":
            return {
                "url": url,
                "title": extractor.extract_title(),
                "text": extractor.extract_text()[:15000]
            }
        if extract_type == "links":
            return {
                "url": url,
                "links": extractor.extract_links()
            }
        return extractor.extract_structured(url)

    def _fetch_safely(self, url: str, extract_type: str) -> dict:
        """Call :meth:`fetch`, turning any raised exception into an error dict."""
        try:
            return self.fetch(url, extract_type)
        except Exception as e:
            # Same shape as fetch()'s own error results so callers can tell
            # which URL failed.
            return {"error": str(e), "url": url}

    def fetch_batch(
        self,
        urls: List[str],
        extract_type: str = "text",
        max_concurrent: int = 5
    ) -> List[dict]:
        """Fetch several pages, concurrently when there is more than one.

        Args:
            urls: URL list.
            extract_type: Extract type passed to :meth:`fetch`.
            max_concurrent: Max concurrent requests, clamped to 1-5.

        Returns:
            One result per input URL, in input order. A URL whose fetch
            raised is reported as ``{"error": ..., "url": ...}``.
        """
        if not urls:
            return []
        if len(urls) == 1:
            # No thread pool needed, but keep the same error guard as the
            # concurrent path so both paths behave alike.
            return [self._fetch_safely(urls[0], extract_type)]

        workers = min(max(max_concurrent, 1), 5)
        with ThreadPoolExecutor(max_workers=workers) as pool:
            # Executor.map yields results in input order.
            return list(
                pool.map(lambda u: self._fetch_safely(u, extract_type), urls)
            )
|
||||
|
||||
|
||||
class ContentExtractor:
    """Content extractor using BeautifulSoup.

    The HTML is parsed lazily on first access of :attr:`soup` and the parsed
    tree is cached. No extraction method modifies the cached tree, so the
    methods can be called in any order.
    """

    # Tags whose content is dropped from the plain-text extraction.
    _NON_CONTENT_TAGS = ["script", "style", "nav", "footer", "header", "aside"]

    def __init__(self, html: str):
        self.html = html
        self._soup = None

    @property
    def soup(self):
        """Parsed document, created on first use."""
        if self._soup is None:
            from bs4 import BeautifulSoup
            self._soup = BeautifulSoup(self.html, "html.parser")
        return self._soup

    def extract_title(self) -> str:
        """Return the ``<title>`` string, or ``""`` when there is none."""
        if self.soup.title:
            return self.soup.title.string or ""
        return ""

    def extract_text(self) -> str:
        """Return the page's plain text without script/style/navigation parts."""
        from bs4 import BeautifulSoup

        # Work on a private parse: decompose() is destructive, and running it
        # on the cached soup would strip nav/header/footer links from any
        # later extract_links() call.
        tree = BeautifulSoup(self.html, "html.parser")
        for tag in tree(self._NON_CONTENT_TAGS):
            tag.decompose()

        text = tree.get_text(separator="\n", strip=True)
        # Collapse runs of three or more newlines into one blank line.
        return re.sub(r"\n{3,}", "\n\n", text)

    def extract_links(self, max_count: int = 50) -> List[dict]:
        """Return up to ``max_count`` ``{"text", "url"}`` dicts for page links.

        Links with no visible text, and ``#``/``javascript:``/``mailto:``/
        ``tel:`` hrefs, are skipped. URLs are returned as written in the page.
        """
        links = []
        if max_count <= 0:
            return links

        for a in self.soup.find_all("a", href=True):
            text = a.get_text(strip=True)
            href = a["href"]
            if text and href and not href.startswith(("#", "javascript:", "mailto:", "tel:")):
                links.append({"text": text, "url": href})
                if len(links) >= max_count:
                    break
        return links

    def extract_structured(self, url: str = "") -> dict:
        """Return title, meta description, text (first 5000 chars) and up to 20 links."""
        description = ""
        meta_desc = self.soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            description = meta_desc.get("content", "")

        return {
            "url": url,
            "title": self.extract_title().strip(),
            "description": description.strip(),
            "text": self.extract_text()[:5000],
            "links": self.extract_links(20)
        }
|
||||
|
|
@ -2,7 +2,6 @@
|
|||
name = "luxx"
|
||||
version = "1.0.0"
|
||||
description = "luxx - FastAPI + SQLAlchemy"
|
||||
readme = "docs/README.md"
|
||||
requires-python = ">=3.10"
|
||||
|
||||
dependencies = [
|
||||
|
|
@ -19,6 +18,7 @@ dependencies = [
|
|||
"requests>=2.31.0",
|
||||
"beautifulsoup4>=4.12.3",
|
||||
"lxml>=5.1.0",
|
||||
"httpx>=0.26.0",
|
||||
"pyyaml>=6.0.1",
|
||||
"shortuuid>=1.0.11",
|
||||
"pydantic>=2.5.0",
|
||||
|
|
@ -34,3 +34,6 @@ dev = [
|
|||
"black>=24.0.0",
|
||||
"ruff>=0.1.0",
|
||||
]
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["luxx"]
|
||||
|
|
|
|||
Loading…
Reference in New Issue