feat: 完善爬虫功能
This commit is contained in:
parent
805f8c86da
commit
a84b8617a6
|
|
@ -9,7 +9,7 @@
|
||||||
!README.md
|
!README.md
|
||||||
!.gitignore
|
!.gitignore
|
||||||
|
|
||||||
!luxx/**/*.py
|
!*.py
|
||||||
!asserts/**/*.md
|
!asserts/**/*.md
|
||||||
|
|
||||||
# Dashboard
|
# Dashboard
|
||||||
|
|
|
||||||
|
|
@ -5,9 +5,13 @@
|
||||||
- **框架**: FastAPI 0.109+
|
- **框架**: FastAPI 0.109+
|
||||||
- **数据库**: SQLAlchemy 2.0+
|
- **数据库**: SQLAlchemy 2.0+
|
||||||
- **认证**: JWT (PyJWT)
|
- **认证**: JWT (PyJWT)
|
||||||
- **HTTP客户端**: httpx
|
- **HTTP客户端**: httpx, requests
|
||||||
- **配置**: YAML (PyYAML)
|
- **配置**: YAML (PyYAML)
|
||||||
- **代码执行**: Python 原生执行
|
- **代码执行**: Python 原生执行
|
||||||
|
- **网页爬虫**:
|
||||||
|
- `httpx` - HTTP 客户端
|
||||||
|
- `beautifulsoup4` - HTML 解析
|
||||||
|
- `lxml` - XML/HTML 解析器
|
||||||
|
|
||||||
## 目录结构
|
## 目录结构
|
||||||
|
|
||||||
|
|
@ -36,6 +40,7 @@ luxx/
|
||||||
│ ├── crawler.py # 网页爬虫
|
│ ├── crawler.py # 网页爬虫
|
||||||
│ ├── data.py # 数据处理
|
│ ├── data.py # 数据处理
|
||||||
│ └── weather.py # 天气查询
|
│ └── weather.py # 天气查询
|
||||||
|
│ └── services.py # 工具服务层
|
||||||
└── utils/ # 工具函数
|
└── utils/ # 工具函数
|
||||||
└── helpers.py
|
└── helpers.py
|
||||||
```
|
```
|
||||||
|
|
@ -205,7 +210,9 @@ classDiagram
|
||||||
|------|------|------|
|
|------|------|------|
|
||||||
| `python_execute` | 执行 Python 代码 | 支持 print 输出、变量访问 |
|
| `python_execute` | 执行 Python 代码 | 支持 print 输出、变量访问 |
|
||||||
| `python_eval` | 计算表达式 | 快速求值 |
|
| `python_eval` | 计算表达式 | 快速求值 |
|
||||||
| `web_crawl` | 网页抓取 | BeautifulSoup + httpx |
|
| `web_search` | 网页搜索 | DuckDuckGo HTML 搜索 |
|
||||||
|
| `web_fetch` | 网页抓取 | httpx + BeautifulSoup,支持 text/links/structured |
|
||||||
|
| `batch_fetch` | 批量抓取 | 并发获取多个页面 |
|
||||||
| `get_weather` | 天气查询 | 支持城市名查询 |
|
| `get_weather` | 天气查询 | 支持城市名查询 |
|
||||||
| `process_data` | 数据处理 | JSON 转换、格式化等 |
|
| `process_data` | 数据处理 | JSON 转换、格式化等 |
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ app:
|
||||||
|
|
||||||
database:
|
database:
|
||||||
type: sqlite
|
type: sqlite
|
||||||
url: sqlite:///../chat.db
|
url: sqlite:///./chat.db
|
||||||
|
|
||||||
llm:
|
llm:
|
||||||
provider: deepseek
|
provider: deepseek
|
||||||
|
|
|
||||||
|
|
@ -66,7 +66,9 @@ const renderedContent = computed(() => {
|
||||||
|
|
||||||
function formatTime(time) {
|
function formatTime(time) {
|
||||||
if (!time) return ''
|
if (!time) return ''
|
||||||
return new Date(time).toLocaleTimeString('zh-CN', { hour: '2-digit', minute: '2-digit' })
|
const date = new Date(time)
|
||||||
|
// 使用本地时区显示
|
||||||
|
return date.toLocaleTimeString('zh-CN', { hour: '2-digit', minute: '2-digit' })
|
||||||
}
|
}
|
||||||
|
|
||||||
function copyContent() {
|
function copyContent() {
|
||||||
|
|
|
||||||
|
|
@ -9,10 +9,11 @@
|
||||||
<span class="step-label">思考中</span>
|
<span class="step-label">思考中</span>
|
||||||
<span class="step-brief">{{ item.brief || '正在思考...' }}</span>
|
<span class="step-brief">{{ item.brief || '正在思考...' }}</span>
|
||||||
<span v-if="streaming && item.key === lastThinkingKey" class="loading-dots">...</span>
|
<span v-if="streaming && item.key === lastThinkingKey" class="loading-dots">...</span>
|
||||||
|
<span v-else-if="item.content && item.content.length > 1024" class="truncate-hint">已截断</span>
|
||||||
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
|
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
|
||||||
</div>
|
</div>
|
||||||
<div v-if="expandedKeys.has(item.key)" class="step-content">
|
<div v-if="expandedKeys.has(item.key)" class="step-content">
|
||||||
<div class="thinking-text">{{ item.content }}</div>
|
<div class="thinking-text">{{ item.displayContent }}</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
@ -25,6 +26,7 @@
|
||||||
<span v-if="item.loading" class="loading-dots">...</span>
|
<span v-if="item.loading" class="loading-dots">...</span>
|
||||||
<span v-else-if="item.isSuccess === true" class="step-badge success">成功</span>
|
<span v-else-if="item.isSuccess === true" class="step-badge success">成功</span>
|
||||||
<span v-else-if="item.isSuccess === false" class="step-badge error">失败</span>
|
<span v-else-if="item.isSuccess === false" class="step-badge error">失败</span>
|
||||||
|
<span v-if="item.fullResult && item.fullResult.length > 1024" class="truncate-hint">已截断</span>
|
||||||
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
|
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
|
||||||
</div>
|
</div>
|
||||||
<div v-if="expandedKeys.has(item.key)" class="step-content">
|
<div v-if="expandedKeys.has(item.key)" class="step-content">
|
||||||
|
|
@ -34,7 +36,7 @@
|
||||||
</div>
|
</div>
|
||||||
<div v-if="item.resultSummary || item.fullResult" class="tool-detail" style="margin-top: 8px;">
|
<div v-if="item.resultSummary || item.fullResult" class="tool-detail" style="margin-top: 8px;">
|
||||||
<span class="detail-label">结果</span>
|
<span class="detail-label">结果</span>
|
||||||
<pre>{{ item.fullResult || item.resultSummary }}</pre>
|
<pre>{{ item.displayResult }}</pre>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -71,12 +73,14 @@ const allItems = computed(() => {
|
||||||
if (props.processSteps && props.processSteps.length > 0) {
|
if (props.processSteps && props.processSteps.length > 0) {
|
||||||
for (const step of props.processSteps) {
|
for (const step of props.processSteps) {
|
||||||
if (step.type === 'thinking') {
|
if (step.type === 'thinking') {
|
||||||
|
const content = step.content || ''
|
||||||
items.push({
|
items.push({
|
||||||
key: step.id || `thinking-${step.index}`,
|
key: step.id || `thinking-${step.index}`,
|
||||||
type: 'thinking',
|
type: 'thinking',
|
||||||
index: step.index,
|
index: step.index,
|
||||||
content: step.content || '',
|
content: content,
|
||||||
brief: step.content ? step.content.slice(0, 50) + (step.content.length > 50 ? '...' : '') : '',
|
displayContent: content.length > 1024 ? content.slice(0, 1024) + '\n\n[... 内容已截断 ...]' : content,
|
||||||
|
brief: content.slice(0, 50) + (content.length > 50 ? '...' : ''),
|
||||||
})
|
})
|
||||||
} else if (step.type === 'tool_call') {
|
} else if (step.type === 'tool_call') {
|
||||||
items.push({
|
items.push({
|
||||||
|
|
@ -97,12 +101,15 @@ const allItems = computed(() => {
|
||||||
const toolId = step.id_ref || step.id
|
const toolId = step.id_ref || step.id
|
||||||
const match = items.findLast(it => it.type === 'tool_call' && it.id === toolId)
|
const match = items.findLast(it => it.type === 'tool_call' && it.id === toolId)
|
||||||
if (match) {
|
if (match) {
|
||||||
match.resultSummary = step.content ? step.content.slice(0, 200) : ''
|
const resultContent = step.content || ''
|
||||||
match.fullResult = step.content || ''
|
match.resultSummary = resultContent.slice(0, 200)
|
||||||
|
match.fullResult = resultContent
|
||||||
|
match.displayResult = resultContent.length > 1024 ? resultContent.slice(0, 1024) + '\n\n[... 结果已截断 ...]' : resultContent
|
||||||
match.isSuccess = step.success !== false
|
match.isSuccess = step.success !== false
|
||||||
match.loading = false
|
match.loading = false
|
||||||
} else {
|
} else {
|
||||||
// 如果没有找到对应的 tool_call,创建一个占位符
|
// 如果没有找到对应的 tool_call,创建一个占位符
|
||||||
|
const placeholderContent = step.content || ''
|
||||||
items.push({
|
items.push({
|
||||||
key: `result-${step.id || step.index}`,
|
key: `result-${step.id || step.index}`,
|
||||||
type: 'tool_call',
|
type: 'tool_call',
|
||||||
|
|
@ -113,8 +120,9 @@ const allItems = computed(() => {
|
||||||
brief: step.name || '工具结果',
|
brief: step.name || '工具结果',
|
||||||
loading: false,
|
loading: false,
|
||||||
isSuccess: true,
|
isSuccess: true,
|
||||||
resultSummary: step.content ? step.content.slice(0, 200) : '',
|
resultSummary: placeholderContent.slice(0, 200),
|
||||||
fullResult: step.content || ''
|
fullResult: placeholderContent,
|
||||||
|
displayResult: placeholderContent.length > 1024 ? placeholderContent.slice(0, 1024) + '\n\n[... 结果已截断 ...]' : placeholderContent
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
} else if (step.type === 'text') {
|
} else if (step.type === 'text') {
|
||||||
|
|
@ -280,6 +288,15 @@ const sparkleIcon = `<svg viewBox="0 0 24 24" width="14" height="14" fill="none"
|
||||||
color: var(--success-color);
|
color: var(--success-color);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.truncate-hint {
|
||||||
|
font-size: 10px;
|
||||||
|
padding: 2px 6px;
|
||||||
|
background: var(--warning-bg);
|
||||||
|
color: var(--warning-color);
|
||||||
|
border-radius: 4px;
|
||||||
|
margin-left: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
.step-badge.error {
|
.step-badge.error {
|
||||||
background: var(--danger-bg);
|
background: var(--danger-bg);
|
||||||
color: var(--danger-color);
|
color: var(--danger-color);
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,8 @@
|
||||||
/* 状态颜色 */
|
/* 状态颜色 */
|
||||||
--success-color: #059669;
|
--success-color: #059669;
|
||||||
--success-bg: rgba(16, 185, 129, 0.1);
|
--success-bg: rgba(16, 185, 129, 0.1);
|
||||||
|
--warning-color: #d97706;
|
||||||
|
--warning-bg: rgba(217, 119, 6, 0.1);
|
||||||
--danger-color: #ef4444;
|
--danger-color: #ef4444;
|
||||||
--danger-bg: rgba(239, 68, 68, 0.08);
|
--danger-bg: rgba(239, 68, 68, 0.08);
|
||||||
|
|
||||||
|
|
@ -112,6 +114,8 @@
|
||||||
|
|
||||||
--success-color: #34d399;
|
--success-color: #34d399;
|
||||||
--success-bg: rgba(52, 211, 153, 0.15);
|
--success-bg: rgba(52, 211, 153, 0.15);
|
||||||
|
--warning-color: #fbbf24;
|
||||||
|
--warning-bg: rgba(251, 191, 36, 0.15);
|
||||||
--danger-color: #f87171;
|
--danger-color: #f87171;
|
||||||
--danger-bg: rgba(248, 113, 113, 0.15);
|
--danger-bg: rgba(248, 113, 113, 0.15);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -52,9 +52,9 @@ const blockMathExtension = {
|
||||||
}
|
}
|
||||||
|
|
||||||
marked.use({
|
marked.use({
|
||||||
extensions: [blockMathExtension, mathExtension],
|
gfm: true,
|
||||||
breaks: true,
|
breaks: true,
|
||||||
gfm: true
|
extensions: [blockMathExtension, mathExtension]
|
||||||
})
|
})
|
||||||
|
|
||||||
export function renderMarkdown(text) {
|
export function renderMarkdown(text) {
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div ref="messagesContainer" class="messages-container">
|
<div ref="messagesContainer" class="messages-container" @scroll="handleScroll">
|
||||||
<div v-if="loading" class="load-more-top">
|
<div v-if="loading" class="load-more-top">
|
||||||
<span>加载中...</span>
|
<span>加载中...</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -106,6 +106,7 @@ const sending = ref(false)
|
||||||
const streamingMessage = ref(null)
|
const streamingMessage = ref(null)
|
||||||
const messagesContainer = ref(null)
|
const messagesContainer = ref(null)
|
||||||
const textareaRef = ref(null)
|
const textareaRef = ref(null)
|
||||||
|
const autoScroll = ref(true)
|
||||||
const conversationId = ref(route.params.id)
|
const conversationId = ref(route.params.id)
|
||||||
const conversationTitle = ref('')
|
const conversationTitle = ref('')
|
||||||
|
|
||||||
|
|
@ -128,6 +129,7 @@ function onKeydown(e) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const loadMessages = async () => {
|
const loadMessages = async () => {
|
||||||
|
autoScroll.value = true
|
||||||
loading.value = true
|
loading.value = true
|
||||||
try {
|
try {
|
||||||
const res = await messagesAPI.list(conversationId.value)
|
const res = await messagesAPI.list(conversationId.value)
|
||||||
|
|
@ -191,6 +193,7 @@ const sendMessage = async () => {
|
||||||
{ conversation_id: conversationId.value, content },
|
{ conversation_id: conversationId.value, content },
|
||||||
{
|
{
|
||||||
onProcessStep: (step) => {
|
onProcessStep: (step) => {
|
||||||
|
autoScroll.value = true // 流式开始时启用自动滚动
|
||||||
if (!streamingMessage.value) return
|
if (!streamingMessage.value) return
|
||||||
// 按 id 更新或追加步骤
|
// 按 id 更新或追加步骤
|
||||||
const idx = streamingMessage.value.process_steps.findIndex(s => s.id === step.id)
|
const idx = streamingMessage.value.process_steps.findIndex(s => s.id === step.id)
|
||||||
|
|
@ -202,6 +205,7 @@ const sendMessage = async () => {
|
||||||
},
|
},
|
||||||
onDone: () => {
|
onDone: () => {
|
||||||
// 完成,添加到消息列表
|
// 完成,添加到消息列表
|
||||||
|
autoScroll.value = true
|
||||||
if (streamingMessage.value) {
|
if (streamingMessage.value) {
|
||||||
messages.value.push({
|
messages.value.push({
|
||||||
...streamingMessage.value,
|
...streamingMessage.value,
|
||||||
|
|
@ -230,6 +234,7 @@ const sendMessage = async () => {
|
||||||
}
|
}
|
||||||
|
|
||||||
const scrollToBottom = () => {
|
const scrollToBottom = () => {
|
||||||
|
if (!autoScroll.value) return
|
||||||
nextTick(() => {
|
nextTick(() => {
|
||||||
if (messagesContainer.value) {
|
if (messagesContainer.value) {
|
||||||
messagesContainer.value.scrollTo({
|
messagesContainer.value.scrollTo({
|
||||||
|
|
@ -240,6 +245,15 @@ const scrollToBottom = () => {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 处理滚动事件,检测用户是否手动滚动
|
||||||
|
const handleScroll = () => {
|
||||||
|
if (!messagesContainer.value) return
|
||||||
|
const { scrollTop, scrollHeight, clientHeight } = messagesContainer.value
|
||||||
|
const distanceToBottom = scrollHeight - scrollTop - clientHeight
|
||||||
|
// 距离底部超过50px时停止自动跟随
|
||||||
|
autoScroll.value = distanceToBottom < 50
|
||||||
|
}
|
||||||
|
|
||||||
// 监听流式消息变化,自动滚动
|
// 监听流式消息变化,自动滚动
|
||||||
watch(() => streamingMessage.value?.process_steps?.length, () => {
|
watch(() => streamingMessage.value?.process_steps?.length, () => {
|
||||||
if (streamingMessage.value) {
|
if (streamingMessage.value) {
|
||||||
|
|
|
||||||
|
|
@ -102,6 +102,11 @@
|
||||||
<label>模型名称</label>
|
<label>模型名称</label>
|
||||||
<input v-model="form.default_model" placeholder="deepseek-chat / gpt-4" required />
|
<input v-model="form.default_model" placeholder="deepseek-chat / gpt-4" required />
|
||||||
</div>
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<label>最大 Tokens</label>
|
||||||
|
<input v-model.number="form.max_tokens" type="number" placeholder="8192" min="1" />
|
||||||
|
<span class="hint">单次回复最大 token 数,默认 8192</span>
|
||||||
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label class="switch-card" :class="{ active: form.is_default }">
|
<label class="switch-card" :class="{ active: form.is_default }">
|
||||||
<div class="switch-content">
|
<div class="switch-content">
|
||||||
|
|
@ -201,7 +206,7 @@ const testResult = ref(null)
|
||||||
const formError = ref('')
|
const formError = ref('')
|
||||||
|
|
||||||
const form = ref({
|
const form = ref({
|
||||||
name: '', base_url: '', api_key: '', default_model: '', is_default: false
|
name: '', base_url: '', api_key: '', default_model: '', max_tokens: 8192, is_default: false
|
||||||
})
|
})
|
||||||
|
|
||||||
const fetchProviders = async () => {
|
const fetchProviders = async () => {
|
||||||
|
|
@ -218,7 +223,7 @@ const fetchProviders = async () => {
|
||||||
const closeModal = () => {
|
const closeModal = () => {
|
||||||
showModal.value = false
|
showModal.value = false
|
||||||
editing.value = null
|
editing.value = null
|
||||||
form.value = { name: '', base_url: '', api_key: '', default_model: '', is_default: false }
|
form.value = { name: '', base_url: '', api_key: '', default_model: '', max_tokens: 8192, is_default: false }
|
||||||
formError.value = ''
|
formError.value = ''
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -232,6 +237,7 @@ const editProvider = async (p) => {
|
||||||
base_url: res.data.base_url,
|
base_url: res.data.base_url,
|
||||||
api_key: res.data.api_key || '',
|
api_key: res.data.api_key || '',
|
||||||
default_model: res.data.default_model,
|
default_model: res.data.default_model,
|
||||||
|
max_tokens: res.data.max_tokens || 8192,
|
||||||
is_default: res.data.is_default
|
is_default: res.data.is_default
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -381,6 +387,7 @@ input:checked + .slider:before { transform: translateX(22px); }
|
||||||
.switch-card input:checked + .slider { background-color: var(--accent); }
|
.switch-card input:checked + .slider { background-color: var(--accent); }
|
||||||
.switch-card input:checked + .slider:before { transform: translateX(22px); }
|
.switch-card input:checked + .slider:before { transform: translateX(22px); }
|
||||||
.modal-actions { display: flex; justify-content: flex-end; gap: 1rem; margin-top: 1.5rem; }
|
.modal-actions { display: flex; justify-content: flex-end; gap: 1rem; margin-top: 1.5rem; }
|
||||||
|
.form-group .hint { font-size: 0.85rem; color: var(--text); margin-top: 4px; display: block; }
|
||||||
.spinner { width: 48px; height: 48px; border: 4px solid var(--border); border-top-color: var(--accent); border-radius: 50%; animation: spin 1s linear infinite; margin: 0 auto 1rem; }
|
.spinner { width: 48px; height: 48px; border: 4px solid var(--border); border-top-color: var(--accent); border-radius: 50%; animation: spin 1s linear infinite; margin: 0 auto 1rem; }
|
||||||
@keyframes spin { to { transform: rotate(360deg); } }
|
@keyframes spin { to { transform: rotate(360deg); } }
|
||||||
</style>
|
</style>
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,10 @@ from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
from luxx.database import Base
|
from luxx.database import Base
|
||||||
|
|
||||||
|
|
||||||
|
def local_now():
|
||||||
|
return datetime.now()
|
||||||
|
|
||||||
|
|
||||||
class LLMProvider(Base):
|
class LLMProvider(Base):
|
||||||
"""LLM Provider configuration model"""
|
"""LLM Provider configuration model"""
|
||||||
__tablename__ = "llm_providers"
|
__tablename__ = "llm_providers"
|
||||||
|
|
@ -18,10 +22,11 @@ class LLMProvider(Base):
|
||||||
base_url: Mapped[str] = mapped_column(String(500), nullable=False)
|
base_url: Mapped[str] = mapped_column(String(500), nullable=False)
|
||||||
api_key: Mapped[str] = mapped_column(String(500), nullable=False)
|
api_key: Mapped[str] = mapped_column(String(500), nullable=False)
|
||||||
default_model: Mapped[str] = mapped_column(String(100), nullable=False, default="gpt-4")
|
default_model: Mapped[str] = mapped_column(String(100), nullable=False, default="gpt-4")
|
||||||
|
max_tokens: Mapped[int] = mapped_column(Integer, default=8192) # 默认 8192
|
||||||
is_default: Mapped[bool] = mapped_column(Boolean, default=False)
|
is_default: Mapped[bool] = mapped_column(Boolean, default=False)
|
||||||
enabled: Mapped[bool] = mapped_column(Boolean, default=True)
|
enabled: Mapped[bool] = mapped_column(Boolean, default=True)
|
||||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
|
||||||
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
|
||||||
|
|
||||||
# Relationships
|
# Relationships
|
||||||
user: Mapped["User"] = relationship("User", backref="llm_providers")
|
user: Mapped["User"] = relationship("User", backref="llm_providers")
|
||||||
|
|
@ -35,6 +40,7 @@ class LLMProvider(Base):
|
||||||
"provider_type": self.provider_type,
|
"provider_type": self.provider_type,
|
||||||
"base_url": self.base_url,
|
"base_url": self.base_url,
|
||||||
"default_model": self.default_model,
|
"default_model": self.default_model,
|
||||||
|
"max_tokens": self.max_tokens,
|
||||||
"is_default": self.is_default,
|
"is_default": self.is_default,
|
||||||
"enabled": self.enabled,
|
"enabled": self.enabled,
|
||||||
"created_at": self.created_at.isoformat() if self.created_at else None,
|
"created_at": self.created_at.isoformat() if self.created_at else None,
|
||||||
|
|
@ -53,8 +59,8 @@ class Project(Base):
|
||||||
user_id: Mapped[int] = mapped_column(Integer, ForeignKey("users.id"), nullable=False)
|
user_id: Mapped[int] = mapped_column(Integer, ForeignKey("users.id"), nullable=False)
|
||||||
name: Mapped[str] = mapped_column(String(255), nullable=False)
|
name: Mapped[str] = mapped_column(String(255), nullable=False)
|
||||||
description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
|
description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
|
||||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
|
||||||
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
|
||||||
|
|
||||||
# Relationships
|
# Relationships
|
||||||
user: Mapped["User"] = relationship("User", backref="projects")
|
user: Mapped["User"] = relationship("User", backref="projects")
|
||||||
|
|
@ -70,7 +76,7 @@ class User(Base):
|
||||||
password_hash: Mapped[Optional[str]] = mapped_column(String(255), nullable=True)
|
password_hash: Mapped[Optional[str]] = mapped_column(String(255), nullable=True)
|
||||||
role: Mapped[str] = mapped_column(String(20), default="user")
|
role: Mapped[str] = mapped_column(String(20), default="user")
|
||||||
is_active: Mapped[bool] = mapped_column(Boolean, default=True)
|
is_active: Mapped[bool] = mapped_column(Boolean, default=True)
|
||||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
|
||||||
|
|
||||||
# Relationships
|
# Relationships
|
||||||
conversations: Mapped[List["Conversation"]] = relationship(
|
conversations: Mapped[List["Conversation"]] = relationship(
|
||||||
|
|
@ -102,8 +108,8 @@ class Conversation(Base):
|
||||||
temperature: Mapped[float] = mapped_column(Float, default=0.7)
|
temperature: Mapped[float] = mapped_column(Float, default=0.7)
|
||||||
max_tokens: Mapped[int] = mapped_column(Integer, default=2000)
|
max_tokens: Mapped[int] = mapped_column(Integer, default=2000)
|
||||||
thinking_enabled: Mapped[bool] = mapped_column(Boolean, default=False)
|
thinking_enabled: Mapped[bool] = mapped_column(Boolean, default=False)
|
||||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
|
||||||
updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
updated_at: Mapped[datetime] = mapped_column(DateTime, default=local_now, onupdate=local_now)
|
||||||
|
|
||||||
# Relationships
|
# Relationships
|
||||||
user: Mapped["User"] = relationship("User", back_populates="conversations")
|
user: Mapped["User"] = relationship("User", back_populates="conversations")
|
||||||
|
|
@ -161,7 +167,7 @@ class Message(Base):
|
||||||
role: Mapped[str] = mapped_column(String(16), nullable=False) # user, assistant, system, tool
|
role: Mapped[str] = mapped_column(String(16), nullable=False) # user, assistant, system, tool
|
||||||
content: Mapped[str] = mapped_column(Text, nullable=False, default="")
|
content: Mapped[str] = mapped_column(Text, nullable=False, default="")
|
||||||
token_count: Mapped[int] = mapped_column(Integer, default=0)
|
token_count: Mapped[int] = mapped_column(Integer, default=0)
|
||||||
created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
created_at: Mapped[datetime] = mapped_column(DateTime, default=local_now)
|
||||||
|
|
||||||
# Relationships
|
# Relationships
|
||||||
conversation: Mapped["Conversation"] = relationship("Conversation", back_populates="messages")
|
conversation: Mapped["Conversation"] = relationship("Conversation", back_populates="messages")
|
||||||
|
|
|
||||||
|
|
@ -79,8 +79,8 @@ def send_message(
|
||||||
)
|
)
|
||||||
db.add(user_message)
|
db.add(user_message)
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime, timezone, timedelta
|
||||||
conversation.updated_at = datetime.utcnow()
|
conversation.updated_at = datetime.now(timezone(timedelta(hours=8)))
|
||||||
|
|
||||||
response = chat_service.non_stream_response(
|
response = chat_service.non_stream_response(
|
||||||
conversation=conversation,
|
conversation=conversation,
|
||||||
|
|
@ -133,7 +133,7 @@ async def stream_message(
|
||||||
token_count=len(data.content) // 4
|
token_count=len(data.content) // 4
|
||||||
)
|
)
|
||||||
db.add(user_message)
|
db.add(user_message)
|
||||||
conversation.updated_at = datetime.utcnow()
|
conversation.updated_at = datetime.now()
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
async def event_generator():
|
async def event_generator():
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,8 @@ def _sse_event(event: str, data: dict) -> str:
|
||||||
|
|
||||||
|
|
||||||
def get_llm_client(conversation: Conversation = None):
|
def get_llm_client(conversation: Conversation = None):
|
||||||
"""Get LLM client, optionally using conversation's provider"""
|
"""Get LLM client, optionally using conversation's provider. Returns (client, max_tokens)"""
|
||||||
|
max_tokens = None
|
||||||
if conversation and conversation.provider_id:
|
if conversation and conversation.provider_id:
|
||||||
from luxx.models import LLMProvider
|
from luxx.models import LLMProvider
|
||||||
from luxx.database import SessionLocal
|
from luxx.database import SessionLocal
|
||||||
|
|
@ -28,18 +29,19 @@ def get_llm_client(conversation: Conversation = None):
|
||||||
try:
|
try:
|
||||||
provider = db.query(LLMProvider).filter(LLMProvider.id == conversation.provider_id).first()
|
provider = db.query(LLMProvider).filter(LLMProvider.id == conversation.provider_id).first()
|
||||||
if provider:
|
if provider:
|
||||||
|
max_tokens = provider.max_tokens
|
||||||
client = LLMClient(
|
client = LLMClient(
|
||||||
api_key=provider.api_key,
|
api_key=provider.api_key,
|
||||||
api_url=provider.base_url,
|
api_url=provider.base_url,
|
||||||
model=provider.default_model
|
model=provider.default_model
|
||||||
)
|
)
|
||||||
return client
|
return client, max_tokens
|
||||||
finally:
|
finally:
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
# Fallback to global config
|
# Fallback to global config
|
||||||
client = LLMClient()
|
client = LLMClient()
|
||||||
return client
|
return client, max_tokens
|
||||||
|
|
||||||
|
|
||||||
class ChatService:
|
class ChatService:
|
||||||
|
|
@ -112,8 +114,10 @@ class ChatService:
|
||||||
|
|
||||||
tools = registry.list_all() if tools_enabled else None
|
tools = registry.list_all() if tools_enabled else None
|
||||||
|
|
||||||
llm = get_llm_client(conversation)
|
llm, provider_max_tokens = get_llm_client(conversation)
|
||||||
model = conversation.model or llm.default_model or "gpt-4"
|
model = conversation.model or llm.default_model or "gpt-4"
|
||||||
|
# 使用 provider 的 max_tokens,如果 conversation 有自己的 max_tokens 则覆盖
|
||||||
|
max_tokens = conversation.max_tokens if hasattr(conversation, 'max_tokens') and conversation.max_tokens else provider_max_tokens
|
||||||
|
|
||||||
# State tracking
|
# State tracking
|
||||||
all_steps = []
|
all_steps = []
|
||||||
|
|
@ -146,7 +150,7 @@ class ChatService:
|
||||||
messages=messages,
|
messages=messages,
|
||||||
tools=tools,
|
tools=tools,
|
||||||
temperature=conversation.temperature,
|
temperature=conversation.temperature,
|
||||||
max_tokens=conversation.max_tokens
|
max_tokens=max_tokens or 8192
|
||||||
):
|
):
|
||||||
# Parse SSE line
|
# Parse SSE line
|
||||||
# Format: "event: xxx\ndata: {...}\n\n"
|
# Format: "event: xxx\ndata: {...}\n\n"
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,11 @@
|
||||||
"""Web crawler tools"""
|
"""Crawler related tools"""
|
||||||
import requests
|
|
||||||
from typing import Dict, Any, List, Optional
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
from luxx.tools.factory import tool
|
from luxx.tools.factory import tool
|
||||||
|
from luxx.tools.services import SearchService, FetchService
|
||||||
|
|
||||||
|
|
||||||
@tool(
|
@tool(
|
||||||
name="web_search",
|
name="web_search",
|
||||||
description="Search the internet for information using web search",
|
description="Search the internet for information. Use when you need to find latest news or answer questions.",
|
||||||
parameters={
|
parameters={
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
|
@ -18,7 +15,7 @@ from luxx.tools.factory import tool
|
||||||
},
|
},
|
||||||
"max_results": {
|
"max_results": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Maximum number of results to return",
|
"description": "Number of results to return, default 5",
|
||||||
"default": 5
|
"default": 5
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
@ -26,164 +23,106 @@ from luxx.tools.factory import tool
|
||||||
},
|
},
|
||||||
category="crawler"
|
category="crawler"
|
||||||
)
|
)
|
||||||
def web_search(arguments: Dict[str, Any]) -> Dict[str, Any]:
|
def web_search(arguments: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
Execute web search
|
Web search tool using DuckDuckGo
|
||||||
|
|
||||||
Note: This is a placeholder implementation, real usage requires integrating with actual search APIs
|
|
||||||
such as: Google Custom Search, DuckDuckGo, SerpAPI, etc.
|
|
||||||
"""
|
"""
|
||||||
query = arguments.get("query", "")
|
query = arguments["query"]
|
||||||
max_results = arguments.get("max_results", 5)
|
max_results = arguments.get("max_results", 5)
|
||||||
|
|
||||||
if not query:
|
service = SearchService()
|
||||||
return {"success": False, "error": "Query is required"}
|
results = service.search(query, max_results)
|
||||||
|
|
||||||
# Simulated search results
|
if not results:
|
||||||
# Real implementation should integrate with actual search API
|
return {"success": True, "data": {"query": query, "results": []}, "message": "No results found"}
|
||||||
return {
|
|
||||||
"success": True,
|
return {"success": True, "data": {"query": query, "results": results}}
|
||||||
"data": {
|
|
||||||
"query": query,
|
|
||||||
"results": [
|
|
||||||
{
|
|
||||||
"title": f"Result for '{query}' - Example {i+1}",
|
|
||||||
"url": f"https://example.com/result_{i+1}",
|
|
||||||
"snippet": f"This is a sample search result for the query '{query}'. " * 3
|
|
||||||
}
|
|
||||||
for i in range(min(max_results, 5))
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@tool(
|
@tool(
|
||||||
name="web_fetch",
|
name="web_fetch",
|
||||||
description="Fetch and parse content from a web page",
|
description="Fetch content from a webpage. Use when user needs detailed information from a page.",
|
||||||
parameters={
|
parameters={
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"url": {
|
"url": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "URL of the web page to fetch"
|
"description": "URL of the webpage to fetch"
|
||||||
},
|
},
|
||||||
"extract_text": {
|
"extract_type": {
|
||||||
"type": "boolean",
|
"type": "string",
|
||||||
"description": "Whether to extract text content only",
|
"description": "Extraction type: text, links, or structured",
|
||||||
"default": True
|
"enum": ["text", "links", "structured"],
|
||||||
|
"default": "text"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": ["url"]
|
"required": ["url"]
|
||||||
},
|
},
|
||||||
category="crawler"
|
category="crawler"
|
||||||
)
|
)
|
||||||
def web_fetch(arguments: Dict[str, Any]) -> Dict[str, Any]:
|
def web_fetch(arguments: dict) -> dict:
|
||||||
"""Fetch and parse web page content"""
|
"""
|
||||||
url = arguments.get("url", "")
|
Page fetch tool
|
||||||
extract_text = arguments.get("extract_text", True)
|
"""
|
||||||
|
url = arguments["url"]
|
||||||
|
extract_type = arguments.get("extract_type", "text")
|
||||||
|
|
||||||
if not url:
|
if not url:
|
||||||
return {"success": False, "error": "URL is required"}
|
return {"success": False, "error": "URL is required"}
|
||||||
|
|
||||||
# Simple URL validation
|
service = FetchService(timeout=15)
|
||||||
if not url.startswith(("http://", "https://")):
|
result = service.fetch(url, extract_type)
|
||||||
url = "https://" + url
|
|
||||||
|
|
||||||
try:
|
if "error" in result:
|
||||||
headers = {
|
return {"success": False, "error": result["error"]}
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
|
||||||
}
|
|
||||||
response = requests.get(url, headers=headers, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
if extract_text:
|
return {"success": True, "data": result}
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
|
||||||
# Remove script and style tags
|
|
||||||
for tag in soup(["script", "style"]):
|
|
||||||
tag.decompose()
|
|
||||||
text = soup.get_text(separator="\n", strip=True)
|
|
||||||
# Clean up extra blank lines
|
|
||||||
lines = [line.strip() for line in text.split("\n") if line.strip()]
|
|
||||||
text = "\n".join(lines)
|
|
||||||
|
|
||||||
return {
|
|
||||||
"success": True,
|
|
||||||
"data": {
|
|
||||||
"url": url,
|
|
||||||
"title": soup.title.string if soup.title else "",
|
|
||||||
"content": text[:10000] # Limit content length
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
return {
|
|
||||||
"success": True,
|
|
||||||
"data": {
|
|
||||||
"url": url,
|
|
||||||
"html": response.text[:50000] # Limit HTML length
|
|
||||||
}
|
|
||||||
}
|
|
||||||
except requests.RequestException as e:
|
|
||||||
return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}
|
|
||||||
|
|
||||||
|
|
||||||
@tool(
|
@tool(
|
||||||
name="extract_links",
|
name="batch_fetch",
|
||||||
description="Extract all links from a web page",
|
description="Batch fetch multiple webpages. Use when you need to get content from multiple pages.",
|
||||||
parameters={
|
parameters={
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"url": {
|
"urls": {
|
||||||
"type": "string",
|
"type": "array",
|
||||||
"description": "URL of the web page"
|
"items": {"type": "string"},
|
||||||
|
"description": "List of URLs to fetch"
|
||||||
},
|
},
|
||||||
"max_links": {
|
"extract_type": {
|
||||||
"type": "integer",
|
"type": "string",
|
||||||
"description": "Maximum number of links to extract",
|
"enum": ["text", "links", "structured"],
|
||||||
"default": 20
|
"default": "text"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": ["url"]
|
"required": ["urls"]
|
||||||
},
|
},
|
||||||
category="crawler"
|
category="crawler"
|
||||||
)
|
)
|
||||||
def extract_links(arguments: Dict[str, Any]) -> Dict[str, Any]:
|
def batch_fetch(arguments: dict) -> dict:
|
||||||
"""Extract all links from a web page"""
|
"""
|
||||||
url = arguments.get("url", "")
|
Batch fetch tool
|
||||||
max_links = arguments.get("max_links", 20)
|
"""
|
||||||
|
urls = arguments["urls"]
|
||||||
|
extract_type = arguments.get("extract_type", "text")
|
||||||
|
|
||||||
if not url:
|
if not urls:
|
||||||
return {"success": False, "error": "URL is required"}
|
return {"success": False, "error": "URLs list is required"}
|
||||||
|
|
||||||
if not url.startswith(("http://", "https://")):
|
if len(urls) > 10:
|
||||||
url = "https://" + url
|
return {"success": False, "error": "Maximum 10 pages allowed"}
|
||||||
|
|
||||||
try:
|
service = FetchService(timeout=10)
|
||||||
headers = {
|
results = service.fetch_batch(urls, extract_type)
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
|
||||||
|
successful = sum(1 for r in results if "error" not in r)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"data": {
|
||||||
|
"results": results,
|
||||||
|
"total": len(results),
|
||||||
|
"successful": successful
|
||||||
}
|
}
|
||||||
response = requests.get(url, headers=headers, timeout=10)
|
}
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
|
||||||
links = []
|
|
||||||
|
|
||||||
for a_tag in soup.find_all("a", href=True)[:max_links]:
|
|
||||||
href = a_tag["href"]
|
|
||||||
# Handle relative URLs
|
|
||||||
if href.startswith("/"):
|
|
||||||
from urllib.parse import urljoin
|
|
||||||
href = urljoin(url, href)
|
|
||||||
links.append({
|
|
||||||
"text": a_tag.get_text(strip=True) or href,
|
|
||||||
"url": href
|
|
||||||
})
|
|
||||||
|
|
||||||
return {
|
|
||||||
"success": True,
|
|
||||||
"data": {
|
|
||||||
"url": url,
|
|
||||||
"links": links
|
|
||||||
}
|
|
||||||
}
|
|
||||||
except requests.RequestException as e:
|
|
||||||
return {"success": False, "error": f"Failed to fetch URL: {str(e)}"}
|
|
||||||
|
|
|
||||||
|
|
@ -156,7 +156,7 @@ class ToolExecutor:
|
||||||
"tool_call_id": call_id,
|
"tool_call_id": call_id,
|
||||||
"role": "tool",
|
"role": "tool",
|
||||||
"name": name,
|
"name": name,
|
||||||
"content": json.dumps(result)
|
"content": json.dumps(result, ensure_ascii=False)
|
||||||
}
|
}
|
||||||
|
|
||||||
def _create_error_result(self, call_id: str, name: str, error: str) -> Dict[str, Any]:
|
def _create_error_result(self, call_id: str, name: str, error: str) -> Dict[str, Any]:
|
||||||
|
|
@ -165,7 +165,7 @@ class ToolExecutor:
|
||||||
"tool_call_id": call_id,
|
"tool_call_id": call_id,
|
||||||
"role": "tool",
|
"role": "tool",
|
||||||
"name": name,
|
"name": name,
|
||||||
"content": json.dumps({"success": False, "error": error})
|
"content": json.dumps({"success": False, "error": error}, ensure_ascii=False)
|
||||||
}
|
}
|
||||||
|
|
||||||
def clear_cache(self) -> None:
|
def clear_cache(self) -> None:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,247 @@
|
||||||
|
"""Tool helper services"""
|
||||||
|
import re
|
||||||
|
import httpx
|
||||||
|
from urllib.parse import parse_qs, urlparse
|
||||||
|
from typing import List
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
|
||||||
|
class SearchService:
    """Web search service backed by DuckDuckGo's HTML endpoint."""

    def __init__(self, engine: str = "duckduckgo"):
        # Name of the backend; only "duckduckgo" is implemented.
        self.engine = engine

    def search(
        self,
        query: str,
        max_results: int = 5,
        region: str = "cn-zh"
    ) -> List[dict]:
        """
        Execute a search.

        Args:
            query: Search keywords.
            max_results: Maximum number of results to return.
            region: Region code forwarded to the search engine.

        Returns:
            A list of ``{"title", "url", "snippet"}`` dicts; empty when the
            request fails or nothing matches.

        Raises:
            ValueError: If ``self.engine`` is not a supported engine.
        """
        if self.engine == "duckduckgo":
            return self._search_duckduckgo(query, max_results, region)
        raise ValueError(f"Unsupported search engine: {self.engine}")

    def _search_duckduckgo(
        self,
        query: str,
        max_results: int,
        region: str
    ) -> List[dict]:
        """Query DuckDuckGo's HTML page and scrape the result entries."""
        # Nothing to look up: avoid a pointless network round trip, and avoid
        # the surprising behaviour of a negative slice bound.
        if not query or not query.strip() or max_results <= 0:
            return []

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "text/html,application/xhtml+xml",
        }

        # Let httpx do the URL encoding. `kl` is DuckDuckGo's region parameter;
        # previously `region` was accepted but never sent.
        params = {"q": query}
        if region:
            params["kl"] = region

        try:
            resp = httpx.get(
                "https://html.duckduckgo.com/html/",
                params=params,
                headers=headers,
                timeout=15,
                follow_redirects=True,
            )
            resp.raise_for_status()
        except Exception:
            # Best effort: callers treat an empty list as "no results".
            return []

        from bs4 import BeautifulSoup
        soup = BeautifulSoup(resp.text, "html.parser")
        results = []

        for result in soup.select(".result"):
            title_elem = result.select_one(".result__title a")
            if not title_elem:
                # Entries without a title link must not consume a result slot.
                continue
            snippet_elem = result.select_one(".result__snippet")

            raw_url = title_elem.get("href", "")
            # DuckDuckGo wraps targets in a redirect link; the real URL is
            # carried (percent-encoded) in the `uddg` query parameter.
            if "uddg=" in raw_url:
                query_args = parse_qs(urlparse(raw_url).query)
                clean_url = query_args.get("uddg", [raw_url])[0]
            else:
                clean_url = raw_url

            results.append({
                "title": title_elem.get_text(strip=True),
                "url": clean_url,
                "snippet": snippet_elem.get_text(strip=True) if snippet_elem else ""
            })
            if len(results) >= max_results:
                break

        return results
|
||||||
|
|
||||||
|
|
||||||
|
class FetchService:
    """Page fetch service: downloads pages with httpx and extracts content."""

    def __init__(self, timeout: float = 15.0):
        # Per-request timeout passed to httpx.
        self.timeout = timeout
        # Browser-like User-Agent sent with every request.
        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )

    def fetch(
        self,
        url: str,
        extract_type: str = "text"
    ) -> dict:
        """
        Fetch a single page.

        Args:
            url: Page URL; ``https://`` is prepended when no scheme is given.
            extract_type: ``"text"``, ``"links"``; any other value yields the
                structured extraction.

        Returns:
            The extraction result, or ``{"error": ..., "url": ...}`` when the
            request or the extraction fails.
        """
        url = url.strip()
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        try:
            resp = httpx.get(
                url,
                timeout=self.timeout,
                follow_redirects=True,
                headers={"User-Agent": self.user_agent}
            )
            resp.raise_for_status()
        except httpx.TimeoutException:
            return {"error": "Request timeout", "url": url}
        except Exception as e:
            return {"error": str(e), "url": url}

        # Extraction is guarded too, so a parse failure is reported through
        # the same error dict callers already check for instead of raising.
        try:
            extractor = ContentExtractor(resp.text)

            if extract_type == "text":
                return {
                    "url": url,
                    "title": extractor.extract_title(),
                    "text": extractor.extract_text()[:15000]
                }
            if extract_type == "links":
                return {
                    "url": url,
                    "links": extractor.extract_links()
                }
            return extractor.extract_structured(url)
        except Exception as e:
            return {"error": str(e), "url": url}

    def _fetch_safely(self, url: str, extract_type: str) -> dict:
        """Run ``fetch`` and convert any escaping exception into an error dict."""
        try:
            return self.fetch(url, extract_type)
        except Exception as e:
            # Keep the URL so the caller can tell which page failed.
            return {"error": str(e), "url": url}

    def fetch_batch(
        self,
        urls: List[str],
        extract_type: str = "text",
        max_concurrent: int = 5
    ) -> List[dict]:
        """
        Batch fetch pages concurrently.

        Args:
            urls: URL list
            extract_type: Extract type
            max_concurrent: Max concurrent requests (clamped to 1-5, default 5)

        Returns:
            Result list (same order as input URLs). A failed page is
            represented by a dict containing ``"error"`` and ``"url"``.
        """
        if not urls:
            return []
        if len(urls) == 1:
            # No point spinning up a thread pool for a single page.
            return [self._fetch_safely(urls[0], extract_type)]

        workers = min(max(max_concurrent, 1), 5)
        with ThreadPoolExecutor(max_workers=workers) as pool:
            # Executor.map yields results in input order.
            return list(pool.map(lambda u: self._fetch_safely(u, extract_type), urls))
|
||||||
|
|
||||||
|
|
||||||
|
class ContentExtractor:
    """Content extractor using BeautifulSoup."""

    # Elements whose content is treated as page chrome rather than body text.
    _NOISE_TAGS = ["script", "style", "nav", "footer", "header", "aside"]

    def __init__(self, html: str):
        self.html = html
        # Parsed lazily on first access of `soup`.
        self._soup = None

    @property
    def soup(self):
        """Parsed document, built on first use and then cached."""
        if self._soup is None:
            from bs4 import BeautifulSoup
            self._soup = BeautifulSoup(self.html, "html.parser")
        return self._soup

    def extract_title(self) -> str:
        """Extract page title ("" when there is no usable <title>)."""
        if self.soup.title:
            return self.soup.title.string or ""
        return ""

    def extract_text(self) -> str:
        """Extract plain text with scripts, styles and page chrome removed."""
        from bs4 import BeautifulSoup
        # Work on a private parse: decompose() destroys nodes, and doing that
        # on the cached soup would make later extract_links()/extract_title()
        # calls silently miss everything inside nav/header/footer/aside.
        scratch = BeautifulSoup(self.html, "html.parser")
        for tag in scratch(self._NOISE_TAGS):
            tag.decompose()

        text = scratch.get_text(separator="\n", strip=True)
        # Clean extra whitespace
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text

    def extract_links(self, max_count: int = 50) -> List[dict]:
        """
        Extract up to ``max_count`` links as ``{"text", "url"}`` dicts.

        Anchors without visible text and in-page/script/mail/phone targets
        are skipped. A non-positive ``max_count`` yields an empty list.
        """
        if max_count <= 0:
            return []

        links = []
        for a in self.soup.find_all("a", href=True):
            text = a.get_text(strip=True)
            href = a["href"]
            if text and href and not href.startswith(("#", "javascript:", "mailto:", "tel:")):
                links.append({"text": text, "url": href})
                if len(links) >= max_count:
                    break
        return links

    def extract_structured(self, url: str = "") -> dict:
        """Extract title, meta description, text (first 5000 chars) and up to 20 links."""
        description = ""
        meta_desc = self.soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            description = meta_desc.get("content", "")

        return {
            "url": url,
            "title": self.extract_title().strip(),
            "description": description.strip(),
            "text": self.extract_text()[:5000],
            "links": self.extract_links(20)
        }
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
name = "luxx"
|
name = "luxx"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
description = "luxx - FastAPI + SQLAlchemy"
|
description = "luxx - FastAPI + SQLAlchemy"
|
||||||
readme = "docs/README.md"
|
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.10"
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
|
@ -19,6 +18,7 @@ dependencies = [
|
||||||
"requests>=2.31.0",
|
"requests>=2.31.0",
|
||||||
"beautifulsoup4>=4.12.3",
|
"beautifulsoup4>=4.12.3",
|
||||||
"lxml>=5.1.0",
|
"lxml>=5.1.0",
|
||||||
|
"httpx>=0.26.0",
|
||||||
"pyyaml>=6.0.1",
|
"pyyaml>=6.0.1",
|
||||||
"shortuuid>=1.0.11",
|
"shortuuid>=1.0.11",
|
||||||
"pydantic>=2.5.0",
|
"pydantic>=2.5.0",
|
||||||
|
|
@ -34,3 +34,6 @@ dev = [
|
||||||
"black>=24.0.0",
|
"black>=24.0.0",
|
||||||
"ruff>=0.1.0",
|
"ruff>=0.1.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[tool.setuptools]
|
||||||
|
packages = ["luxx"]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue