refactor: 精简项目代码
This commit is contained in:
parent
a84b8617a6
commit
f10c5de950
|
|
@ -1,6 +1,6 @@
|
|||
<script setup>
|
||||
import { ref } from 'vue'
|
||||
import { useAuth } from './composables/useAuth.js'
|
||||
import { useAuth } from './utils/useAuth.js'
|
||||
import AppSidebar from './components/AppSidebar.vue'
|
||||
|
||||
const { isLoggedIn } = useAuth()
|
||||
|
|
|
|||
|
|
@ -9,7 +9,6 @@
|
|||
<span class="step-label">思考中</span>
|
||||
<span class="step-brief">{{ item.brief || '正在思考...' }}</span>
|
||||
<span v-if="streaming && item.key === lastThinkingKey" class="loading-dots">...</span>
|
||||
<span v-else-if="item.content && item.content.length > 1024" class="truncate-hint">已截断</span>
|
||||
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
|
||||
</div>
|
||||
<div v-if="expandedKeys.has(item.key)" class="step-content">
|
||||
|
|
@ -26,7 +25,6 @@
|
|||
<span v-if="item.loading" class="loading-dots">...</span>
|
||||
<span v-else-if="item.isSuccess === true" class="step-badge success">成功</span>
|
||||
<span v-else-if="item.isSuccess === false" class="step-badge error">失败</span>
|
||||
<span v-if="item.fullResult && item.fullResult.length > 1024" class="truncate-hint">已截断</span>
|
||||
<span class="arrow" :class="{ open: expandedKeys.has(item.key) }" v-html="chevronDown"></span>
|
||||
</div>
|
||||
<div v-if="expandedKeys.has(item.key)" class="step-content">
|
||||
|
|
@ -79,7 +77,7 @@ const allItems = computed(() => {
|
|||
type: 'thinking',
|
||||
index: step.index,
|
||||
content: content,
|
||||
displayContent: content.length > 1024 ? content.slice(0, 1024) + '\n\n[... 内容已截断 ...]' : content,
|
||||
displayContent: content.length > 1024 ? content.slice(0, 1024) + '...' : content,
|
||||
brief: content.slice(0, 50) + (content.length > 50 ? '...' : ''),
|
||||
})
|
||||
} else if (step.type === 'tool_call') {
|
||||
|
|
@ -104,7 +102,7 @@ const allItems = computed(() => {
|
|||
const resultContent = step.content || ''
|
||||
match.resultSummary = resultContent.slice(0, 200)
|
||||
match.fullResult = resultContent
|
||||
match.displayResult = resultContent.length > 1024 ? resultContent.slice(0, 1024) + '\n\n[... 结果已截断 ...]' : resultContent
|
||||
match.displayResult = resultContent.length > 1024 ? resultContent.slice(0, 1024) + '...' : resultContent
|
||||
match.isSuccess = step.success !== false
|
||||
match.loading = false
|
||||
} else {
|
||||
|
|
@ -122,7 +120,7 @@ const allItems = computed(() => {
|
|||
isSuccess: true,
|
||||
resultSummary: placeholderContent.slice(0, 200),
|
||||
fullResult: placeholderContent,
|
||||
displayResult: placeholderContent.length > 1024 ? placeholderContent.slice(0, 1024) + '\n\n[... 结果已截断 ...]' : placeholderContent
|
||||
displayResult: placeholderContent.length > 1024 ? placeholderContent.slice(0, 1024) + '...' : placeholderContent
|
||||
})
|
||||
}
|
||||
} else if (step.type === 'text') {
|
||||
|
|
@ -288,15 +286,6 @@ const sparkleIcon = `<svg viewBox="0 0 24 24" width="14" height="14" fill="none"
|
|||
color: var(--success-color);
|
||||
}
|
||||
|
||||
.truncate-hint {
|
||||
font-size: 10px;
|
||||
padding: 2px 6px;
|
||||
background: var(--warning-bg);
|
||||
color: var(--warning-color);
|
||||
border-radius: 4px;
|
||||
margin-left: 4px;
|
||||
}
|
||||
|
||||
.step-badge.error {
|
||||
background: var(--danger-bg);
|
||||
color: var(--danger-color);
|
||||
|
|
|
|||
|
|
@ -1,5 +0,0 @@
|
|||
// 导出所有组合式函数
|
||||
export { useAuth } from './useAuth.js'
|
||||
export { useApi, usePagination, useForm } from './useApi.js'
|
||||
export { formatDate, formatNumber, truncate } from './useFormatters.js'
|
||||
export { debounce, throttle, storage, copyToClipboard } from './useUtils.js'
|
||||
|
|
@ -2,7 +2,7 @@ import { createApp } from 'vue'
|
|||
import './style.css'
|
||||
import App from './App.vue'
|
||||
import router from './router'
|
||||
import pinia from './stores'
|
||||
import { pinia } from './utils'
|
||||
|
||||
createApp(App)
|
||||
.use(router)
|
||||
|
|
|
|||
|
|
@ -1,60 +0,0 @@
|
|||
import { defineStore } from 'pinia'
|
||||
import api from '../services/api'
|
||||
|
||||
export const useAuthStore = defineStore('auth', {
|
||||
state: () => ({
|
||||
user: null,
|
||||
token: localStorage.getItem('access_token') || null,
|
||||
isAuthenticated: !!localStorage.getItem('access_token')
|
||||
}),
|
||||
|
||||
actions: {
|
||||
async login(credentials) {
|
||||
try {
|
||||
const response = await api.post('/auth/login', credentials)
|
||||
this.token = response.data.access_token
|
||||
this.user = response.data.user
|
||||
this.isAuthenticated = true
|
||||
localStorage.setItem('access_token', this.token)
|
||||
return { success: true }
|
||||
} catch (error) {
|
||||
return { success: false, error: error.message }
|
||||
}
|
||||
},
|
||||
|
||||
async register(userData) {
|
||||
try {
|
||||
const response = await api.post('/auth/register', userData)
|
||||
// 注册后自动登录
|
||||
return this.login({ username: userData.username, password: userData.password })
|
||||
} catch (error) {
|
||||
return { success: false, error: error.message }
|
||||
}
|
||||
},
|
||||
|
||||
async logout() {
|
||||
try {
|
||||
await api.post('/auth/logout')
|
||||
} catch (error) {
|
||||
// 忽略错误
|
||||
}
|
||||
this.token = null
|
||||
this.user = null
|
||||
this.isAuthenticated = false
|
||||
localStorage.removeItem('access_token')
|
||||
},
|
||||
|
||||
async fetchUser() {
|
||||
try {
|
||||
const response = await api.get('/auth/me')
|
||||
this.user = response.data
|
||||
return { success: true }
|
||||
} catch (error) {
|
||||
this.token = null
|
||||
this.isAuthenticated = false
|
||||
localStorage.removeItem('access_token')
|
||||
return { success: false }
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
/**
|
||||
* Luxx 前端工具库
|
||||
* 合并了 composables、services 和 stores 的统一导出
|
||||
*/
|
||||
|
||||
// ============ API 服务 ============
|
||||
export { default as api, authAPI, conversationsAPI, messagesAPI, toolsAPI, providersAPI, createSSEStream } from './api.js'
|
||||
|
||||
// ============ Pinia 状态管理 ============
|
||||
export { default as pinia } from './store.js'
|
||||
|
||||
// ============ 认证相关 ============
|
||||
export { useAuth } from './useAuth.js'
|
||||
|
||||
// ============ API 请求组合式函数 ============
|
||||
export { useApi, usePagination, useForm } from './useApi.js'
|
||||
|
||||
// ============ 格式化工具 ============
|
||||
export { formatDate, formatNumber, truncate, formatFileSize, capitalize, formatTokens } from './useFormatters.js'
|
||||
|
||||
// ============ 通用工具函数 ============
|
||||
export { debounce, throttle, deepClone, generateId, storage, getDeviceType, copyToClipboard } from './useUtils.js'
|
||||
|
||||
// ============ Markdown 渲染 ============
|
||||
export { renderMarkdown } from './markdown.js'
|
||||
|
|
@ -3,6 +3,3 @@ import { createPinia } from 'pinia'
|
|||
const pinia = createPinia()
|
||||
|
||||
export default pinia
|
||||
|
||||
// 方便导入 store
|
||||
export * from './auth'
|
||||
|
|
@ -51,7 +51,7 @@ export function deepClone(obj) {
|
|||
}
|
||||
|
||||
/**
|
||||
* 生成随机 ID
|
||||
* 生成随机 Id
|
||||
* @param {number} length - 长度
|
||||
*/
|
||||
export function generateId(length = 8) {
|
||||
|
|
@ -48,8 +48,8 @@
|
|||
<script setup>
|
||||
import { ref, reactive } from 'vue'
|
||||
import { useRouter } from 'vue-router'
|
||||
import { authAPI } from '../services/api.js'
|
||||
import { useAuth } from '../composables/useAuth.js'
|
||||
import { authAPI } from '../utils/api.js'
|
||||
import { useAuth } from '../utils/useAuth.js'
|
||||
|
||||
const router = useRouter()
|
||||
const { login } = useAuth()
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@
|
|||
<script setup>
|
||||
import { ref, computed, onMounted, nextTick, watch } from 'vue'
|
||||
import { useRoute } from 'vue-router'
|
||||
import { conversationsAPI, messagesAPI } from '../services/api.js'
|
||||
import { conversationsAPI, messagesAPI } from '../utils/api.js'
|
||||
import ProcessBlock from '../components/ProcessBlock.vue'
|
||||
import MessageBubble from '../components/MessageBubble.vue'
|
||||
import { renderMarkdown } from '../utils/markdown.js'
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@
|
|||
<script setup>
|
||||
import { ref, computed, onMounted } from 'vue'
|
||||
import { useRouter } from 'vue-router'
|
||||
import { conversationsAPI, providersAPI } from '../services/api.js'
|
||||
import { conversationsAPI, providersAPI } from '../utils/api.js'
|
||||
|
||||
const router = useRouter()
|
||||
const list = ref([])
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@
|
|||
|
||||
<script setup>
|
||||
import { ref, onMounted } from 'vue'
|
||||
import { conversationsAPI, toolsAPI } from '../services/api.js'
|
||||
import { conversationsAPI, toolsAPI } from '../utils/api.js'
|
||||
|
||||
const stats = ref({ conversations: 0, tools: 0, messages: 0, models: 1 })
|
||||
|
||||
|
|
|
|||
|
|
@ -133,9 +133,9 @@
|
|||
|
||||
<script setup>
|
||||
import { ref, onMounted } from 'vue'
|
||||
import { providersAPI } from '../services/api.js'
|
||||
import { useAuth } from '../composables/useAuth.js'
|
||||
import { authAPI } from '../services/api.js'
|
||||
import { providersAPI } from '../utils/api.js'
|
||||
import { useAuth } from '../utils/useAuth.js'
|
||||
import { authAPI } from '../utils/api.js'
|
||||
import { useRouter } from 'vue-router'
|
||||
|
||||
const router = useRouter()
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@
|
|||
|
||||
<script setup>
|
||||
import { ref, onMounted } from 'vue'
|
||||
import { toolsAPI } from '../services/api.js'
|
||||
import { toolsAPI } from '../utils/api.js'
|
||||
|
||||
const list = ref([])
|
||||
const loading = ref(true)
|
||||
|
|
|
|||
|
|
@ -1,128 +1,49 @@
|
|||
"""Crawler related tools"""
|
||||
"""Crawler tools"""
|
||||
from luxx.tools.factory import tool
|
||||
from luxx.tools.services import SearchService, FetchService
|
||||
|
||||
|
||||
@tool(
|
||||
name="web_search",
|
||||
description="Search the internet for information. Use when you need to find latest news or answer questions.",
|
||||
parameters={
|
||||
@tool(name="web_search", description="Search the internet. Use when you need to find latest news or answer questions.", parameters={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Search keywords"
|
||||
},
|
||||
"max_results": {
|
||||
"type": "integer",
|
||||
"description": "Number of results to return, default 5",
|
||||
"default": 5
|
||||
}
|
||||
"query": {"type": "string", "description": "Search keywords"},
|
||||
"max_results": {"type": "integer", "description": "Number of results, default 5", "default": 5}
|
||||
},
|
||||
"required": ["query"]
|
||||
},
|
||||
category="crawler"
|
||||
)
|
||||
}, category="crawler")
|
||||
def web_search(arguments: dict) -> dict:
|
||||
"""
|
||||
Web search tool using DuckDuckGo
|
||||
"""
|
||||
query = arguments["query"]
|
||||
max_results = arguments.get("max_results", 5)
|
||||
|
||||
service = SearchService()
|
||||
results = service.search(query, max_results)
|
||||
|
||||
if not results:
|
||||
return {"success": True, "data": {"query": query, "results": []}, "message": "No results found"}
|
||||
|
||||
return {"success": True, "data": {"query": query, "results": results}}
|
||||
results = SearchService().search(arguments["query"], arguments.get("max_results", 5))
|
||||
return {"success": True, "data": {"query": arguments["query"], "results": results or []}}
|
||||
|
||||
|
||||
@tool(
|
||||
name="web_fetch",
|
||||
description="Fetch content from a webpage. Use when user needs detailed information from a page.",
|
||||
parameters={
|
||||
@tool(name="web_fetch", description="Fetch content from a webpage.", parameters={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"description": "URL of the webpage to fetch"
|
||||
},
|
||||
"extract_type": {
|
||||
"type": "string",
|
||||
"description": "Extraction type: text, links, or structured",
|
||||
"enum": ["text", "links", "structured"],
|
||||
"default": "text"
|
||||
}
|
||||
"url": {"type": "string", "description": "URL to fetch"},
|
||||
"extract_type": {"type": "string", "enum": ["text", "links", "structured"], "default": "text"}
|
||||
},
|
||||
"required": ["url"]
|
||||
},
|
||||
category="crawler"
|
||||
)
|
||||
}, category="crawler")
|
||||
def web_fetch(arguments: dict) -> dict:
|
||||
"""
|
||||
Page fetch tool
|
||||
"""
|
||||
url = arguments["url"]
|
||||
extract_type = arguments.get("extract_type", "text")
|
||||
|
||||
if not url:
|
||||
if not arguments.get("url"):
|
||||
return {"success": False, "error": "URL is required"}
|
||||
|
||||
service = FetchService(timeout=15)
|
||||
result = service.fetch(url, extract_type)
|
||||
|
||||
if "error" in result:
|
||||
return {"success": False, "error": result["error"]}
|
||||
|
||||
return {"success": True, "data": result}
|
||||
result = FetchService().fetch(arguments["url"], arguments.get("extract_type", "text"))
|
||||
return {"success": "error" not in result, "data": result, "error": result.get("error")}
|
||||
|
||||
|
||||
@tool(
|
||||
name="batch_fetch",
|
||||
description="Batch fetch multiple webpages. Use when you need to get content from multiple pages.",
|
||||
parameters={
|
||||
@tool(name="batch_fetch", description="Batch fetch multiple webpages.", parameters={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"urls": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "List of URLs to fetch"
|
||||
},
|
||||
"extract_type": {
|
||||
"type": "string",
|
||||
"enum": ["text", "links", "structured"],
|
||||
"default": "text"
|
||||
}
|
||||
"urls": {"type": "array", "items": {"type": "string"}, "description": "URLs to fetch"},
|
||||
"extract_type": {"type": "string", "enum": ["text", "links", "structured"], "default": "text"}
|
||||
},
|
||||
"required": ["urls"]
|
||||
},
|
||||
category="crawler"
|
||||
)
|
||||
}, category="crawler")
|
||||
def batch_fetch(arguments: dict) -> dict:
|
||||
"""
|
||||
Batch fetch tool
|
||||
"""
|
||||
urls = arguments["urls"]
|
||||
extract_type = arguments.get("extract_type", "text")
|
||||
|
||||
urls = arguments.get("urls", [])
|
||||
if not urls:
|
||||
return {"success": False, "error": "URLs list is required"}
|
||||
|
||||
if len(urls) > 10:
|
||||
return {"success": False, "error": "Maximum 10 pages allowed"}
|
||||
|
||||
service = FetchService(timeout=10)
|
||||
results = service.fetch_batch(urls, extract_type)
|
||||
|
||||
successful = sum(1 for r in results if "error" not in r)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"data": {
|
||||
"results": results,
|
||||
"total": len(results),
|
||||
"successful": successful
|
||||
}
|
||||
}
|
||||
results = FetchService().fetch_batch(urls, arguments.get("extract_type", "text"))
|
||||
return {"success": True, "data": {"results": results, "total": len(results)}}
|
||||
|
|
|
|||
|
|
@ -1,54 +1,18 @@
|
|||
"""Tool helper services"""
|
||||
import re
|
||||
import httpx
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
from urllib.parse import parse_qs, urlparse, quote
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
class SearchService:
|
||||
"""Search service using DuckDuckGo"""
|
||||
"""DuckDuckGo search"""
|
||||
|
||||
def __init__(self, engine: str = "duckduckgo"):
|
||||
self.engine = engine
|
||||
|
||||
def search(
|
||||
self,
|
||||
query: str,
|
||||
max_results: int = 5,
|
||||
region: str = "cn-zh"
|
||||
) -> List[dict]:
|
||||
"""
|
||||
Execute search
|
||||
|
||||
Args:
|
||||
query: Search keywords
|
||||
max_results: Max result count
|
||||
region: Region setting
|
||||
|
||||
Returns:
|
||||
Search result list
|
||||
"""
|
||||
if self.engine == "duckduckgo":
|
||||
return self._search_duckduckgo(query, max_results, region)
|
||||
else:
|
||||
raise ValueError(f"Unsupported search engine: {self.engine}")
|
||||
|
||||
def _search_duckduckgo(
|
||||
self,
|
||||
query: str,
|
||||
max_results: int,
|
||||
region: str
|
||||
) -> List[dict]:
|
||||
"""DuckDuckGo search via HTML"""
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml",
|
||||
}
|
||||
|
||||
from urllib.parse import quote
|
||||
encoded_query = quote(query)
|
||||
url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
|
||||
def search(self, query: str, max_results: int = 5) -> List[dict]:
|
||||
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
|
||||
url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
|
||||
|
||||
try:
|
||||
resp = httpx.get(url, headers=headers, timeout=15, follow_redirects=True)
|
||||
|
|
@ -56,7 +20,6 @@ class SearchService:
|
|||
except Exception:
|
||||
return []
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
results = []
|
||||
|
||||
|
|
@ -66,10 +29,8 @@ class SearchService:
|
|||
|
||||
if title_elem:
|
||||
raw_url = title_elem.get("href", "")
|
||||
# Clean DuckDuckGo redirect URL
|
||||
if "uddg=" in raw_url:
|
||||
parsed = urlparse(raw_url)
|
||||
params = parse_qs(parsed.query)
|
||||
params = parse_qs(urlparse(raw_url).query)
|
||||
clean_url = params.get("uddg", [raw_url])[0]
|
||||
else:
|
||||
clean_url = raw_url
|
||||
|
|
@ -84,164 +45,55 @@ class SearchService:
|
|||
|
||||
|
||||
class FetchService:
|
||||
"""Page fetch service"""
|
||||
"""Page fetch with concurrent support"""
|
||||
|
||||
def __init__(self, timeout: float = 15.0):
|
||||
self.timeout = timeout
|
||||
self.user_agent = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/120.0.0.0 Safari/537.36"
|
||||
)
|
||||
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
|
||||
def fetch(
|
||||
self,
|
||||
url: str,
|
||||
extract_type: str = "text"
|
||||
) -> dict:
|
||||
"""
|
||||
Fetch a single page
|
||||
|
||||
Args:
|
||||
url: Page URL
|
||||
extract_type: Extract type (text, links, structured)
|
||||
|
||||
Returns:
|
||||
Fetch result
|
||||
"""
|
||||
def fetch(self, url: str, extract_type: str = "text") -> dict:
|
||||
if not url.startswith(("http://", "https://")):
|
||||
url = "https://" + url
|
||||
|
||||
try:
|
||||
resp = httpx.get(
|
||||
url,
|
||||
timeout=self.timeout,
|
||||
follow_redirects=True,
|
||||
headers={"User-Agent": self.user_agent}
|
||||
)
|
||||
resp = httpx.get(url, timeout=self.timeout, follow_redirects=True, headers={"User-Agent": self.user_agent})
|
||||
resp.raise_for_status()
|
||||
except httpx.TimeoutException:
|
||||
return {"error": "Request timeout", "url": url}
|
||||
return {"error": "Request timeout"}
|
||||
except Exception as e:
|
||||
return {"error": str(e), "url": url}
|
||||
return {"error": str(e)}
|
||||
|
||||
html = resp.text
|
||||
extractor = ContentExtractor(html)
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
title = soup.title.string if soup.title else ""
|
||||
|
||||
if extract_type == "text":
|
||||
return {
|
||||
"url": url,
|
||||
"title": extractor.extract_title(),
|
||||
"text": extractor.extract_text()[:15000]
|
||||
}
|
||||
elif extract_type == "links":
|
||||
return {
|
||||
"url": url,
|
||||
"links": extractor.extract_links()
|
||||
}
|
||||
else:
|
||||
return extractor.extract_structured(url)
|
||||
# Remove noise
|
||||
for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
|
||||
tag.decompose()
|
||||
|
||||
def fetch_batch(
|
||||
self,
|
||||
urls: List[str],
|
||||
extract_type: str = "text",
|
||||
max_concurrent: int = 5
|
||||
) -> List[dict]:
|
||||
"""
|
||||
Batch fetch pages concurrently.
|
||||
if extract_type == "links":
|
||||
links = [{"text": a.get_text(strip=True), "url": a["href"]}
|
||||
for a in soup.find_all("a", href=True)
|
||||
if a.get_text(strip=True) and not a["href"].startswith(("#", "javascript:"))]
|
||||
return {"url": url, "links": links[:50]}
|
||||
|
||||
Args:
|
||||
urls: URL list
|
||||
extract_type: Extract type
|
||||
max_concurrent: Max concurrent requests (1-5, default 5)
|
||||
text = re.sub(r"\n{3,}", "\n\n", soup.get_text(separator="\n", strip=True))
|
||||
|
||||
Returns:
|
||||
Result list (same order as input URLs)
|
||||
"""
|
||||
if extract_type == "structured":
|
||||
meta_desc = soup.find("meta", attrs={"name": "description"})
|
||||
return {"url": url, "title": title, "description": (meta_desc.get("content", "") if meta_desc else ""), "text": text[:5000]}
|
||||
|
||||
return {"url": url, "title": title, "text": text[:15000]}
|
||||
|
||||
def fetch_batch(self, urls: List[str], extract_type: str = "text", max_concurrent: int = 5) -> List[dict]:
|
||||
if len(urls) <= 1:
|
||||
return [self.fetch(url, extract_type) for url in urls]
|
||||
|
||||
max_concurrent = min(max(max_concurrent, 1), 5)
|
||||
results = [None] * len(urls)
|
||||
max_concurrent = min(max(max_concurrent, 1), 5)
|
||||
|
||||
with ThreadPoolExecutor(max_workers=max_concurrent) as pool:
|
||||
futures = {
|
||||
pool.submit(self.fetch, url, extract_type): i
|
||||
for i, url in enumerate(urls)
|
||||
}
|
||||
futures = {pool.submit(self.fetch, url, extract_type): i for i, url in enumerate(urls)}
|
||||
for future in as_completed(futures):
|
||||
idx = futures[future]
|
||||
try:
|
||||
results[idx] = future.result()
|
||||
except Exception as e:
|
||||
results[idx] = {"error": str(e)}
|
||||
results[futures[future]] = future.result()
|
||||
|
||||
return results
|
||||
|
||||
|
||||
class ContentExtractor:
|
||||
"""Content extractor using BeautifulSoup"""
|
||||
|
||||
def __init__(self, html: str):
|
||||
self.html = html
|
||||
self._soup = None
|
||||
|
||||
@property
|
||||
def soup(self):
|
||||
if self._soup is None:
|
||||
from bs4 import BeautifulSoup
|
||||
self._soup = BeautifulSoup(self.html, "html.parser")
|
||||
return self._soup
|
||||
|
||||
def extract_title(self) -> str:
|
||||
"""Extract page title"""
|
||||
if self.soup.title:
|
||||
return self.soup.title.string or ""
|
||||
return ""
|
||||
|
||||
def extract_text(self) -> str:
|
||||
"""Extract plain text"""
|
||||
# Remove script and style
|
||||
for tag in self.soup(["script", "style", "nav", "footer", "header", "aside"]):
|
||||
tag.decompose()
|
||||
|
||||
text = self.soup.get_text(separator="\n", strip=True)
|
||||
# Clean extra whitespace
|
||||
text = re.sub(r"\n{3,}", "\n\n", text)
|
||||
return text
|
||||
|
||||
def extract_links(self, max_count: int = 50) -> List[dict]:
|
||||
"""Extract links"""
|
||||
links = []
|
||||
for a in self.soup.find_all("a", href=True):
|
||||
text = a.get_text(strip=True)
|
||||
href = a["href"]
|
||||
if text and href and not href.startswith(("#", "javascript:", "mailto:", "tel:")):
|
||||
links.append({"text": text, "url": href})
|
||||
if len(links) >= max_count:
|
||||
break
|
||||
return links
|
||||
|
||||
def extract_structured(self, url: str = "") -> dict:
|
||||
"""Extract structured content"""
|
||||
soup = self.soup
|
||||
|
||||
# Extract title
|
||||
title = ""
|
||||
if soup.title:
|
||||
title = soup.title.string or ""
|
||||
|
||||
# Extract meta description
|
||||
description = ""
|
||||
meta_desc = soup.find("meta", attrs={"name": "description"})
|
||||
if meta_desc:
|
||||
description = meta_desc.get("content", "")
|
||||
|
||||
return {
|
||||
"url": url,
|
||||
"title": title.strip(),
|
||||
"description": description.strip(),
|
||||
"text": self.extract_text()[:5000],
|
||||
"links": self.extract_links(20)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,8 +7,6 @@ requires-python = ">=3.10"
|
|||
dependencies = [
|
||||
"fastapi>=0.109.0",
|
||||
"uvicorn[standard]>=0.27.0",
|
||||
"python-multipart>=0.0.6",
|
||||
"sse-starlette>=2.0.0",
|
||||
"sqlalchemy>=2.0.25",
|
||||
"aiosqlite>=0.19.0",
|
||||
"pyjwt>=2.8.0",
|
||||
|
|
@ -18,22 +16,16 @@ dependencies = [
|
|||
"requests>=2.31.0",
|
||||
"beautifulsoup4>=4.12.3",
|
||||
"lxml>=5.1.0",
|
||||
"httpx>=0.26.0",
|
||||
"pyyaml>=6.0.1",
|
||||
"shortuuid>=1.0.11",
|
||||
"pydantic>=2.5.0",
|
||||
"pydantic-settings>=2.1.0",
|
||||
"email-validator>=2.1.0",
|
||||
"shortuuid>=1.0.11",
|
||||
"sse-starlette>=2.0.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
"pytest>=8.0.0",
|
||||
"pytest-asyncio>=0.23.0",
|
||||
"pytest-cov>=4.1.0",
|
||||
"black>=24.0.0",
|
||||
"ruff>=0.1.0",
|
||||
]
|
||||
dev = ["pytest>=8.0.0", "pytest-asyncio>=0.23.0", "pytest-cov>=4.1.0", "black>=24.0.0", "ruff>=0.1.0"]
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["luxx"]
|
||||
|
|
|
|||
Loading…
Reference in New Issue