From 408f0cb513904592ada8b61c98a4897af62cc205 Mon Sep 17 00:00:00 2001
From: ViperEkura <3081035982@qq.com>
Date: Mon, 6 Apr 2026 13:39:51 +0800
Subject: [PATCH] =?UTF-8?q?docs:=20=E6=9B=B4=E6=96=B0=E7=BD=91=E7=BB=9C?=
 =?UTF-8?q?=E6=8E=A5=E5=8F=A3=E6=96=87=E6=A1=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                   |  32 +++++++++++
 assets/docs/README-zh-CN.md |  32 +++++++++++
 assets/docs/introduction.md | 106 +++++++++++++++++++++++++++++++++++-
 3 files changed, 169 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ce5df5d..4283198 100644
--- a/README.md
+++ b/README.md
@@ -84,6 +84,38 @@ python scripts/tools/train.py \
 python scripts/tools/generate.py --param_path=/path/to/param_path
 ```
 
+#### Start HTTP Server
+
+Start the inference server with OpenAI-compatible HTTP API:
+
+```bash
+python -m scripts.tools.server --port 8000 --device cuda
+```
+
+Make requests:
+
+```bash
+# Chat API (OpenAI compatible)
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [{"role": "user", "content": "Hello"}],
+    "max_tokens": 512
+  }'
+
+# Streaming response
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [{"role": "user", "content": "Tell a story"}],
+    "stream": true,
+    "max_tokens": 500
+  }'
+
+# Health check
+curl http://localhost:8000/health
+```
+
 #### Demo
 
 Check out the demos in the `scripts/demo/` folder:
diff --git a/assets/docs/README-zh-CN.md b/assets/docs/README-zh-CN.md
index 313f3c9..f62e1af 100644
--- a/assets/docs/README-zh-CN.md
+++ b/assets/docs/README-zh-CN.md
@@ -85,6 +85,38 @@ python scripts/tools/train.py \
 python scripts/tools/generate.py --param_path=/path/to/param_path
 ```
 
+#### 启动 HTTP 服务
+
+启动推理服务器,支持 OpenAI 兼容的 HTTP API:
+
+```bash
+python -m scripts.tools.server --port 8000 --device cuda
+```
+
+发起请求:
+
+```bash
+# Chat API(OpenAI 兼容)
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [{"role": "user", "content": "你好"}],
+    "max_tokens": 512
+  }'
+
+# 流式响应
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [{"role": "user", "content": "讲个故事"}],
+    "stream": true,
+    "max_tokens": 500
+  }'
+
+# 健康检查
+curl http://localhost:8000/health
+```
+
 #### 演示
 
 查看 `scripts/demo/` 文件夹中的演示:
diff --git a/assets/docs/introduction.md b/assets/docs/introduction.md
index 1f75e78..6448e28 100644
--- a/assets/docs/introduction.md
+++ b/assets/docs/introduction.md
@@ -190,4 +190,108 @@ for token in engine.generate_with_request(request):
     print(token, end="", flush=True)
 ```
 
-The continuous batching feature allows dynamic batch composition where new requests can join at any time and completed requests are released immediately.
\ No newline at end of file
+The continuous batching feature allows dynamic batch composition where new requests can join at any time and completed requests are released immediately.
+
+## HTTP API Usage
+
+The inference server provides HTTP endpoints for remote inference. Start the server first:
+
+```bash
+python -m scripts.tools.server --port 8000
+```
+
+### OpenAI-Compatible Endpoint
+
+The server provides an OpenAI-compatible chat completion endpoint at `/v1/chat/completions`:
+
+```bash
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [
+      {"role": "system", "content": "You are a helpful assistant."},
+      {"role": "user", "content": "Hello, how are you?"}
+    ],
+    "temperature": 0.8,
+    "max_tokens": 2048,
+    "stream": false
+  }'
+```
+
+**Request Parameters:**
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `messages` | List[dict] | Required | Chat messages with role and content |
+| `temperature` | float | 0.8 | Sampling temperature (0.0-2.0) |
+| `top_p` | float | 0.95 | Nucleus sampling threshold |
+| `top_k` | int | 50 | Top-k sampling parameter |
+| `max_tokens` | int | 2048 | Maximum tokens to generate |
+| `stream` | bool | false | Enable streaming response |
+| `system_prompt` | str | None | System prompt override |
+
+**Response (non-streaming):**
+```json
+{
+  "id": "chatcmpl-1234567890",
+  "object": "chat.completion",
+  "created": 1234567890,
+  "model": "astrai",
+  "choices": [
+    {
+      "index": 0,
+      "message": {"role": "assistant", "content": "Hello! I'm doing well..."},
+      "finish_reason": "stop"
+    }
+  ]
+}
+```
+
+### Streaming Response
+
+Enable streaming for real-time token-by-token output:
+
+```bash
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [{"role": "user", "content": "Write a story"}],
+    "stream": true,
+    "max_tokens": 500
+  }'
+```
+
+The server uses Server-Sent Events (SSE) with content type `text/event-stream`.
+
+### Simple Generation Endpoint
+
+For basic text generation without chat format:
+
+```bash
+curl -X POST "http://localhost:8000/generate?query=Hello&max_len=1000" \
+  -H "Content-Type: application/json"
+```
+
+Or with conversation history:
+
+```bash
+curl -X POST "http://localhost:8000/generate" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "query": "What is AI?",
+    "history": [["Hello", "Hi there!"], ["How are you?", "I am doing well"]],
+    "temperature": 0.8,
+    "max_len": 2048
+  }'
+```
+
+### Health Check
+
+Monitor server and model status:
+
+```bash
+curl http://localhost:8000/health
+# {"status": "ok", "model_loaded": true, "engine_ready": true}
+
+curl http://localhost:8000/stats
+# {"requests_total": 10, "tokens_generated": 5000, ...}
+```