refactor: 从data 模块分离tokenizer

2026-04-04 16:12:58 +08:00 · 2026-04-04 16:12:58 +08:00 · bd9741dc5f
parent b531232a9b
commit bd9741dc5f
16 changed files with 108 additions and 92 deletions
--- a/astrai/init.py
+++ b/astrai/init.py
@ -6,7 +6,8 @@ from astrai.config import (
    TrainConfig,
 )
 from astrai.factory import BaseFactory
-from astrai.data import BpeTokenizer, DatasetFactory
+from astrai.data import DatasetFactory
 from astrai.tokenizer import BpeTokenizer
 from astrai.inference.generator import (
    BatchGenerator,
    EmbeddingEncoder,
--- a/astrai/config/param_config.py
+++ b/astrai/config/param_config.py
@ -7,7 +7,7 @@ import safetensors.torch as st
 import torch.nn as nn
 from astrai.config.model_config import ModelConfig
-from astrai.data.tokenizer import BpeTokenizer
+from astrai.tokenizer import BpeTokenizer
 from astrai.model.transformer import Transformer
--- a/astrai/data/init.py
+++ b/astrai/data/init.py
@ -1,29 +1,19 @@
 from astrai.data.dataset import (
    BaseDataset,
    DatasetFactory,
-    DPODataset,
+    BaseSegmentFetcher,
    GRPODataset,
    MultiSegmentFetcher,
    SEQDataset,
    SFTDataset,
 )
 from astrai.data.sampler import ResumableDistributedSampler
 from astrai.data.tokenizer import BpeTokenizer
 __all__ = [
    # Base classes
    "BaseDataset",
-    # Dataset implementations
+    # Factory
-    "SEQDataset",
+    "DatasetFactory",
    "SFTDataset",
    "DPODataset",
    "GRPODataset",
    # Fetchers
    "BaseSegmentFetcher",
    "MultiSegmentFetcher",
-    # Factory (DatasetFactory is alias for backward compatibility)
+    # Sampler
    "DatasetFactory",
    "DatasetFactory",
    # Tokenizer and sampler
    "BpeTokenizer",
    "ResumableDistributedSampler",
 ]
--- a/astrai/data/dataset.py
+++ b/astrai/data/dataset.py
@ -9,7 +9,7 @@ from torch import Tensor
 from torch.utils.data import Dataset
 from astrai.factory import BaseFactory
-from astrai.data.serialization import load_h5
+from astrai.serialization import load_h5
 class BaseSegmentFetcher:
--- a/astrai/inference/generator.py
+++ b/astrai/inference/generator.py
@ -1,78 +1,13 @@
 from dataclasses import dataclass
-from typing import Dict, Generator, List, Optional, Tuple, Union
+from typing import Generator, List, Optional, Tuple, Union
 import torch
 from jinja2 import Template
 from torch import Tensor
 from astrai.config.param_config import ModelParameter
 from astrai.factory import BaseFactory
 from astrai.inference.core import EmbeddingEncoderCore, GeneratorCore, KVCacheManager
-
+from astrai.tokenizer.chat_template import HistoryType, build_prompt
 HistoryType = List[Tuple[str, str]]
 MessageType = Dict[str, str]
 # Predefined chat templates using jinja2
 CHAT_TEMPLATES: Dict[str, str] = {
    "chatml": """{%- if system_prompt -%}
 <｜im▁start｜>system
 {{ system_prompt }}<｜im▁end｜>
 {%- endif -%}
 {%- for message in messages -%}
 <｜im▁start｜>{{ message['role'] }}
 {{ message['content'] }}<｜im▁end｜>
 {%- endfor -%}
 <｜im▁start｜>assistant
 """,
 }
 def build_prompt(
    query: str,
    system_prompt: Optional[str] = None,
    history: Optional[HistoryType] = None,
    template: Optional[str] = None,
 ) -> str:
    """Build prompt using jinja2 template for query and history.
    Args:
        query (str): query string.
        system_prompt (Optional[str]): system prompt string.
        history (Optional[HistoryType]): history list of query and response.
        template (Optional[str]): jinja2 template string. If None, uses default chatml template.
    Returns:
        str: prompt string formatted according to the template.
    Example:
        # Use default template
        prompt = build_prompt(query="Hello", history=[...])
        # Use custom template
        custom_template = '''
        {%- for msg in messages -%}
        {{ msg['role'] }}: {{ msg['content'] }}
        {%- endfor -%}
        '''
        prompt = build_prompt(query="Hello", template=custom_template)
    """
    # Convert history to message format
    messages: List[MessageType] = []
    if history:
        for user_msg, assistant_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": query})
    # Use provided template or default chatml template
    template_str = template if template is not None else CHAT_TEMPLATES["chatml"]
    # Render template
    jinja_template = Template(template_str)
    return jinja_template.render(
        messages=messages,
        system_prompt=system_prompt,
    )
 def pad_sequence(ids_list: List[List[int]], pad_id: int) -> Tuple[List[List[int]], int]:
--- a/astrai/data/serialization.py
+++ b/astrai/data/serialization.py
--- a/astrai/tokenizer/init.py
+++ b/astrai/tokenizer/init.py
@ -0,0 +1,23 @@
 from astrai.tokenizer.tokenizer import (
    BaseTokenizer,
    BpeTokenizer,
    BaseTrainer,
    BpeTrainer,
 )
 from astrai.tokenizer.chat_template import (
    HistoryType,
    MessageType,
    CHAT_TEMPLATES,
    build_prompt,
 )
 __all__ = [
    "BaseTokenizer",
    "BpeTokenizer",
    "BaseTrainer",
    "BpeTrainer",
    "HistoryType",
    "MessageType",
    "CHAT_TEMPLATES",
    "build_prompt",
 ]
--- a/astrai/tokenizer/chat_template.py
+++ b/astrai/tokenizer/chat_template.py
@ -0,0 +1,67 @@
 from typing import Dict, List, Optional, Tuple
 from jinja2 import Template
 HistoryType = List[Tuple[str, str]]
 MessageType = Dict[str, str]
 # Predefined chat templates using jinja2
 CHAT_TEMPLATES: Dict[str, str] = {
    "chatml": """{%- if system_prompt -%}
 <｜im▁start｜>system
 {{ system_prompt }}<｜im▁end｜>
 {%- endif -%}
 {%- for message in messages -%}
 <｜im▁start｜>{{ message['role'] }}
 {{ message['content'] }}<｜im▁end｜>
 {%- endfor -%}
 <｜im▁start｜>assistant
 """,
 }
 def build_prompt(
    query: str,
    system_prompt: Optional[str] = None,
    history: Optional[HistoryType] = None,
    template: Optional[str] = None,
 ) -> str:
    """Build prompt using jinja2 template for query and history.
    Args:
        query (str): query string.
        system_prompt (Optional[str]): system prompt string.
        history (Optional[HistoryType]): history list of query and response.
        template (Optional[str]): jinja2 template string. If None, uses default chatml template.
    Returns:
        str: prompt string formatted according to the template.
    Example:
        # Use default template
        prompt = build_prompt(query="Hello", history=[...])
        # Use custom template
        custom_template = '''
        {%- for msg in messages -%}
        {{ msg['role'] }}: {{ msg['content'] }}
        {%- endfor -%}
        '''
        prompt = build_prompt(query="Hello", template=custom_template)
    """
    # Convert history to message format
    messages: List[MessageType] = []
    if history:
        for user_msg, assistant_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": query})
    # Use provided template or default chatml template
    template_str = template if template is not None else CHAT_TEMPLATES["chatml"]
    # Render template
    jinja_template = Template(template_str)
    return jinja_template.render(
        messages=messages,
        system_prompt=system_prompt,
    )
--- a/astrai/tokenizer/tokenizer.py
+++ b/astrai/tokenizer/tokenizer.py
--- a/astrai/trainer/train_callback.py
+++ b/astrai/trainer/train_callback.py
@ -8,7 +8,7 @@ import torch.nn as nn
 from torch.nn.utils import clip_grad_norm_
 from tqdm import tqdm
-from astrai.data.serialization import Checkpoint
+from astrai.serialization import Checkpoint
 from astrai.parallel import only_on_rank
 from astrai.trainer.metric_util import (
    ctx_get_grad_max,
--- a/astrai/trainer/train_context.py
+++ b/astrai/trainer/train_context.py
@ -8,7 +8,7 @@ from torch.utils.data import DataLoader
 from astrai.config.train_config import TrainConfig
 from astrai.data import ResumableDistributedSampler
-from astrai.data.serialization import Checkpoint
+from astrai.serialization import Checkpoint
 from astrai.parallel.setup import get_current_device, get_rank, get_world_size
 from astrai.trainer.strategy import BaseStrategy, StrategyFactory
--- a/astrai/trainer/trainer.py
+++ b/astrai/trainer/trainer.py
@ -2,7 +2,7 @@ import logging
 from typing import List, Optional
 from astrai.config import TrainConfig
-from astrai.data.serialization import Checkpoint
+from astrai.serialization import Checkpoint
 from astrai.parallel.setup import spawn_parallel_fn
 from astrai.trainer.train_callback import (
    TrainCallback,
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -11,7 +11,7 @@ from tokenizers import pre_tokenizers
 from torch.utils.data import Dataset
 from astrai.config.model_config import ModelConfig
-from astrai.data.tokenizer import BpeTokenizer, BpeTrainer
+from astrai.tokenizer import BpeTokenizer, BpeTrainer
 from astrai.model.transformer import Transformer
--- a/tests/data/test_checkpoint.py
+++ b/tests/data/test_checkpoint.py
@ -5,7 +5,7 @@ import torch.distributed as dist
 from torch.optim import AdamW
 from torch.optim.lr_scheduler import CosineAnnealingLR
-from astrai.data.serialization import Checkpoint
+from astrai.serialization import Checkpoint
 from astrai.parallel.setup import get_rank, spawn_parallel_fn
--- a/tests/data/test_dataset.py
+++ b/tests/data/test_dataset.py
@ -2,7 +2,7 @@ import numpy as np
 import torch
 from astrai.data.dataset import DatasetFactory
-from astrai.data.serialization import save_h5
+from astrai.serialization import save_h5
 def test_dataset_loader_random_paths(base_test_env):
--- a/tests/trainer/test_early_stopping.py
+++ b/tests/trainer/test_early_stopping.py
@ -4,7 +4,7 @@ import numpy as np
 import torch
 from astrai.config.train_config import TrainConfig
-from astrai.data.serialization import Checkpoint
+from astrai.serialization import Checkpoint
 from astrai.trainer.schedule import SchedulerFactory
 from astrai.trainer.trainer import Trainer