fix: 修复tokenizer存储的问题

This commit is contained in:
ViperEkura 2026-04-06 09:36:29 +08:00
parent 3fee87897d
commit a57a16430d
1 changed file with 13 additions and 8 deletions

View File

@ -64,10 +64,23 @@ class AutoTokenizer:
Args:
save_path: Path to save the tokenizer
"""
save_path = Path(save_path)
save_path.mkdir(parents=True, exist_ok=True)
# Save tokenizer
self._tokenizer.save(str(save_path / "tokenizer.json"))
# Save tokenizer config
config = {}
if self._special_token_map is not None:
config["special_tokens"] = self._special_token_map
if self._chat_template is not None:
config["chat_template"] = self._chat_template.template_str
with open(save_path / "tokenizer_config.json", "w", encoding="utf-8") as f:
json.dump(config, f, ensure_ascii=False, indent=2)
@classmethod
def register_tokenizer(cls, name: str, tokenizer_class: type):
"""
@ -166,14 +179,6 @@ class AutoTokenizer:
def vocab_size(self) -> int:
    """Return the number of entries in the vocabulary.

    Delegates to ``len(self)`` so the size is always consistent with
    the container protocol implemented by this tokenizer.
    """
    size = len(self)
    return size
@property
def pad_id(self) -> Optional[int]:
    """Return the pad token ID, or ``None`` if it cannot be resolved.

    Returns ``None`` when the tokenizer is not loaded, when no special
    token map is set (``save()`` shows it may legitimately be ``None``),
    or when the map has no ``"pad"`` entry.
    """
    # Guard both attributes first: the original called .get() on
    # _special_token_map before checking it, raising AttributeError
    # whenever the map was None.
    if self._tokenizer is None or not self._special_token_map:
        return None
    pad_token = self._special_token_map.get("pad")
    if pad_token is None:
        return None
    return self._tokenizer.token_to_id(pad_token)
def set_chat_template(self, template: Union[str, ChatTemplate]):
"""
Set the chat template for the tokenizer.