From a57a16430dd3019175718473456de22cd8af4f28 Mon Sep 17 00:00:00 2001 From: ViperEkura <3081035982@qq.com> Date: Mon, 6 Apr 2026 09:36:29 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8Dtokenizer=E5=AD=98?= =?UTF-8?q?=E5=82=A8=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- astrai/tokenize/tokenizer.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/astrai/tokenize/tokenizer.py b/astrai/tokenize/tokenizer.py index 6c449af..35d634e 100644 --- a/astrai/tokenize/tokenizer.py +++ b/astrai/tokenize/tokenizer.py @@ -64,10 +64,23 @@ class AutoTokenizer: Args: save_path: Path to save the tokenizer """ + save_path = Path(save_path) save_path.mkdir(parents=True, exist_ok=True) + + # Save tokenizer self._tokenizer.save(str(save_path / "tokenizer.json")) + # Save tokenizer config + config = {} + if self._special_token_map is not None: + config["special_tokens"] = self._special_token_map + if self._chat_template is not None: + config["chat_template"] = self._chat_template.template_str + + with open(save_path / "tokenizer_config.json", "w", encoding="utf-8") as f: + json.dump(config, f, ensure_ascii=False, indent=2) + @classmethod def register_tokenizer(cls, name: str, tokenizer_class: type): """ @@ -166,14 +179,6 @@ class AutoTokenizer: def vocab_size(self) -> int: return len(self) - @property - def pad_id(self) -> Optional[int]: - """Return the pad token ID if available.""" - pad_token = self._special_token_map.get("pad") - if pad_token is None or self._tokenizer is None: - return None - return self._tokenizer.token_to_id(pad_token) - def set_chat_template(self, template: Union[str, ChatTemplate]): """ Set the chat template for the tokenizer.