fix: 修复tokenizer存储的问题
This commit is contained in:
parent
3fee87897d
commit
a57a16430d
|
|
@ -64,10 +64,23 @@ class AutoTokenizer:
|
||||||
Args:
|
Args:
|
||||||
save_path: Path to save the tokenizer
|
save_path: Path to save the tokenizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
save_path = Path(save_path)
|
save_path = Path(save_path)
|
||||||
save_path.mkdir(parents=True, exist_ok=True)
|
save_path.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Save tokenizer
|
||||||
self._tokenizer.save(str(save_path / "tokenizer.json"))
|
self._tokenizer.save(str(save_path / "tokenizer.json"))
|
||||||
|
|
||||||
|
# Save tokenizer config
|
||||||
|
config = {}
|
||||||
|
if self._special_token_map is not None:
|
||||||
|
config["special_tokens"] = self._special_token_map
|
||||||
|
if self._chat_template is not None:
|
||||||
|
config["chat_template"] = self._chat_template.template_str
|
||||||
|
|
||||||
|
with open(save_path / "tokenizer_config.json", "w", encoding="utf-8") as f:
|
||||||
|
json.dump(config, f, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def register_tokenizer(cls, name: str, tokenizer_class: type):
|
def register_tokenizer(cls, name: str, tokenizer_class: type):
|
||||||
"""
|
"""
|
||||||
|
|
@ -166,14 +179,6 @@ class AutoTokenizer:
|
||||||
def vocab_size(self) -> int:
|
def vocab_size(self) -> int:
|
||||||
return len(self)
|
return len(self)
|
||||||
|
|
||||||
@property
|
|
||||||
def pad_id(self) -> Optional[int]:
|
|
||||||
"""Return the pad token ID if available."""
|
|
||||||
pad_token = self._special_token_map.get("pad")
|
|
||||||
if pad_token is None or self._tokenizer is None:
|
|
||||||
return None
|
|
||||||
return self._tokenizer.token_to_id(pad_token)
|
|
||||||
|
|
||||||
def set_chat_template(self, template: Union[str, ChatTemplate]):
|
def set_chat_template(self, template: Union[str, ChatTemplate]):
|
||||||
"""
|
"""
|
||||||
Set the chat template for the tokenizer.
|
Set the chat template for the tokenizer.
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue