From bdda1cc35ad935403c1d1f478a10e9f64c8b9122 Mon Sep 17 00:00:00 2001 From: ViperEkura <3081035982@qq.com> Date: Mon, 29 Sep 2025 13:47:37 +0800 Subject: [PATCH] =?UTF-8?q?feat(khaosz/core/tokenizer):=20=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=20user=5Fid=20=E5=92=8C=20system=5Fid=20=E5=B1=9E?= =?UTF-8?q?=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- khaosz/core/tokenizer.py | 8 ++++++++ train.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/khaosz/core/tokenizer.py b/khaosz/core/tokenizer.py index 1f0f1ed..65e06b4 100644 --- a/khaosz/core/tokenizer.py +++ b/khaosz/core/tokenizer.py @@ -109,3 +109,11 @@ class BpeTokenizer: @property def pad_id(self) -> int: return self._tokenizer.token_to_id("") + + @property + def user_id(self) -> int: + return self._tokenizer.token_to_id("<|user|>") + + @property + def system_id(self) -> int: + return self._tokenizer.token_to_id("<|system|>") \ No newline at end of file diff --git a/train.py b/train.py index 64805d5..bf3dc16 100644 --- a/train.py +++ b/train.py @@ -51,7 +51,7 @@ def train( "multi_turn": multi_turn, "bos_token_id": parameter.tokenizer.bos_id, "eos_token_id": parameter.tokenizer.eos_id, - "user_token_id":parameter.tokenizer.encode("<|user|>")[0], + "user_token_id":parameter.tokenizer.user_id, "dpo_beta": dpo_beta }