Skip to content

Commit 1cf7e56

Browse files
committed
fix
Signed-off-by: guangli.bao <[email protected]>
1 parent ac8ef69 commit 1cf7e56

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

src/guidellm/utils/preprocessing_sharegpt_data.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77

88
import numpy as np
99
from datasets import load_dataset
10-
from transformers import AutoTokenizer
10+
from transformers import AutoTokenizer, PreTrainedTokenizerBase
1111

1212
MIN_CHAR = 10
1313
MAX_CHAR = 1000
@@ -16,7 +16,7 @@
1616
class TokenCounter:
1717
def __init__(self, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2"):
1818
self.model_name = model_name
19-
self._tokenizer: Optional[AutoTokenizer] = None
19+
self._tokenizer: Optional[PreTrainedTokenizerBase] = None
2020

2121
def _initialize_tokenizer(self) -> None:
2222
if self._tokenizer is None:
@@ -33,7 +33,7 @@ def estimate_num_tokens(self, text: str) -> int:
3333
return 0
3434

3535
try:
36-
encoding = self._tokenizer(text, return_tensors=None)
36+
encoding = self._tokenizer.__call__(text, return_tensors=None)
3737
return len(encoding["input_ids"])
3838
except (AttributeError, TypeError, RuntimeError) as e:
3939
raise ValueError(f"Error processing text: {e}") from e

0 commit comments

Comments (0)