
Commit 6af9888

Fix issue with extra token in tokenizers and pad

Signed-off-by: John St John <[email protected]>
1 parent 2f54aff

3 files changed (+32, -5 lines)

bionemo-recipes/recipes/evo2_megatron/tests/bionemo/evo2/data/test_tokenizer.py

Lines changed: 28 additions & 1 deletion
@@ -17,9 +17,17 @@
 # limitations under the License.
 
 
+from pathlib import Path
+
 import pytest
+from megatron.bridge.training.tokenizers.config import TokenizerConfig
+from megatron.bridge.training.tokenizers.tokenizer import build_tokenizer
 
-from bionemo.evo2.data.dataset_tokenizer import Evo2DatasetTokenizer
+from bionemo.evo2.data.dataset_tokenizer import (
+    DEFAULT_HF_TOKENIZER_MODEL_PATH,
+    DEFAULT_HF_TOKENIZER_MODEL_PATH_512,
+    Evo2DatasetTokenizer,
+)
 from bionemo.evo2.utils.config import Evo2PreprocessingConfig
 
 
@@ -29,6 +37,25 @@ def tokenizer() -> Evo2DatasetTokenizer:
     return Evo2DatasetTokenizer(Evo2PreprocessingConfig())
 
 
+@pytest.mark.parametrize(
+    "tokenizer_path, expected_vocab_size",
+    [
+        (DEFAULT_HF_TOKENIZER_MODEL_PATH, 256),
+        (DEFAULT_HF_TOKENIZER_MODEL_PATH_512, 512),
+    ],
+)
+def test_tokenizer_vocab_size(tokenizer_path: Path, expected_vocab_size: int) -> None:
+    """Verifies key tokenizers have the expected vocabulary size."""
+    tokenizer = build_tokenizer(
+        TokenizerConfig(
+            tokenizer_type="HuggingFaceTokenizer",
+            hf_tokenizer_kwargs={"trust_remote_code": False},
+            tokenizer_model=tokenizer_path,
+        )
+    )
+    assert tokenizer.vocab_size == expected_vocab_size
+
+
 def test_tokenizer_handles_long_dna_sequence(tokenizer: Evo2DatasetTokenizer) -> None:
     """Verifies tokenizer correctly processes a long DNA sequence into expected token IDs.
 
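The new parametrized test builds each bundled tokenizer through Megatron Bridge's build_tokenizer and asserts its vocabulary size. A minimal standalone sketch of the same check, using only the names that appear in the diff above (import locations and constants assumed to resolve as in the recipe):

    # Sketch: rebuild the 256-entry tokenizer exactly as the new test does.
    from megatron.bridge.training.tokenizers.config import TokenizerConfig
    from megatron.bridge.training.tokenizers.tokenizer import build_tokenizer

    from bionemo.evo2.data.dataset_tokenizer import DEFAULT_HF_TOKENIZER_MODEL_PATH

    tokenizer = build_tokenizer(
        TokenizerConfig(
            tokenizer_type="HuggingFaceTokenizer",
            hf_tokenizer_kwargs={"trust_remote_code": False},
            tokenizer_model=DEFAULT_HF_TOKENIZER_MODEL_PATH,
        )
    )
    print(tokenizer.vocab_size)  # expected: 256, per the test above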
bionemo-recipes/recipes/evo2_megatron/tokenizers/nucleotide_fast_tokenizer_256/tokenizer.json

Lines changed: 2 additions & 2 deletions
@@ -143,8 +143,8 @@
       "<EOS>": 0,
       "<PAD>": 1,
       "<BOS>": 2,
-      "<UNK>": 3,
-      "\u0004": 4,
+      "<SEP>": 3,
+      "<UNK>": 4,
       "\u0005": 5,
       "\u0006": 6,
       "\u0007": 7,

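The edit above gives ID 3 an explicit <SEP> special token in place of the raw control character \u0004 and moves <UNK> from ID 3 to ID 4; the surrounding IDs and the 256-entry vocabulary size are unchanged. A quick sketch of how the remap could be verified with the Hugging Face tokenizers library (file path taken from the diff header):

    # Sketch: confirm the remapped special-token IDs in the edited vocab.
    from tokenizers import Tokenizer

    tok = Tokenizer.from_file(
        "bionemo-recipes/recipes/evo2_megatron/tokenizers/"
        "nucleotide_fast_tokenizer_256/tokenizer.json"
    )
    assert tok.token_to_id("<SEP>") == 3  # replaces the raw \u0004 slot
    assert tok.token_to_id("<UNK>") == 4  # moved up from ID 3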
bionemo-recipes/recipes/evo2_megatron/tokenizers/nucleotide_fast_tokenizer_512/tokenizer.json

Lines changed: 2 additions & 2 deletions
@@ -116,8 +116,8 @@
       "<EOS>": 0,
       "<PAD>": 1,
       "<BOS>": 2,
-      "<UNK>": 3,
-      "\u0004": 4,
+      "<SEP>": 3,
+      "<UNK>": 4,
       "\u0005": 5,
       "\u0006": 6,
       "\u0007": 7,

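The 512-entry tokenizer gets the identical remap, keeping both vocabularies aligned on their special tokens; <PAD> stays at ID 1, which the padding behavior referenced in the commit title presumably depends on. The same verification sketch against this file (path again taken from the diff header):

    # Sketch: the 512-entry vocab should expose the same special-token IDs.
    from tokenizers import Tokenizer

    tok = Tokenizer.from_file(
        "bionemo-recipes/recipes/evo2_megatron/tokenizers/"
        "nucleotide_fast_tokenizer_512/tokenizer.json"
    )
    assert tok.token_to_id("<PAD>") == 1  # unchanged; padding depends on this ID
    assert tok.token_to_id("<SEP>") == 3
    assert tok.token_to_id("<UNK>") == 4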