Skip to content

Commit 2bb8a45

Browse files
committed
Update base for Update on "Switching to use FileOpener from FileLoader"
TorchData is changing its API, renaming `FileLoader` to `FileOpener`. This PR should be landed after pytorch/data#148 is merged (the CI for this PR is expected to fail until then as well). [ghstack-poisoned]
2 parents be7bc66 + 4908d3c commit 2bb8a45

File tree

12 files changed

+512
-453
lines changed

12 files changed

+512
-453
lines changed

.circleci/config.yml

Lines changed: 5 additions & 181 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.circleci/regenerate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import os.path
2121

2222

23-
PYTHON_VERSIONS = ["3.6", "3.7", "3.8", "3.9"]
23+
PYTHON_VERSIONS = ["3.7", "3.8", "3.9"]
2424

2525
DOC_VERSION = ('linux', '3.8')
2626

.gitattributes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# To exclude autogenerated files from code reviews
2+
.circleci/config.yml linguist-generated=true

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ We recommend Anaconda as a Python package management system. Please refer to `py
2929
:header: "PyTorch version", "torchtext version", "Supported Python version"
3030
:widths: 10, 10, 10
3131

32-
nightly build, main, ">=3.6, <=3.9"
32+
nightly build, main, ">=3.7, <=3.9"
3333
1.10.0, 0.11.0, ">=3.6, <=3.9"
3434
1.9.1, 0.10.1, ">=3.6, <=3.9"
3535
1.9, 0.10, ">=3.6, <=3.9"

setup.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,9 @@ def run(self):
9292
],
9393
python_requires='>=3.5',
9494
classifiers=[
95-
'Programming Language :: Python :: 3',
96-
'Programming Language :: Python :: 3.5',
97-
'Programming Language :: Python :: 3.6',
9895
'Programming Language :: Python :: 3.7',
9996
'Programming Language :: Python :: 3.8',
100-
'Programming Language :: Python :: 3 :: Only',
97+
'Programming Language :: Python :: 3.9',
10198
],
10299
# Package info
103100
packages=find_packages(exclude=('test*', 'build_tools*')),

test/test_transforms.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import torch
23
from torchtext import transforms
34
from torchtext.vocab import vocab
@@ -205,7 +206,7 @@ def test_add_token_jit(self):
205206

206207

207208
class TestGPT2BPETokenizer(TorchtextTestCase):
208-
def _gpt2_bpe_tokenizer(self, test_scripting):
209+
def _load_tokenizer(self, test_scripting):
209210
encoder_json = "gpt2_bpe_encoder.json"
210211
bpe_vocab = "gpt2_bpe_vocab.bpe"
211212
tokenizer = transforms.GPT2BPETokenizer(
@@ -214,7 +215,9 @@ def _gpt2_bpe_tokenizer(self, test_scripting):
214215
)
215216
if test_scripting:
216217
tokenizer = torch.jit.script(tokenizer)
218+
return tokenizer
217219

220+
def _gpt2_bpe_tokenizer(self, tokenizer):
218221
sample_texts = [
219222
"Hello World!, how are you?",
220223
"Hélló WoŕlḊ¿",
@@ -240,8 +243,24 @@ def _gpt2_bpe_tokenizer(self, test_scripting):
240243

241244
def test_gpt2_bpe_tokenizer(self):
242245
"""test tokenization on single sentence input as well as batch on sentences"""
243-
self._gpt2_bpe_tokenizer(test_scripting=False)
246+
self._gpt2_bpe_tokenizer(self._load_tokenizer(test_scripting=False))
244247

245248
def test_gpt2_bpe_tokenizer_jit(self):
246249
"""test tokenization with scripting on single sentence input as well as batch on sentences"""
247-
self._gpt2_bpe_tokenizer(test_scripting=True)
250+
self._gpt2_bpe_tokenizer(self._load_tokenizer(test_scripting=True))
251+
252+
def test_gpt2_bpe_tokenizer_save_load_pybind(self):
253+
tokenizer = self._load_tokenizer(test_scripting=False)
254+
tokenizer_path = os.path.join(self.test_dir, 'gpt2_tokenizer_pybind.pt')
255+
torch.save(tokenizer, tokenizer_path)
256+
loaded_tokenizer = torch.load(tokenizer_path)
257+
self._gpt2_bpe_tokenizer((loaded_tokenizer))
258+
259+
def test_gpt2_bpe_tokenizer_save_load_torchscript(self):
260+
tokenizer = self._load_tokenizer(test_scripting=False)
261+
tokenizer_path = os.path.join(self.test_dir, 'gpt2_tokenizer_torchscript.pt')
262+
# Call the __prepare_scriptable__() func and convert the building block to the torchbind version
263+
# We do not expect users to use the torchbind version in eager mode, but we still need a CI test here.
264+
torch.save(tokenizer.__prepare_scriptable__(), tokenizer_path)
265+
loaded_tokenizer = torch.load(tokenizer_path)
266+
self._gpt2_bpe_tokenizer((loaded_tokenizer))

0 commit comments

Comments
 (0)