Skip to content
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
eb77e7b
mnist basic t2t model
1vn Feb 28, 2019
b2448b8
merge master
1vn Feb 28, 2019
9917cd7
add newline
1vn Mar 1, 2019
8edcb58
fix prediction time shaping
1vn Mar 6, 2019
a6e0f47
clean reviews example
1vn Mar 6, 2019
df24557
if undefined shape, take the length
1vn Mar 7, 2019
6202511
Merge branch 'master' into t2t-example
1vn Mar 7, 2019
b5c60a3
add numpy to api image
1vn Mar 7, 2019
7145df8
remove numpy dep, dont restrict unspecified python pkgs
1vn Mar 7, 2019
e4a01c9
add TODO comment to address later
1vn Mar 7, 2019
f8c128d
clean up
1vn Mar 8, 2019
17144a2
clean up example and transform tensor api
1vn Mar 8, 2019
e73d95c
transform_tensors -> transform_tensorflow
1vn Mar 8, 2019
99e0b2a
add back dnn
1vn Mar 8, 2019
ad0be81
add back dnn
1vn Mar 8, 2019
e9e7c92
fix example
1vn Mar 8, 2019
9be820a
remove TODO
1vn Mar 8, 2019
5b5263b
add docs
1vn Mar 11, 2019
4b27fa3
checkin
1vn Mar 14, 2019
43b1a6e
checkin
1vn Mar 14, 2019
f204061
checkin
1vn Mar 15, 2019
a9fed6b
merge master
1vn Mar 15, 2019
dbec6bf
transformer model
1vn Mar 18, 2019
51b8854
remove extraneous changes
1vn Mar 18, 2019
b6e1271
clean up
1vn Mar 18, 2019
73728c8
remove unused transformer
1vn Mar 19, 2019
91f6c07
remove unused transformed column
1vn Mar 19, 2019
e3b9315
clean up
1vn Mar 23, 2019
aa950fe
Merge branch 'master' into t2t-blog
1vn Mar 23, 2019
96cd0e5
address comments, updates to reflect blog post
1vn Mar 28, 2019
346a224
clean up code, remove constants
1vn Apr 3, 2019
1bcd2a3
add back gpu
1vn Apr 3, 2019
729e613
fix code
1vn Apr 3, 2019
c41716f
fix implementation
1vn Apr 3, 2019
765b46f
Merge branch 'master' into t2t-blog
1vn Apr 4, 2019
2c26ecd
address comments
1vn Apr 4, 2019
c6f8c6d
dont use set
1vn Apr 4, 2019
d08b5f1
remove stopwords
1vn Apr 4, 2019
6140ca3
tune model
1vn Apr 4, 2019
2e2c5b1
rearrange yaml
1vn Apr 4, 2019
bc91201
fix YAML
1vn Apr 4, 2019
a81730e
remove whitespace
1vn Apr 4, 2019
52a7360
Merge branch 'master' into t2t-blog
1vn Apr 4, 2019
9cc5fc8
remove extra new line
1vn Apr 4, 2019
5b7875d
bump spark
1vn Apr 5, 2019
03eb2f6
bump spark
1vn Apr 5, 2019
8c87774
address comments
1vn Apr 5, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/mnist/implementations/models/t2t.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,6 @@ def transform_tensorflow(features, labels, model_config):
features["inputs"] = tf.reshape(features["image_pixels"], hparams["input_shape"])

# t2t expects this key and dimensionality
features["targets"] = tf.expand_dims(labels, 0)
features["targets"] = tf.expand_dims(tf.expand_dims(labels, -1), -1)

return features, labels
8 changes: 4 additions & 4 deletions examples/reviews/implementations/aggregators/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ def aggregate_spark(data, columns, args):
import pyspark.sql.functions as F
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer

input_data = data.withColumn(columns["col"], F.lower(F.col(columns["col"])))
regexTokenizer = RegexTokenizer(inputCol=columns["col"], outputCol="token_list", pattern="\\W")
regexTokenized = regexTokenizer.transform(data)

Expand All @@ -19,6 +18,7 @@ def aggregate_spark(data, columns, args):
)

vocab = [row["word"] for row in vocab_rows]
reverse_dict = {word: idx + len(args["reserved_indices"]) for idx, word in enumerate(vocab)}

return {**reverse_dict, **args["reserved_indices"]}
reverse_dict = {word: 2 + idx for idx, word in enumerate(vocab)}
reverse_dict["<PAD>"] = 0
reverse_dict["<UNKNOWN>"] = 1
return {**reverse_dict}
59 changes: 59 additions & 0 deletions examples/reviews/implementations/models/t2t_transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import tensorflow as tf
from tensor2tensor.utils import trainer_lib
from tensor2tensor import models # pylint: disable=unused-import
from tensor2tensor import problems # pylint: disable=unused-import
from tensor2tensor.data_generators import problem_hparams
from tensor2tensor.utils import registry
from tensor2tensor.utils import metrics
from tensor2tensor.data_generators import imdb
from tensor2tensor.data_generators import text_encoder


def create_estimator(run_config, model_config):
    """Build a tensor2tensor transformer Estimator for the Cortex IMDB problem.

    Args:
        run_config: a tf.estimator.RunConfig-like object; t2t reads extra
            attributes from it, which are patched on below.
        model_config: Cortex model config dict; only
            model_config["aggregates"]["reviews_vocab"] is read here, to
            build the in-memory vocabulary for the problem.

    Returns:
        A tf.estimator.Estimator created by t2t's trainer_lib, running the
        "transformer" model with "transformer_base_single_gpu" hparams.
    """
    # t2t expects these keys in run_config
    run_config.data_parallelism = None
    run_config.t2t_device_info = {"num_async_replicas": 1}

    hparams = trainer_lib.create_hparams("transformer_base_single_gpu")

    # Wire our custom problem (vocab held in memory rather than on disk)
    # into the hparams, as t2t's estimator construction requires.
    problem = SentimentIMDBCortex(list(model_config["aggregates"]["reviews_vocab"]))
    p_hparams = problem.get_hparams(hparams)
    hparams.problem = problem
    hparams.problem_hparams = p_hparams

    # only want ACC
    problem.eval_metrics = lambda: [metrics.Metrics.ACC]

    # t2t expects this key
    hparams.warm_start_from = None

    estimator = trainer_lib.create_estimator("transformer", hparams, run_config)
    return estimator


def transform_tensorflow(features, labels, model_config):
    """Reshape features/labels into the rank t2t's transformer expects.

    Flattens the embedded review into a [max_review_length] vector and
    appends two trailing singleton axes to both inputs and targets, since
    t2t consumes "inputs"/"targets" keys with that dimensionality.
    """
    review_len = model_config["aggregates"]["max_review_length"]

    # [max_review_length] -> [max_review_length, 1, 1]
    flat_review = tf.reshape(features["embedding_input"], [review_len])
    features["inputs"] = tf.expand_dims(tf.expand_dims(flat_review, -1), -1)

    # Targets also need two trailing singleton dimensions.
    features["targets"] = tf.expand_dims(tf.expand_dims(labels, -1), -1)

    return features, labels


class SentimentIMDBCortex(imdb.SentimentIMDB):
    """IMDB sentiment problem backed by an in-memory vocabulary.

    Unlike the stock t2t problem, the token vocabulary is supplied directly
    at construction time instead of being loaded from a vocab file.
    """

    def __init__(self, vocab_list):
        super().__init__()
        # Token list consumed by feature_encoders() to build the text encoder.
        self.vocab = vocab_list

    def feature_encoders(self, data_dir):
        """Return encoders for "inputs" (tokens) and "targets" (class labels)."""
        token_encoder = text_encoder.TokenTextEncoder(vocab_filename=None, vocab_list=self.vocab)
        label_encoder = text_encoder.ClassLabelEncoder(self.class_labels(data_dir))
        return {"inputs": token_encoder, "targets": label_encoder}
Original file line number Diff line number Diff line change
@@ -1,26 +1,28 @@
import re
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

non_word = re.compile("\\W")


def transform_python(sample, args):
text = sample["col"].lower()
token_index_list = []

reverse_vocab = args["vocab"]
stop_words = args["stop_words"]
reserved_indices = args["reserved_indices"]
vocab = args["vocab"]
stop_words = set(stopwords.words("english"))

for token in non_word.split(text):
if len(token) == 0:
continue
if token in stop_words:
continue
token_index_list.append(reverse_vocab.get(token, reserved_indices["<UNKNOWN>"]))
token_index_list.append(vocab.get(token, 1))
if len(token_index_list) == args["max_len"]:
break

for i in range(args["max_len"] - len(token_index_list)):
token_index_list.append(reserved_indices["<PAD>"])
token_index_list.append(0)

return token_index_list
2 changes: 2 additions & 0 deletions examples/reviews/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
tensor2tensor==1.10.0
nltk==3.4
1 change: 0 additions & 1 deletion examples/reviews/resources/aggregates.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
col: review
args:
vocab_size: 10000
reserved_indices: reserved_indices

- kind: aggregate
name: max_review_length
Expand Down
1 change: 0 additions & 1 deletion examples/reviews/resources/aggregators.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
col: STRING_COLUMN
args:
vocab_size: INT
reserved_indices: {STRING: INT}

- kind: aggregator
name: max_length
Expand Down
6 changes: 6 additions & 0 deletions examples/reviews/resources/apis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,9 @@
model_name: sentiment_linear
compute:
replicas: 1

- kind: api
name: sentiment-t2t
model_name: t2t_transformer
compute:
replicas: 1
192 changes: 0 additions & 192 deletions examples/reviews/resources/constants.yaml

This file was deleted.

21 changes: 21 additions & 0 deletions examples/reviews/resources/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,24 @@
training:
batch_size: 64
num_steps: 5000

- kind: model
name: t2t_transformer
type: classification
target_column: label_indexed
feature_columns:
- embedding_input
aggregates:
- max_review_length
- reviews_vocab
prediction_key: outputs
data_partition_ratio:
training: 0.8
evaluation: 0.2
training:
batch_size: 1
num_steps: 1
evaluation:
start_delay_secs: 1
compute:
gpu: 1
2 changes: 0 additions & 2 deletions examples/reviews/resources/transformed_columns.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
col: review
args:
max_len: max_review_length
stop_words: english_stop_words
vocab: reviews_vocab
reserved_indices: reserved_indices

- kind: transformed_column
name: label_indexed
Expand Down
2 changes: 0 additions & 2 deletions examples/reviews/resources/transformers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,4 @@
col: STRING_COLUMN
args:
max_len: INT
stop_words: {STRING: BOOL}
vocab: {STRING: INT}
reserved_indices: {STRING: INT}