diff --git a/docs/applications/implementations/aggregators.md b/docs/applications/implementations/aggregators.md
index 84a04442db..c329992227 100644
--- a/docs/applications/implementations/aggregators.md
+++ b/docs/applications/implementations/aggregators.md
@@ -42,7 +42,7 @@ def aggregate_spark(data, columns, args):
 The following packages have been pre-installed and can be used in your implementations:
 
 ```text
-pyspark==2.4.0
+pyspark==2.4.1
 boto3==1.9.78
 msgpack==0.6.1
 numpy>=1.13.3,<2
diff --git a/docs/applications/implementations/transformers.md b/docs/applications/implementations/transformers.md
index 1d679c09e0..706d126217 100644
--- a/docs/applications/implementations/transformers.md
+++ b/docs/applications/implementations/transformers.md
@@ -86,7 +86,7 @@ def reverse_transform_python(transformed_value, args):
 The following packages have been pre-installed and can be used in your implementations:
 
 ```text
-pyspark==2.4.0
+pyspark==2.4.1
 boto3==1.9.78
 msgpack==0.6.1
 numpy>=1.13.3,<2
diff --git a/docs/applications/resources/environments.md b/docs/applications/resources/environments.md
index 2249383acf..8e9f656431 100644
--- a/docs/applications/resources/environments.md
+++ b/docs/applications/resources/environments.md
@@ -35,7 +35,7 @@ data:
 
 #### CSV Config
 
-To help ingest different styles of CSV files, Cortex supports the parameters listed below. All of these parameters are optional. A description and default values for each parameter can be found in the [PySpark CSV Documentation](https://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.csv).
+To help ingest different styles of CSV files, Cortex supports the parameters listed below. All of these parameters are optional. A description and default values for each parameter can be found in the [PySpark CSV Documentation](https://spark.apache.org/docs/2.4.1/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.csv).
 
 ```yaml
 csv_config:
diff --git a/examples/mnist/implementations/models/t2t.py b/examples/mnist/implementations/models/t2t.py
index 53ce2dfa57..d14b1d5ece 100644
--- a/examples/mnist/implementations/models/t2t.py
+++ b/examples/mnist/implementations/models/t2t.py
@@ -35,6 +35,6 @@ def transform_tensorflow(features, labels, model_config):
     features["inputs"] = tf.reshape(features["image_pixels"], hparams["input_shape"])
 
     # t2t expects this key and dimensionality
-    features["targets"] = tf.expand_dims(labels, 0)
+    features["targets"] = tf.expand_dims(tf.expand_dims(labels, -1), -1)
 
     return features, labels
diff --git a/examples/reviews/implementations/aggregators/max_length.py b/examples/reviews/implementations/aggregators/max_length.py
index 5e389b927e..5552024dbf 100644
--- a/examples/reviews/implementations/aggregators/max_length.py
+++ b/examples/reviews/implementations/aggregators/max_length.py
@@ -1,15 +1,13 @@
 def aggregate_spark(data, columns, args):
-    from pyspark.ml.feature import StopWordsRemover, RegexTokenizer
+    from pyspark.ml.feature import RegexTokenizer
     import pyspark.sql.functions as F
     from pyspark.sql.types import IntegerType
 
     regexTokenizer = RegexTokenizer(inputCol=columns["col"], outputCol="token_list", pattern="\\W")
     regexTokenized = regexTokenizer.transform(data)
 
-    remover = StopWordsRemover(inputCol="token_list", outputCol="filtered_word_list")
     max_review_length_row = (
-        remover.transform(regexTokenized)
-        .select(F.size(F.col("filtered_word_list")).alias("word_count"))
+        regexTokenized.select(F.size(F.col("token_list")).alias("word_count"))
         .agg(F.max(F.col("word_count")).alias("max_review_length"))
         .collect()
     )
diff --git a/examples/reviews/implementations/aggregators/vocab.py b/examples/reviews/implementations/aggregators/vocab.py
index 9b45766e2a..39e0dc7c71 100644
--- a/examples/reviews/implementations/aggregators/vocab.py
+++ b/examples/reviews/implementations/aggregators/vocab.py
@@ -1,15 +1,12 @@
 def aggregate_spark(data, columns, args):
     import pyspark.sql.functions as F
-    from pyspark.ml.feature import StopWordsRemover, RegexTokenizer
+    from pyspark.ml.feature import RegexTokenizer
 
-    input_data = data.withColumn(columns["col"], F.lower(F.col(columns["col"])))
     regexTokenizer = RegexTokenizer(inputCol=columns["col"], outputCol="token_list", pattern="\\W")
     regexTokenized = regexTokenizer.transform(data)
 
-    remover = StopWordsRemover(inputCol="token_list", outputCol="filtered_word_list")
     vocab_rows = (
-        remover.transform(regexTokenized)
-        .select(F.explode(F.col("filtered_word_list")).alias("word"))
+        regexTokenized.select(F.explode(F.col("token_list")).alias("word"))
         .groupBy("word")
         .count()
         .orderBy(F.col("count").desc())
@@ -19,6 +16,7 @@ def aggregate_spark(data, columns, args):
     )
 
     vocab = [row["word"] for row in vocab_rows]
-    reverse_dict = {word: idx + len(args["reserved_indices"]) for idx, word in enumerate(vocab)}
-
-    return {**reverse_dict, **args["reserved_indices"]}
+    reverse_dict = {word: 2 + idx for idx, word in enumerate(vocab)}
+    reverse_dict["<PAD>"] = 0
+    reverse_dict["<UNK>"] = 1
+    return reverse_dict
diff --git a/examples/reviews/implementations/models/t2t_transformer.py b/examples/reviews/implementations/models/t2t_transformer.py
new file mode 100644
index 0000000000..1e7e2eaa21
--- /dev/null
+++ b/examples/reviews/implementations/models/t2t_transformer.py
@@ -0,0 +1,65 @@
+import tensorflow as tf
+from tensor2tensor.utils import trainer_lib
+from tensor2tensor import models  # pylint: disable=unused-import
+from tensor2tensor import problems  # pylint: disable=unused-import
+from tensor2tensor.data_generators import problem_hparams
+from tensor2tensor.utils import registry
+from tensor2tensor.utils import metrics
+from tensor2tensor.data_generators import imdb
+from tensor2tensor.data_generators import text_encoder
+
+
+def create_estimator(run_config, model_config):
+    # t2t expects these keys in run_config
+    run_config.data_parallelism = None
+    run_config.t2t_device_info = {"num_async_replicas": 1}
+
+    hparams = trainer_lib.create_hparams("transformer_base_single_gpu")
+
+    problem = SentimentIMDBCortex(list(model_config["aggregates"]["reviews_vocab"]))
+    p_hparams = problem.get_hparams(hparams)
+    hparams.problem = problem
+    hparams.problem_hparams = p_hparams
+
+    problem.eval_metrics = lambda: [
+        metrics.Metrics.ACC_TOP5,
+        metrics.Metrics.ACC_PER_SEQ,
+        metrics.Metrics.NEG_LOG_PERPLEXITY,
+    ]
+
+    # t2t expects this key
+    hparams.warm_start_from = None
+
+    # reduce memory load
+    hparams.num_hidden_layers = 2
+    hparams.hidden_size = 32
+    hparams.filter_size = 32
+    hparams.num_heads = 2
+
+    estimator = trainer_lib.create_estimator("transformer", hparams, run_config)
+    return estimator
+
+
+def transform_tensorflow(features, labels, model_config):
+    max_length = model_config["aggregates"]["max_review_length"]
+
+    features["inputs"] = tf.expand_dims(tf.reshape(features["embedding_input"], [max_length]), -1)
+    features["targets"] = tf.expand_dims(tf.expand_dims(labels, -1), -1)
+
+    return features, labels
+
+
+class SentimentIMDBCortex(imdb.SentimentIMDB):
+    """IMDB sentiment classification, with an in-memory vocab"""
+
+    def __init__(self, vocab_list):
+        super().__init__()
+        self.vocab = vocab_list
+
+    def feature_encoders(self, data_dir):
+        encoder = text_encoder.TokenTextEncoder(vocab_filename=None, vocab_list=self.vocab)
+
+        return {
+            "inputs": encoder,
+            "targets": text_encoder.ClassLabelEncoder(self.class_labels(data_dir)),
+        }
diff --git a/examples/reviews/implementations/transformers/tokenize_string_to_int.py b/examples/reviews/implementations/transformers/tokenize_string_to_int.py
index e20816155d..1ab078cc69 100644
--- a/examples/reviews/implementations/transformers/tokenize_string_to_int.py
+++ b/examples/reviews/implementations/transformers/tokenize_string_to_int.py
@@ -6,21 +6,16 @@
 def transform_python(sample, args):
     text = sample["col"].lower()
     token_index_list = []
-
-    reverse_vocab = args["vocab"]
-    stop_words = args["stop_words"]
-    reserved_indices = args["reserved_indices"]
+    vocab = args["vocab"]
 
     for token in non_word.split(text):
         if len(token) == 0:
             continue
-        if token in stop_words:
-            continue
-        token_index_list.append(reverse_vocab.get(token, reserved_indices["<UNK>"]))
+        token_index_list.append(vocab.get(token, vocab["<UNK>"]))
         if len(token_index_list) == args["max_len"]:
             break
 
     for i in range(args["max_len"] - len(token_index_list)):
-        token_index_list.append(reserved_indices["<PAD>"])
+        token_index_list.append(vocab["<PAD>"])
 
     return token_index_list
diff --git a/examples/reviews/requirements.txt b/examples/reviews/requirements.txt
new file mode 100644
index 0000000000..267b3a6d6a
--- /dev/null
+++ b/examples/reviews/requirements.txt
@@ -0,0 +1 @@
+tensor2tensor==1.10.0
diff --git a/examples/reviews/resources/aggregators.yaml b/examples/reviews/resources/aggregators.yaml
deleted file mode 100644
index af746ffa1f..0000000000
--- a/examples/reviews/resources/aggregators.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-- kind: aggregator
-  name: vocab
-  output_type: {STRING: INT}
-  inputs:
-    columns:
-      col: STRING_COLUMN
-    args:
-      vocab_size: INT
-      reserved_indices: {STRING: INT}
-
-- kind: aggregator
-  name: max_length
-  inputs:
-    columns:
-      col: STRING_COLUMN
-  output_type: INT
diff --git a/examples/reviews/resources/apis.yaml b/examples/reviews/resources/apis.yaml
index 85817007f4..f8c57ed046 100644
--- a/examples/reviews/resources/apis.yaml
+++ b/examples/reviews/resources/apis.yaml
@@ -9,3 +9,9 @@
   model_name: sentiment_linear
   compute:
     replicas: 1
+
+- kind: api
+  name: sentiment-t2t
+  model_name: t2t_transformer
+  compute:
+    replicas: 1
diff --git a/examples/reviews/resources/constants.yaml b/examples/reviews/resources/constants.yaml
deleted file mode 100644
index 9d412701a2..0000000000
--- a/examples/reviews/resources/constants.yaml
+++ /dev/null
@@ -1,192 +0,0 @@
-- kind: constant
-  name: reserved_indices
-  type: {STRING: INT}
-  value:
-    "<PAD>": 0
-    "<UNK>": 1
-
-- kind: constant
-  name: english_stop_words
-  type: {STRING: BOOL}
-  value:
-    "i": true
-    "me": true
-    "my": true
-    "myself": true
-    "we": true
-    "our": true
-    "ours": true
-    "ourselves": true
-    "you": true
-    "your": true
-    "yours": true
-    "yourself": true
-    "yourselves": true
-    "he": true
-    "him": true
-    "his": true
-    "himself": true
-    "she": true
-    "her": true
-    "hers": true
-    "herself": true
-    "it": true
-    "its": true
-    "itself": true
-    "they": true
-    "them": true
-    "their": true
-    "theirs": true
-    "themselves": true
-    "what": true
-    "which": true
-    "who": true
-    "whom": true
-    "this": true
-    "that": true
-    "these": true
-    "those": true
-    "am": true
-    "is": true
-    "are": true
-    "was": true
-    "were": true
-    "be": true
-    "been": true
-    "being": true
-    "have": true
-    "has": true
-    "had": true
-    "having": true
-    "do": true
-    "does": true
-    "did": true
-    "doing": true
-    "a": true
-    "an": true
-    "the": true
-    "and": true
-    "but": true
-    "if": true
-    "or": true
-    "because": true
-    "as": true
-    "until": true
-    "while": true
-    "of": true
-    "at": true
-    "by": true
-    "for": true
-    "with": true
-    "about": true
-    "against": true
-    "between": true
-    "into": true
-    "through": true
-    "during": true
-    "before": true
-    "after": true
-    "above": true
-    "below": true
-    "to": true
-    "from": true
-    "up": true
-    "down": true
-    "in": true
-    "out": true
-    "on": true
-    "off": true
-    "over": true
-    "under": true
-    "again": true
-    "further": true
-    "then": true
-    "once": true
-    "here": true
-    "there": true
-    "when": true
-    "where": true
-    "why": true
-    "how": true
-    "all": true
-    "any": true
-    "both": true
-    "each": true
-    "few": true
-    "more": true
-    "most": true
-    "other": true
-    "some": true
-    "such": true
-    "no": true
-    "nor": true
-    "not": true
-    "only": true
-    "own": true
-    "same": true
-    "so": true
-    "than": true
-    "too": true
-    "very": true
-    "s": true
-    "t": true
-    "can": true
-    "will": true
-    "just": true
-    "don": true
-    "should": true
-    "now": true
-    "i'll": true
-    "you'll": true
-    "he'll": true
-    "she'll": true
-    "we'll": true
-    "they'll": true
-    "i'd": true
-    "you'd": true
-    "he'd": true
-    "she'd": true
-    "we'd": true
-    "they'd": true
-    "i'm": true
-    "you're": true
-    "he's": true
-    "she's": true
-    "it's": true
-    "we're": true
-    "they're": true
-    "i've": true
-    "we've": true
-    "you've": true
-    "they've": true
-    "isn't": true
-    "aren't": true
-    "wasn't": true
-    "weren't": true
-    "haven't": true
-    "hasn't": true
-    "hadn't": true
-    "don't": true
-    "doesn't": true
-    "didn't": true
-    "won't": true
-    "wouldn't": true
-    "shan't": true
-    "shouldn't": true
-    "mustn't": true
-    "can't": true
-    "couldn't": true
-    "cannot": true
-    "could": true
-    "here's": true
-    "how's": true
-    "let's": true
-    "ought": true
-    "that's": true
-    "there's": true
-    "what's": true
-    "when's": true
-    "where's": true
-    "who's": true
-    "why's": true
-    "would": true
diff --git a/examples/reviews/resources/environments.yaml b/examples/reviews/resources/environments.yaml
deleted file mode 100644
index 992d784ecc..0000000000
--- a/examples/reviews/resources/environments.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-- kind: environment
-  name: dev
-  data:
-    type: csv
-    path: s3a://cortex-examples/reviews.csv
-    csv_config:
-      header: true
-      escape: "\""
-    schema: ["review", "label"]
diff --git a/examples/reviews/resources/max_length.yaml b/examples/reviews/resources/max_length.yaml
new file mode 100644
index 0000000000..d5f642e096
--- /dev/null
+++ b/examples/reviews/resources/max_length.yaml
@@ -0,0 +1,13 @@
+- kind: aggregator
+  name: max_length
+  inputs:
+    columns:
+      col: STRING_COLUMN
+  output_type: INT
+
+- kind: aggregate
+  name: max_review_length
+  aggregator: max_length
+  inputs:
+    columns:
+      col: review
diff --git a/examples/reviews/resources/models.yaml b/examples/reviews/resources/models.yaml
index 2c9a1d3ba0..41142466d6 100644
--- a/examples/reviews/resources/models.yaml
+++ b/examples/reviews/resources/models.yaml
@@ -30,3 +30,24 @@
   training:
     batch_size: 64
     num_steps: 5000
+
+- kind: model
+  name: t2t_transformer
+  type: classification
+  target_column: label_indexed
+  feature_columns:
+    - embedding_input
+  aggregates:
+    - max_review_length
+    - reviews_vocab
+  prediction_key: outputs
+  data_partition_ratio:
+    training: 0.8
+    evaluation: 0.2
+  training:
+    batch_size: 16
+    num_steps: 250000
+  evaluation:
+    start_delay_secs: 1
+  compute:
+    gpu: 1
diff --git a/examples/reviews/resources/raw_columns.yaml b/examples/reviews/resources/raw_columns.yaml
index a7fd7946b7..5afb119f67 100644
--- a/examples/reviews/resources/raw_columns.yaml
+++ b/examples/reviews/resources/raw_columns.yaml
@@ -1,3 +1,13 @@
+- kind: environment
+  name: dev
+  data:
+    type: csv
+    path: s3a://cortex-examples/reviews.csv
+    csv_config:
+      header: true
+      escape: "\""
+    schema: ["review", "label"]
+
 - kind: raw_column
   name: review
   type: STRING_COLUMN
diff --git a/examples/reviews/resources/transformed_columns.yaml b/examples/reviews/resources/tokenized_columns.yaml
similarity index 65%
rename from examples/reviews/resources/transformed_columns.yaml
rename to examples/reviews/resources/tokenized_columns.yaml
index a8f9861497..225bd38f60 100644
--- a/examples/reviews/resources/transformed_columns.yaml
+++ b/examples/reviews/resources/tokenized_columns.yaml
@@ -1,3 +1,13 @@
+- kind: transformer
+  name: tokenize_string_to_int
+  output_type: INT_LIST_COLUMN
+  inputs:
+    columns:
+      col: STRING_COLUMN
+    args:
+      max_len: INT
+      vocab: {STRING: INT}
+
 - kind: transformed_column
   name: embedding_input
   transformer: tokenize_string_to_int
@@ -6,9 +16,7 @@
       col: review
     args:
       max_len: max_review_length
-      stop_words: english_stop_words
       vocab: reviews_vocab
-      reserved_indices: reserved_indices
 
 - kind: transformed_column
   name: label_indexed
diff --git a/examples/reviews/resources/transformers.yaml b/examples/reviews/resources/transformers.yaml
deleted file mode 100644
index 890e7f07fb..0000000000
--- a/examples/reviews/resources/transformers.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-- kind: transformer
-  name: tokenize_string_to_int
-  output_type: INT_LIST_COLUMN
-  inputs:
-    columns:
-      col: STRING_COLUMN
-    args:
-      max_len: INT
-      stop_words: {STRING: BOOL}
-      vocab: {STRING: INT}
-      reserved_indices: {STRING: INT}
diff --git a/examples/reviews/resources/aggregates.yaml b/examples/reviews/resources/vocab.yaml
similarity index 68%
rename from examples/reviews/resources/aggregates.yaml
rename to examples/reviews/resources/vocab.yaml
index c9b6edd1de..169965f2e9 100644
--- a/examples/reviews/resources/aggregates.yaml
+++ b/examples/reviews/resources/vocab.yaml
@@ -1,19 +1,20 @@
-- kind: aggregate
-  name: reviews_vocab
-  aggregator: vocab
+- kind: aggregator
+  name: vocab
+  output_type: {STRING: INT}
   inputs:
     columns:
-      col: review
+      col: STRING_COLUMN
     args:
-      vocab_size: 10000
-      reserved_indices: reserved_indices
+      vocab_size: INT
 
 - kind: aggregate
-  name: max_review_length
-  aggregator: max_length
+  name: reviews_vocab
+  aggregator: vocab
   inputs:
     columns:
       col: review
+    args:
+      vocab_size: 10000
 
 - kind: aggregate
   name: label_index
diff --git a/images/spark-base/Dockerfile b/images/spark-base/Dockerfile
index 352702b771..dca5784f8f 100644
--- a/images/spark-base/Dockerfile
+++ b/images/spark-base/Dockerfile
@@ -11,7 +11,7 @@ RUN apt-get update -qq && apt-get install -y -q \
 RUN mkdir -p /opt
 
 ARG HADOOP_VERSION="2.9.2"
-ARG SPARK_VERSION="2.4.0"
+ARG SPARK_VERSION="2.4.1"
 ARG TF_VERSION="1.12.0"
 # Check aws-java-sdk-bundle dependency version: https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/$HADOOP_VERSION
 ARG AWS_JAVA_SDK_VERSION="1.11.199"
diff --git a/pkg/workloads/lib/package.py b/pkg/workloads/lib/package.py
index 6506e5250e..b7e228706f 100644
--- a/pkg/workloads/lib/package.py
+++ b/pkg/workloads/lib/package.py
@@ -39,7 +39,7 @@ def get_build_order(python_packages):
 
 
 def get_restricted_packages():
-    req_list = ["pyspark==2.4.0", "tensorflow==1.12.0"]
+    req_list = ["pyspark==2.4.1", "tensorflow==1.12.0"]
     req_files = glob.glob("/src/**/requirements.txt", recursive=True)
 
     for req_file in req_files: