267 changes: 267 additions & 0 deletions docs/en/annotator_entries/Doc2Vec.md
@@ -0,0 +1,267 @@
{%- capture title -%}
Doc2Vec
{%- endcapture -%}

{%- capture model_description -%}
Word2Vec model that creates vector representations of words in a text corpus.

The algorithm first constructs a vocabulary from the corpus
and then learns vector representation of words in the vocabulary.
The vector representation can be used as features in
natural language processing and machine learning algorithms.
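
Since the output annotation type is `SENTENCE_EMBEDDINGS`, the vectors can be fed directly into downstream annotators that accept sentence embeddings. A minimal sketch (the `"label"` column is an assumption about your training data; `ClassifierDLApproach` is one such downstream annotator):

```scala
// Hedged sketch: using Doc2Vec output as features for a classifier.
// Assumes the pipeline produces an "embeddings" column and the dataset
// has a "label" column.
val classifier = new ClassifierDLApproach()
  .setInputCols("embeddings")
  .setOutputCol("category")
  .setLabelColumn("label")
```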

We use the Word2Vec implementation from Spark ML, which uses the skip-gram model and a hierarchical softmax
method to train the model. The variable names in the implementation match the original C implementation.

This is the instantiated model of the Doc2VecApproach.
For training your own model, please see the documentation of that class.

Pretrained models can be loaded with `pretrained` of the companion object:
```scala
val embeddings = Doc2VecModel.pretrained()
.setInputCols("token")
.setOutputCol("embeddings")
```
The default model is `"doc2vec_wiki_100_uncased"`, if no name is provided.
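
A specific model can also be selected by name and language (a sketch; the names actually available are listed on the Models Hub):

```scala
// Load a pretrained model explicitly by name and language.
val embeddings = Doc2VecModel.pretrained("doc2vec_wiki_100_uncased", "en")
  .setInputCols("token")
  .setOutputCol("embeddings")
```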

For available pretrained models please see the [Models Hub](https://nlp.johnsnowlabs.com/models).

**Sources**:

For the original C implementation, see https://code.google.com/p/word2vec/

For the research paper, see
[Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/abs/1301.3781)
and [Distributed Representations of Words and Phrases and their Compositionality](https://arxiv.org/pdf/1310.4546v1.pdf).
{%- endcapture -%}

{%- capture model_input_anno -%}
TOKEN
{%- endcapture -%}

{%- capture model_output_anno -%}
SENTENCE_EMBEDDINGS
{%- endcapture -%}

{%- capture model_python_example -%}
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")

tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token")

embeddings = Doc2VecModel.pretrained() \
.setInputCols(["token"]) \
.setOutputCol("embeddings")

embeddingsFinisher = EmbeddingsFinisher() \
.setInputCols(["embeddings"]) \
.setOutputCols("finished_embeddings") \
.setOutputAsVector(True)

pipeline = Pipeline().setStages([
documentAssembler,
tokenizer,
embeddings,
embeddingsFinisher
])

data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
result = pipeline.fit(data).transform(data)
result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
+--------------------------------------------------------------------------------+
| result|
+--------------------------------------------------------------------------------+
|[0.06222493574023247,0.011579325422644615,0.009919632226228714,0.109361454844...|
+--------------------------------------------------------------------------------+

{%- endcapture -%}

{%- capture model_scala_example -%}
import spark.implicits._
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotator.{Tokenizer, Doc2VecModel}
import com.johnsnowlabs.nlp.EmbeddingsFinisher

import org.apache.spark.ml.Pipeline

val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")

val tokenizer = new Tokenizer()
.setInputCols(Array("document"))
.setOutputCol("token")

val embeddings = Doc2VecModel.pretrained()
.setInputCols("token")
.setOutputCol("embeddings")

val embeddingsFinisher = new EmbeddingsFinisher()
.setInputCols("embeddings")
.setOutputCols("finished_embeddings")
.setOutputAsVector(true)

val pipeline = new Pipeline().setStages(Array(
documentAssembler,
tokenizer,
embeddings,
embeddingsFinisher
))

val data = Seq("This is a sentence.").toDF("text")
val result = pipeline.fit(data).transform(data)

result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
+--------------------------------------------------------------------------------+
| result|
+--------------------------------------------------------------------------------+
|[0.06222493574023247,0.011579325422644615,0.009919632226228714,0.109361454844...|
+--------------------------------------------------------------------------------+

{%- endcapture -%}

{%- capture model_api_link -%}
[Doc2VecModel](https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/embeddings/Doc2VecModel)
{%- endcapture -%}

{%- capture model_python_api_link -%}
[Doc2VecModel](https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp.annotator.Doc2VecModel.html)
{%- endcapture -%}

{%- capture model_source_link -%}
[Doc2VecModel](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/embeddings/Doc2VecModel.scala)
{%- endcapture -%}

{%- capture approach_description -%}
Trains a Word2Vec model that creates vector representations of words in a text corpus.

The algorithm first constructs a vocabulary from the corpus
and then learns vector representation of words in the vocabulary.
The vector representation can be used as features in
natural language processing and machine learning algorithms.

We use the Word2Vec implementation from Spark ML, which uses the skip-gram model and a hierarchical softmax
method to train the model. The variable names in the implementation match the original C implementation.
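
To make the underlying estimator concrete, here is a minimal, self-contained sketch of Spark ML's `Word2Vec` on a toy corpus (this is the plain Spark ML API, not the Spark NLP annotator):

```scala
import org.apache.spark.ml.feature.Word2Vec
import spark.implicits._

// Toy corpus: each row is a pre-tokenized document (array<string> column).
val corpus = Seq(
  "the quick brown fox".split(" ").toSeq,
  "jumps over the lazy dog".split(" ").toSeq
).toDF("tokens")

val word2Vec = new Word2Vec()
  .setInputCol("tokens")
  .setOutputCol("vector")
  .setVectorSize(3) // tiny dimensionality, for illustration only
  .setMinCount(0)   // keep even rare tokens in this toy corpus

val model = word2Vec.fit(corpus)
// transform() averages the word vectors of each document,
// analogous to how a document-level vector is derived here.
model.transform(corpus).show(truncate = false)
```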

For instantiated/pretrained models, see Doc2VecModel.
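
Training can be tuned through parameters mirroring the Spark ML `Word2Vec` estimator. A hedged sketch (the values are illustrative, not recommendations; see the API reference for the full parameter list):

```scala
val embeddings = new Doc2VecApproach()
  .setInputCols("token")
  .setOutputCol("embeddings")
  .setVectorSize(100) // dimensionality of the learned vectors
  .setWindowSize(5)   // context window around each token
  .setMinCount(1)     // ignore tokens occurring fewer times than this
  .setMaxIter(1)      // number of passes over the corpus
```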

**Sources**:

For the original C implementation, see https://code.google.com/p/word2vec/

For the research paper, see
[Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/abs/1301.3781)
and [Distributed Representations of Words and Phrases and their Compositionality](https://arxiv.org/pdf/1310.4546v1.pdf).
{%- endcapture -%}

{%- capture approach_input_anno -%}
TOKEN
{%- endcapture -%}

{%- capture approach_output_anno -%}
SENTENCE_EMBEDDINGS
{%- endcapture -%}

{%- capture approach_python_example -%}
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")

tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token")

embeddings = Doc2VecApproach() \
.setInputCols(["token"]) \
.setOutputCol("embeddings")

pipeline = Pipeline() \
.setStages([
documentAssembler,
tokenizer,
embeddings
])

path = "sherlockholmes.txt"
dataset = spark.read.text(path).toDF("text")
pipelineModel = pipeline.fit(dataset)

{%- endcapture -%}

{%- capture approach_scala_example -%}
import spark.implicits._
import com.johnsnowlabs.nlp.annotator.{Tokenizer, Doc2VecApproach}
import com.johnsnowlabs.nlp.base.DocumentAssembler
import org.apache.spark.ml.Pipeline

val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")

val tokenizer = new Tokenizer()
.setInputCols(Array("document"))
.setOutputCol("token")

val embeddings = new Doc2VecApproach()
.setInputCols("token")
.setOutputCol("embeddings")

val pipeline = new Pipeline()
.setStages(Array(
documentAssembler,
tokenizer,
embeddings
))

val path = "src/test/resources/spell/sherlockholmes.txt"
val dataset = spark.sparkContext.textFile(path)
.toDF("text")
val pipelineModel = pipeline.fit(dataset)

{%- endcapture -%}

{%- capture approach_api_link -%}
[Doc2VecApproach](https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/embeddings/Doc2VecApproach)
{%- endcapture -%}

{%- capture approach_python_api_link -%}
[Doc2VecApproach](https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp.annotator.Doc2VecApproach.html)
{%- endcapture -%}

{%- capture approach_source_link -%}
[Doc2VecApproach](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/embeddings/Doc2VecApproach.scala)
{%- endcapture -%}


{% include templates/approach_model_template.md
title=title
model_description=model_description
model_input_anno=model_input_anno
model_output_anno=model_output_anno
model_python_example=model_python_example
model_scala_example=model_scala_example
model_api_link=model_api_link
model_python_api_link=model_python_api_link
model_source_link=model_source_link
approach_description=approach_description
approach_input_anno=approach_input_anno
approach_output_anno=approach_output_anno
approach_python_example=approach_python_example
approach_scala_example=approach_scala_example
approach_api_link=approach_api_link
approach_python_api_link=approach_python_api_link
approach_source_link=approach_source_link
%}
4 changes: 3 additions & 1 deletion docs/en/annotators.md
@@ -56,6 +56,7 @@ There are two types of Annotators:
{% include templates/anno_table_entry.md path="" name="DateMatcher" summary="Matches standard date formats into a provided format."%}
{% include templates/anno_table_entry.md path="" name="DependencyParser" summary="Unlabeled parser that finds a grammatical relation between two words in a sentence."%}
{% include templates/anno_table_entry.md path="" name="Doc2Chunk" summary="Converts `DOCUMENT` type annotations into `CHUNK` type with the contents of a `chunkCol`."%}
{% include templates/anno_table_entry.md path="" name="Doc2Vec" summary="Word2Vec model that creates vector representations of words in a text corpus."%}
{% include templates/anno_table_entry.md path="" name="DocumentAssembler" summary="Prepares data into a format that is processable by Spark NLP. This is the entry point for every Spark NLP pipeline."%}
{% include templates/anno_table_entry.md path="" name="DocumentNormalizer" summary="Annotator which normalizes raw text from tagged text, e.g. scraped web pages or xml documents, from document type columns into Sentence."%}
{% include templates/anno_table_entry.md path="" name="EntityRuler" summary="Fits an Annotator to match exact strings or regex patterns provided in a file against a Document and assigns them an named entity."%}
@@ -109,7 +110,8 @@ Additionally, these transformers are available to generate embeddings.
{% include templates/anno_table_entry.md path="./transformers" name="BertForTokenClassification" summary="BertForTokenClassification can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks."%}
{% include templates/anno_table_entry.md path="./transformers" name="BertSentenceEmbeddings" summary="Sentence-level embeddings using BERT. BERT (Bidirectional Encoder Representations from Transformers) provides dense vector representations for natural language by using a deep, pre-trained neural network with the Transformer architecture."%}
{% include templates/anno_table_entry.md path="./transformers" name="DistilBertEmbeddings" summary="DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base."%}
{% include templates/anno_table_entry.md path="./transformers" name="DistilBertForTokenClassification" summary="DistilBertForTokenClassification can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks."%}
{% include templates/anno_table_entry.md path="./transformers" name="DistilBertForSequenceClassification" summary="DistilBertForSequenceClassification can load DistilBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks."%}
{% include templates/anno_table_entry.md path="./transformers" name="DistilBertForTokenClassification" summary="DistilBertForTokenClassification can load DistilBERT Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks."%}
{% include templates/anno_table_entry.md path="./transformers" name="ElmoEmbeddings" summary="Word embeddings from ELMo (Embeddings from Language Models), a language model trained on the 1 Billion Word Benchmark."%}
{% include templates/anno_table_entry.md path="./transformers" name="LongformerEmbeddings" summary="Longformer is a BERT-like model started from the RoBERTa checkpoint and pretrained for MLM on long documents."%}
{% include templates/anno_table_entry.md path="./transformers" name="LongformerForTokenClassification" summary="LongformerForTokenClassification can load Longformer Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks."%}
6 changes: 6 additions & 0 deletions docs/en/training.md
@@ -81,6 +81,12 @@ classes, such as sentiment.
{% include_relative training_entries/SentimentDL.md %}
{% include_relative training_entries/ViveknSentiment.md %}

## Text Representation
These are annotators that can be trained to turn text into a numerical
representation.

{% include_relative training_entries/Doc2VecApproach.md %}

## External Trainable Models
These are annotators that are trained in an external library, which are then
loaded into Spark NLP.