From 0430d86d316899bb249feaf13f44eaab98729458 Mon Sep 17 00:00:00 2001 From: xusliebana Date: Wed, 10 Nov 2021 12:36:23 +0100 Subject: [PATCH 1/2] add multiple columns interface --- .../nlp/HasInputAnnotationCols.scala | 2 +- .../nlp/HasMultipleInputAnnotationCols.scala | 34 +++++++++++++ .../MultiColumnApproach.scala | 40 +++++++++++++++ .../MultiColumnsModel.scala | 43 ++++++++++++++++ .../MultiannotationsSpec.scala | 51 +++++++++++++++++++ 5 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 src/main/scala/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnApproach.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnsModel.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiannotationsSpec.scala diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasInputAnnotationCols.scala b/src/main/scala/com/johnsnowlabs/nlp/HasInputAnnotationCols.scala index 15414bbf6b7568..dca299e2fb6dd7 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasInputAnnotationCols.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasInputAnnotationCols.scala @@ -34,7 +34,7 @@ trait HasInputAnnotationCols extends Params { new StringArrayParam(this, "inputCols", "the input annotation columns") /** Overrides required annotators column if different than default */ - final def setInputCols(value: Array[String]): this.type = { + def setInputCols(value: Array[String]): this.type = { require( value.length == inputAnnotatorTypes.length, s"setInputCols in ${this.uid} expecting ${inputAnnotatorTypes.length} columns. 
" + diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.scala b/src/main/scala/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.scala new file mode 100644 index 00000000000000..1d9776e8c90d44 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.scala @@ -0,0 +1,34 @@ +/* + * Copyright 2017-2021 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp + +import com.johnsnowlabs.nlp.AnnotatorType.CHUNK +import org.apache.spark.ml.param.{Params, StringArrayParam} +import org.apache.spark.sql.types.StructType + +trait HasMultipleInputAnnotationCols extends HasInputAnnotationCols { + + val inputAnnotatorType: String + + lazy override val inputAnnotatorTypes: Array[String] = getInputCols.map(_ =>inputAnnotatorType) + + override def setInputCols(value: Array[String]): this.type = { + set(inputCols, value) + } + + +} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnApproach.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnApproach.scala new file mode 100644 index 00000000000000..5bc6763f251e83 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnApproach.scala @@ -0,0 +1,40 @@ +package com.johnsnowlabs.nlp.annotators.multipleannotations + +import com.johnsnowlabs.nlp.{AnnotatorApproach, HasMultipleInputAnnotationCols} +import 
com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT} +import org.apache.spark.ml.PipelineModel +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.Dataset + + +class MultiColumnApproach(override val uid: String) extends AnnotatorApproach[MultiColumnsModel] with HasMultipleInputAnnotationCols{ + + def this() = this(Identifiable.randomUID("multiplecolums")) + override val description: String = "Example multiple columns" + + /** + * Input annotator types: CHUNK + * + * @group anno + */ + override val outputAnnotatorType: AnnotatorType = DOCUMENT + /** + * Output annotator types: CHUNK, CHUNK + * + * @group anno + */ + override val inputAnnotatorType: AnnotatorType = DOCUMENT + + + /** whether to merge overlapping matched chunks. Defaults to true + * + * @group param + * */ + + override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): MultiColumnsModel = { + + new MultiColumnsModel().setInputCols($(inputCols)).setOutputCol($(outputCol)) + } + + +} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnsModel.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnsModel.scala new file mode 100644 index 00000000000000..9bf9c4ff26e19b --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnsModel.scala @@ -0,0 +1,43 @@ +package com.johnsnowlabs.nlp.annotators.multipleannotations + +import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT} +import com.johnsnowlabs.nlp._ +import org.apache.spark.ml.util.Identifiable + + +class MultiColumnsModel(override val uid: String) extends AnnotatorModel[MultiColumnsModel] + with HasMultipleInputAnnotationCols + with HasSimpleAnnotate[MultiColumnsModel]{ + + def this() = this(Identifiable.randomUID("MERGE")) + + + /** + * Input annotator types: CHUNK + * + * @group anno + */ + override val outputAnnotatorType: AnnotatorType = DOCUMENT + + + /** + * Multiple columns + * + * @group 
anno + */ + + override val inputAnnotatorType: String = DOCUMENT + + /** + * Merges columns of chunk Annotations while considering false positives and replacements. + * @param annotations a Sequence of chunks to merge + * @return a Sequence of Merged CHUNK Annotations + */ + override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { + annotations + } + + +} + + diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiannotationsSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiannotationsSpec.scala new file mode 100644 index 00000000000000..8e4b3aa7e5b12a --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiannotationsSpec.scala @@ -0,0 +1,51 @@ +package com.johnsnowlabs.nlp.annotators.multipleannotations + +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector +import com.johnsnowlabs.nlp.{ContentProvider, DocumentAssembler, LightPipeline, RecursivePipeline, SparkAccessor} +import com.johnsnowlabs.nlp.annotators.{TextMatcher, Tokenizer} +import com.johnsnowlabs.nlp.util.io.ReadAs +import com.johnsnowlabs.tags.FastTest +import org.apache.spark.ml.Pipeline +import org.scalatest.flatspec.AnyFlatSpec + +class MultiannotationsSpec extends AnyFlatSpec { + import SparkAccessor.spark.implicits._ + + "An multiple anootator chunks" should "transform data " taggedAs FastTest in { + val data = SparkAccessor.spark.sparkContext.parallelize(Seq("Example text")).toDS().toDF("text") + + val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val documentAssembler2 = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document2") + + val documentAssembler3 = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document3") + + val multipleColumns = new MultiColumnApproach().setInputCols("document","document2","document3").setOutputCol("merge") + + val pipeline = new Pipeline() + 
.setStages(Array( + documentAssembler, + documentAssembler2, + documentAssembler3, + multipleColumns + )) + + val pipelineModel = pipeline.fit(data) + + pipelineModel.transform(data).show(truncate = false) + + val result = new LightPipeline(pipelineModel).annotate("My document") + + println(result) + + } + + + +} From 3f5236433a342bfe926d86d086bded1628883e18 Mon Sep 17 00:00:00 2001 From: xusliebana Date: Wed, 10 Nov 2021 16:43:28 +0100 Subject: [PATCH 2/2] add scala doc --- .../nlp/HasMultipleInputAnnotationCols.scala | 12 ++++++---- ...sSpec.scala => MultiAnnotationsSpec.scala} | 19 ++++++++------- .../MultiColumnApproach.scala | 10 ++------ .../MultiColumnsModel.scala | 24 ++++++------------- 4 files changed, 27 insertions(+), 38 deletions(-) rename src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/{MultiannotationsSpec.scala => MultiAnnotationsSpec.scala} (63%) diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.scala b/src/main/scala/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.scala index 1d9776e8c90d44..3e2779d3270e9a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.scala @@ -16,16 +16,20 @@ package com.johnsnowlabs.nlp -import com.johnsnowlabs.nlp.AnnotatorType.CHUNK -import org.apache.spark.ml.param.{Params, StringArrayParam} -import org.apache.spark.sql.types.StructType - +/** + * Trait used to create annotators with input columns of variable length. + * */ trait HasMultipleInputAnnotationCols extends HasInputAnnotationCols { + /** Annotator reference id. The Annotator type is the same for any of the input columns*/ val inputAnnotatorType: String lazy override val inputAnnotatorTypes: Array[String] = getInputCols.map(_ =>inputAnnotatorType) + /** + * Columns that contain annotations necessary to run this annotator + * AnnotatorType is the same for all input columns in that annotator. 
+ */ override def setInputCols(value: Array[String]): this.type = { set(inputCols, value) } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiannotationsSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiAnnotationsSpec.scala similarity index 63% rename from src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiannotationsSpec.scala rename to src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiAnnotationsSpec.scala index 8e4b3aa7e5b12a..defbf1563f324d 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiannotationsSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiAnnotationsSpec.scala @@ -1,18 +1,17 @@ package com.johnsnowlabs.nlp.annotators.multipleannotations -import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector -import com.johnsnowlabs.nlp.{ContentProvider, DocumentAssembler, LightPipeline, RecursivePipeline, SparkAccessor} -import com.johnsnowlabs.nlp.annotators.{TextMatcher, Tokenizer} -import com.johnsnowlabs.nlp.util.io.ReadAs +import com.johnsnowlabs.nlp.{DocumentAssembler, LightPipeline, SparkAccessor} import com.johnsnowlabs.tags.FastTest import org.apache.spark.ml.Pipeline import org.scalatest.flatspec.AnyFlatSpec +import com.johnsnowlabs.nlp.Annotation +import org.junit.Assert.assertEquals -class MultiannotationsSpec extends AnyFlatSpec { +class MultiAnnotationsSpec extends AnyFlatSpec { import SparkAccessor.spark.implicits._ "An multiple anootator chunks" should "transform data " taggedAs FastTest in { - val data = SparkAccessor.spark.sparkContext.parallelize(Seq("Example text")).toDS().toDF("text") + val data = SparkAccessor.spark.sparkContext.parallelize(Seq("Example text")).toDS().toDF("text") val documentAssembler = new DocumentAssembler() .setInputCol("text") @@ -26,7 +25,7 @@ class MultiannotationsSpec extends AnyFlatSpec { .setInputCol("text") 
.setOutputCol("document3") - val multipleColumns = new MultiColumnApproach().setInputCols("document","document2","document3").setOutputCol("merge") + val multipleColumns = new MultiColumnApproach().setInputCols("document","document2","document3").setOutputCol("multiple_document") val pipeline = new Pipeline() .setStages(Array( @@ -38,11 +37,13 @@ class MultiannotationsSpec extends AnyFlatSpec { val pipelineModel = pipeline.fit(data) - pipelineModel.transform(data).show(truncate = false) + val annotations = Annotation.collect(pipelineModel.transform(data),"multiple_document").flatten + assertEquals(annotations.length,3) val result = new LightPipeline(pipelineModel).annotate("My document") - println(result) + + assertEquals(result("multiple_document").size,3) } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnApproach.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnApproach.scala index 5bc6763f251e83..4f8932118fe0ba 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnApproach.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnApproach.scala @@ -13,23 +13,17 @@ class MultiColumnApproach(override val uid: String) extends AnnotatorApproach[Mu override val description: String = "Example multiple columns" /** - * Input annotator types: CHUNK + * Output annotator type: DOCUMENT * - * @group anno */ override val outputAnnotatorType: AnnotatorType = DOCUMENT /** - * Output annotator types: CHUNK, CHUNK + * Input annotator type: DOCUMENT * - * @group anno */ override val inputAnnotatorType: AnnotatorType = DOCUMENT - /** whether to merge overlapping matched chunks. 
Defaults to true - * - * @group param - * */ override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): MultiColumnsModel = { diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnsModel.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnsModel.scala index 9bf9c4ff26e19b..f71c2491372a05 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnsModel.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/multipleannotations/MultiColumnsModel.scala @@ -11,28 +11,18 @@ class MultiColumnsModel(override val uid: String) extends AnnotatorModel[MultiCo def this() = this(Identifiable.randomUID("MERGE")) - /** - * Input annotator types: CHUNK - * - * @group anno - */ + * Output annotator type: DOCUMENT + * + */ override val outputAnnotatorType: AnnotatorType = DOCUMENT - /** - * Multiple columns - * - * @group anno - */ + * Input annotator type: DOCUMENT + * + */ + override val inputAnnotatorType: AnnotatorType = DOCUMENT - override val inputAnnotatorType: String = DOCUMENT - /** - * Merges columns of chunk Annotations while considering false positives and replacements. - * @param annotations a Sequence of chunks to merge - * @return a Sequence of Merged CHUNK Annotations - */ override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { annotations }