Skip to content
Merged
Show file tree
Hide file tree
Changes from 50 commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
072b014
Add stop words removers
SARAVANA1501 Oct 9, 2020
3cd4e0e
Make constructor to public
SARAVANA1501 Oct 10, 2020
fb203d7
Git remove lines to debug
SARAVANA1501 Oct 10, 2020
781a42b
Create df from debug
SARAVANA1501 Oct 10, 2020
7d8aa8a
Create output df
SARAVANA1501 Oct 10, 2020
a2a0c69
Convert df to column to struct array
SARAVANA1501 Oct 10, 2020
04da539
Convert df to column to struct array
SARAVANA1501 Oct 10, 2020
859ff18
Load functionality to stop words remover
SARAVANA1501 Oct 10, 2020
4c303e2
Add locale functionality to StopWordsRemover
SARAVANA1501 Oct 10, 2020
2638ba7
Add set sensitivity functionality to StopWordsRemover
SARAVANA1501 Oct 10, 2020
9fab3bf
Add set custom stop words functionality to StopWordsRemover
SARAVANA1501 Oct 10, 2020
d4b127a
Update expected locale
SARAVANA1501 Oct 10, 2020
a6f838d
Update case sensitivity to tet locale
SARAVANA1501 Oct 10, 2020
88b6f9d
Remove set local from function call
SARAVANA1501 Oct 11, 2020
224e246
Convert string array to ienumerable
SARAVANA1501 Oct 11, 2020
7bb7855
Support local support after spark v2.4.0
SARAVANA1501 Oct 11, 2020
551435b
Add transform schema method
SARAVANA1501 Oct 11, 2020
f625ede
Type Microsoft.Spark.Sql.Types.StructType not supported yet so revert…
SARAVANA1501 Oct 11, 2020
cce01f5
Add transform schema
SARAVANA1501 Oct 11, 2020
795b9cf
Send struct type using from json method
SARAVANA1501 Oct 11, 2020
8319e81
Update input schema for transformSchema
SARAVANA1501 Oct 11, 2020
15ce559
Update string to array string in expected schema
SARAVANA1501 Oct 11, 2020
1408fbe
Use explicit types and add description for tests
SARAVANA1501 Oct 12, 2020
681aba7
Refactor stop words remover feature.
SARAVANA1501 Oct 13, 2020
fee620a
Add load default words functionality
SARAVANA1501 Oct 13, 2020
4a98ee3
Refactor stop words remover
SARAVANA1501 Oct 21, 2020
43b9b7d
Refactor stop words remover
SARAVANA1501 Oct 21, 2020
fa54268
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 24, 2020
cb0a596
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
61b67e3
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
5f6c046
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
80f400a
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
8623c1a
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
ca64a6a
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
a3b8286
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
bb4f4e9
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
524ba73
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
e752b9b
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
b47e361
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
93d4608
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
66987ba
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
8c165b5
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
692598a
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
1e1f897
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
ff3948e
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
fdb1ed9
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
b3480c6
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Oct 28, 2020
f84af25
Update src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/StopWor…
SARAVANA1501 Oct 28, 2020
59ae57e
Updata documentations
SARAVANA1501 Oct 30, 2020
65c31e7
Merge branch 'master' into StopWordsRemover
suhsteve Nov 3, 2020
3a51226
Update src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/StopWor…
SARAVANA1501 Nov 7, 2020
e4553a0
Update src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/StopWor…
SARAVANA1501 Nov 7, 2020
36d2048
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Nov 17, 2020
d882c3c
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Nov 17, 2020
2f3879f
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Nov 17, 2020
ba2673b
[Tech] Saravana Refactor code
SARAVANA1501 Nov 18, 2020
f0b4f90
Merge branch 'master' into StopWordsRemover
SARAVANA1501 Nov 19, 2020
7d07cb9
Update src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
SARAVANA1501 Nov 22, 2020
d8ce37c
Update src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/StopWor…
imback82 Nov 23, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.IO;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using Microsoft.Spark.UnitTest.TestUtils;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class StopWordsRemoverTests : FeatureBaseTests<StopWordsRemover>
{
private readonly SparkSession _spark;

/// <summary>
/// Passes the fixture to the base test class and keeps the fixture's
/// <see cref="SparkSession"/> for use by the tests in this class.
/// </summary>
/// <param name="fixture">Fixture that provides the Spark session.</param>
public StopWordsRemoverTests(SparkFixture fixture) : base(fixture) =>
    _spark = fixture.Spark;

/// <summary>
/// Exercises the <see cref="StopWordsRemover"/> API surface that does not involve locale:
/// the fluent setters and their matching getters, <c>LoadDefaultStopWords</c>, a
/// Save/Load round trip, <c>TransformSchema</c> and <c>Transform</c>.
/// Locale is left out here because it is not supported before Spark 2.4.0.
/// </summary>
[Fact]
public void TestStopWordsRemoverWithoutLocale()
{
// Values handed to the setters below; the getters are expected to return them unchanged.
string expectedUid = "theUidWithOutLocale";
string expectedInputCol = "input_col";
string expectedOutputCol = "output_col";
bool expectedCaseSensitive = false;
var expectedStopWords = new string[] { "test1", "test2" };

// Single-row input whose only column, input_col, is produced by splitting a sentence on spaces.
DataFrame input = _spark.Sql("SELECT split('Hi I heard about Spark', ' ') as input_col");

// Configure the remover through the chained setters.
StopWordsRemover stopWordsRemover = new StopWordsRemover(expectedUid)
.SetInputCol(expectedInputCol)
.SetOutputCol(expectedOutputCol)
.SetCaseSensitive(expectedCaseSensitive)
.SetStopWords(expectedStopWords);

// Every getter must report the value that was configured above.
Assert.Equal(expectedUid, stopWordsRemover.Uid());
Assert.Equal(expectedInputCol, stopWordsRemover.GetInputCol());
Assert.Equal(expectedOutputCol, stopWordsRemover.GetOutputCol());
Assert.Equal(expectedCaseSensitive, stopWordsRemover.GetCaseSensitive());
Assert.Equal(expectedStopWords, stopWordsRemover.GetStopWords());
// The default stop word list for "english" must contain at least one entry.
Assert.NotEmpty(StopWordsRemover.LoadDefaultStopWords("english"));

// Save/Load round trip inside a temporary directory; the loaded instance must keep the Uid.
using (TemporaryDirectory tempDirectory = new TemporaryDirectory())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: var

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ping?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I committed this change. I will merge this PR after CI passes.

{
string savePath = Path.Join(tempDirectory.Path, "StopWordsRemover");
stopWordsRemover.Save(savePath);

StopWordsRemover loadedStopWordsRemover = StopWordsRemover.Load(savePath);
Assert.Equal(stopWordsRemover.Uid(), loadedStopWordsRemover.Uid());
}

// Only the returned types are checked here, not the transformed contents.
Assert.IsType<StructType>(stopWordsRemover.TransformSchema(input.Schema()));
Assert.IsType<DataFrame>(stopWordsRemover.Transform(input));

// Shared checks inherited from FeatureBaseTests (defined outside this file), run against
// the "inputCol" parameter and the value "input_col".
TestFeatureBase(stopWordsRemover, "inputCol", "input_col");
}

/// <summary>
/// Verifies that a locale set through <c>SetLocale</c> is returned by <c>GetLocale</c>.
/// The test is skipped when the Spark version is lower than 2.4.0.
/// </summary>
[SkipIfSparkVersionIsLessThan(Versions.V2_4_0)]
public void TestStopWordsRemoverWithLocale()
{
    const string locale = "en_GB";

    StopWordsRemover remover = new StopWordsRemover();
    StopWordsRemover configured = remover.SetLocale(locale);

    Assert.Equal(locale, configured.GetLocale());
}
}
}
173 changes: 173 additions & 0 deletions src/csharp/Microsoft.Spark/ML/Feature/StopWordsRemover.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Collections.Generic;
using Microsoft.Spark.Interop;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;

namespace Microsoft.Spark.ML.Feature
{
/// <summary>
/// Feature transformer that filters stop words out of its input. Every member forwards to
/// the JVM class <c>org.apache.spark.ml.feature.StopWordsRemover</c> through the
/// underlying JVM object reference.
/// </summary>
public class StopWordsRemover : FeatureBase<StopWordsRemover>, IJvmObjectReferenceProvider
{
    // Fully qualified name of the JVM class this wrapper creates and calls into.
    private static readonly string s_jvmClassName =
        "org.apache.spark.ml.feature.StopWordsRemover";

    /// <summary>
    /// Creates a <see cref="StopWordsRemover"/> without any parameters.
    /// </summary>
    public StopWordsRemover() : base(s_jvmClassName)
    {
    }

    /// <summary>
    /// Creates a <see cref="StopWordsRemover"/> with the given UID, which is used to give
    /// the <see cref="StopWordsRemover"/> a unique ID.
    /// </summary>
    /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
    public StopWordsRemover(string uid) : base(s_jvmClassName, uid)
    {
    }

    /// <summary>
    /// Wraps an already existing JVM object reference.
    /// </summary>
    /// <param name="jvmObject">Reference to the JVM-side object.</param>
    internal StopWordsRemover(JvmObjectReference jvmObject) : base(jvmObject)
    {
    }

    /// <summary>
    /// Reference to the JVM object backing this instance.
    /// </summary>
    JvmObjectReference IJvmObjectReferenceProvider.Reference
    {
        get { return _jvmObject; }
    }

    /// <summary>
    /// Sets the column that the <see cref="StopWordsRemover"/> should read from.
    /// </summary>
    /// <param name="value">The name of the column to use as the source</param>
    /// <returns>New <see cref="StopWordsRemover"/> object</returns>
    public StopWordsRemover SetInputCol(string value)
    {
        object result = _jvmObject.Invoke("setInputCol", value);
        return WrapAsStopWordsRemover(result);
    }

    /// <summary>
    /// Sets the name of the new column that the <see cref="StopWordsRemover"/> will create
    /// in the DataFrame.
    /// </summary>
    /// <param name="value">The name of the column to use as the target</param>
    /// <returns>New <see cref="StopWordsRemover"/> object</returns>
    public StopWordsRemover SetOutputCol(string value)
    {
        object result = _jvmObject.Invoke("setOutputCol", value);
        return WrapAsStopWordsRemover(result);
    }

    /// <summary>
    /// Executes the <see cref="StopWordsRemover"/> and transforms the DataFrame to include
    /// the new column.
    /// </summary>
    /// <param name="source">The DataFrame to transform</param>
    /// <returns>
    /// New <see cref="DataFrame"/> object with the source <see cref="DataFrame"/> transformed
    /// </returns>
    public DataFrame Transform(DataFrame source)
    {
        JvmObjectReference transformed =
            (JvmObjectReference)_jvmObject.Invoke("transform", source);
        return new DataFrame(transformed);
    }

    /// <summary>
    /// Gets the column that the <see cref="StopWordsRemover"/> should read from.
    /// </summary>
    /// <returns>Input column name</returns>
    public string GetInputCol()
    {
        object inputCol = _jvmObject.Invoke("getInputCol");
        return (string)inputCol;
    }

    /// <summary>
    /// Gets the name of the new column that the <see cref="StopWordsRemover"/> will create
    /// in the DataFrame.
    /// </summary>
    /// <returns>The output column name</returns>
    public string GetOutputCol()
    {
        object outputCol = _jvmObject.Invoke("getOutputCol");
        return (string)outputCol;
    }

    /// <summary>
    /// Sets the locale for the <see cref="StopWordsRemover"/> transform.
    /// Refer to <c>java.util.Locale.getAvailableLocales()</c> for all available locales.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the accompanying E2E test is skipped below Spark 2.4.0, so locale
    /// support presumably requires Spark 2.4.0 or later; confirm.
    /// </remarks>
    /// <param name="value">Locale to be used for transform</param>
    /// <returns>New <see cref="StopWordsRemover"/> object</returns>
    public StopWordsRemover SetLocale(string value)
    {
        object result = _jvmObject.Invoke("setLocale", value);
        return WrapAsStopWordsRemover(result);
    }

    /// <summary>
    /// Gets the locale for the <see cref="StopWordsRemover"/> transform.
    /// </summary>
    /// <returns>The locale</returns>
    public string GetLocale()
    {
        object locale = _jvmObject.Invoke("getLocale");
        return (string)locale;
    }

    /// <summary>
    /// Sets case sensitivity.
    /// </summary>
    /// <param name="value">true if case sensitive, false otherwise</param>
    /// <returns>New <see cref="StopWordsRemover"/> object</returns>
    public StopWordsRemover SetCaseSensitive(bool value)
    {
        object result = _jvmObject.Invoke("setCaseSensitive", value);
        return WrapAsStopWordsRemover(result);
    }

    /// <summary>
    /// Gets case sensitivity.
    /// </summary>
    /// <returns>true if case sensitive, false otherwise</returns>
    public bool GetCaseSensitive()
    {
        object caseSensitive = _jvmObject.Invoke("getCaseSensitive");
        return (bool)caseSensitive;
    }

    /// <summary>
    /// Sets custom stop words.
    /// </summary>
    /// <param name="values">Custom stop words</param>
    /// <returns>New <see cref="StopWordsRemover"/> object</returns>
    public StopWordsRemover SetStopWords(IEnumerable<string> values)
    {
        object result = _jvmObject.Invoke("setStopWords", values);
        return WrapAsStopWordsRemover(result);
    }

    /// <summary>
    /// Gets the custom stop words.
    /// </summary>
    /// <returns>Custom stop words</returns>
    public IEnumerable<string> GetStopWords()
    {
        object stopWords = _jvmObject.Invoke("getStopWords");
        return (IEnumerable<string>)stopWords;
    }

    /// <summary>
    /// Checks transform validity and derives the output schema from the input schema.
    ///
    /// This checks for validity of interactions between parameters during Transform and
    /// raises an exception if any parameter value is invalid.
    ///
    /// Typical implementation should first conduct verification on schema change and
    /// parameter validity, including complex parameter interaction checks.
    /// </summary>
    /// <param name="value">
    /// The <see cref="StructType"/> of the <see cref="DataFrame"/> which will be transformed.
    /// </param>
    /// <returns>
    /// The <see cref="StructType"/> of the output schema that would have been derived from
    /// the input schema, if Transform had been called.
    /// </returns>
    public StructType TransformSchema(StructType value)
    {
        // The input schema is passed to the JVM call as the result of DataType.FromJson
        // applied to the schema's JSON representation.
        var jvmInputSchema = DataType.FromJson(_jvmObject.Jvm, value.Json);
        JvmObjectReference jvmOutputSchema =
            (JvmObjectReference)_jvmObject.Invoke("transformSchema", jvmInputSchema);
        return new StructType(jvmOutputSchema);
    }

    /// <summary>
    /// Loads the default stop words for the given language.
    /// Supported languages: danish, dutch, english, finnish, french, german,
    /// hungarian, italian, norwegian, portuguese, russian, spanish, swedish, turkish.
    /// </summary>
    /// <param name="language">Language</param>
    /// <returns>Default stop words for the given language</returns>
    public static string[] LoadDefaultStopWords(string language)
    {
        object words = SparkEnvironment.JvmBridge.CallStaticJavaMethod(
            s_jvmClassName, "loadDefaultStopWords", language);
        return (string[])words;
    }

    /// <summary>
    /// Loads the <see cref="StopWordsRemover"/> that was previously saved using Save.
    /// </summary>
    /// <param name="path">
    /// The path the previous <see cref="StopWordsRemover"/> was saved to
    /// </param>
    /// <returns>New <see cref="StopWordsRemover"/> object, loaded from path</returns>
    public static StopWordsRemover Load(string path)
    {
        object loaded = SparkEnvironment.JvmBridge.CallStaticJavaMethod(
            s_jvmClassName, "load", path);
        return WrapAsStopWordsRemover(loaded);
    }

    // Casts the raw bridge result to a JVM object reference and wraps it in a new
    // StopWordsRemover.
    private static StopWordsRemover WrapAsStopWordsRemover(object obj)
    {
        JvmObjectReference reference = (JvmObjectReference)obj;
        return new StopWordsRemover(reference);
    }
}
}