-
Notifications
You must be signed in to change notification settings - Fork 329
Add stop words removers #726
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 50 commits
072b014
3cd4e0e
fb203d7
781a42b
7d8aa8a
a2a0c69
04da539
859ff18
4c303e2
2638ba7
9fab3bf
d4b127a
a6f838d
88b6f9d
224e246
7bb7855
551435b
f625ede
cce01f5
795b9cf
8319e81
15ce559
1408fbe
681aba7
fee620a
4a98ee3
43b9b7d
fa54268
cb0a596
61b67e3
5f6c046
80f400a
8623c1a
ca64a6a
a3b8286
bb4f4e9
524ba73
e752b9b
b47e361
93d4608
66987ba
8c165b5
692598a
1e1f897
ff3948e
fdb1ed9
b3480c6
f84af25
59ae57e
65c31e7
3a51226
e4553a0
36d2048
d882c3c
2f3879f
ba2673b
f0b4f90
7d07cb9
d8ce37c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System.IO; | ||
using Microsoft.Spark.E2ETest.Utils; | ||
using Microsoft.Spark.ML.Feature; | ||
using Microsoft.Spark.Sql; | ||
using Microsoft.Spark.Sql.Types; | ||
using Microsoft.Spark.UnitTest.TestUtils; | ||
using Xunit; | ||
|
||
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature | ||
{ | ||
[Collection("Spark E2E Tests")] | ||
public class StopWordsRemoverTests : FeatureBaseTests<StopWordsRemover> | ||
{ | ||
private readonly SparkSession _spark; | ||
|
||
/// <summary> | ||
/// Passes the shared fixture to the base class and keeps the fixture's Spark session | ||
/// for use by the tests in this class. | ||
/// </summary> | ||
/// <param name="fixture">Fixture that supplies the Spark session</param> | ||
public StopWordsRemoverTests(SparkFixture fixture) : base(fixture) => | ||
    _spark = fixture.Spark; | ||
|
||
/// <summary> | ||
/// Tests the <see cref="StopWordsRemover"/> members that do not involve a locale, | ||
/// because locale is not supported before Spark 2.4.0. Covers the setters and getters, | ||
/// the default stop words, a save/load round trip and the transform calls. | ||
/// </summary> | ||
[Fact] | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
public void TestStopWordsRemoverWithoutLocale() | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
string expectedUid = "theUidWithOutLocale"; | ||
string expectedInputCol = "input_col"; | ||
string expectedOutputCol = "output_col"; | ||
bool expectedCaseSensitive = false; | ||
var expectedStopWords = new string[] { "test1", "test2" }; | ||
|
||
// Input whose input_col holds the words produced by splitting the sentence on spaces. | ||
DataFrame input = _spark.Sql("SELECT split('Hi I heard about Spark', ' ') as input_col"); | ||
|
||
// Each setter returns a StopWordsRemover, so the calls can be chained. | ||
StopWordsRemover stopWordsRemover = new StopWordsRemover(expectedUid) | ||
.SetInputCol(expectedInputCol) | ||
.SetOutputCol(expectedOutputCol) | ||
.SetCaseSensitive(expectedCaseSensitive) | ||
.SetStopWords(expectedStopWords); | ||
|
||
// Every getter is expected to return the value configured above. | ||
Assert.Equal(expectedUid, stopWordsRemover.Uid()); | ||
Assert.Equal(expectedInputCol, stopWordsRemover.GetInputCol()); | ||
Assert.Equal(expectedOutputCol, stopWordsRemover.GetOutputCol()); | ||
Assert.Equal(expectedCaseSensitive, stopWordsRemover.GetCaseSensitive()); | ||
Assert.Equal(expectedStopWords, stopWordsRemover.GetStopWords()); | ||
Assert.NotEmpty(StopWordsRemover.LoadDefaultStopWords("english")); | ||
|
||
// Save to a temporary directory, load it back and check that the UID is preserved. | ||
using (TemporaryDirectory tempDirectory = new TemporaryDirectory()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ping? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I committed this change. I will merge this PR after CI passes.
imback82 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
string savePath = Path.Join(tempDirectory.Path, "StopWordsRemover"); | ||
stopWordsRemover.Save(savePath); | ||
|
||
StopWordsRemover loadedStopWordsRemover = StopWordsRemover.Load(savePath); | ||
Assert.Equal(stopWordsRemover.Uid(), loadedStopWordsRemover.Uid()); | ||
} | ||
|
||
// Only the returned types are asserted here, not the transformed contents. | ||
Assert.IsType<StructType>(stopWordsRemover.TransformSchema(input.Schema())); | ||
Assert.IsType<DataFrame>(stopWordsRemover.Transform(input)); | ||
|
||
TestFeatureBase(stopWordsRemover, "inputCol", "input_col"); | ||
} | ||
|
||
/// <summary> | ||
/// Tests setting and getting the locale of a <see cref="StopWordsRemover"/>. | ||
/// Runs only when the Spark version is 2.4.0 or later; skipped on earlier versions. | ||
/// </summary> | ||
[SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] | ||
public void TestStopWordsRemoverWithLocale() | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
string expectedLocale = "en_GB"; | ||
StopWordsRemover stopWordsRemover = new StopWordsRemover().SetLocale(expectedLocale); | ||
Assert.Equal(expectedLocale, stopWordsRemover.GetLocale()); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System.Collections.Generic; | ||
using Microsoft.Spark.Interop; | ||
using Microsoft.Spark.Interop.Ipc; | ||
using Microsoft.Spark.Sql; | ||
using Microsoft.Spark.Sql.Types; | ||
|
||
namespace Microsoft.Spark.ML.Feature | ||
{ | ||
/// <summary> | ||
/// A <see cref="StopWordsRemover"/> feature transformer that filters out stop words from input. | ||
/// </summary> | ||
public class StopWordsRemover : FeatureBase<StopWordsRemover>, IJvmObjectReferenceProvider | ||
{ | ||
private static readonly string s_stopWordsRemoverClassName = | ||
"org.apache.spark.ml.feature.StopWordsRemover"; | ||
|
||
/// <summary> | ||
/// Creates a <see cref="StopWordsRemover"/> without any parameters. | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// </summary> | ||
public StopWordsRemover() : base(s_stopWordsRemoverClassName) | ||
{ | ||
} | ||
|
||
/// <summary> | ||
/// Creates a <see cref="StopWordsRemover"/> with a UID that is used to give the | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// <see cref="StopWordsRemover"/> a unique ID. | ||
/// </summary> | ||
/// <param name="uid">An immutable unique ID for the object and its derivatives.</param> | ||
public StopWordsRemover(string uid) : base(s_stopWordsRemoverClassName, uid) | ||
{ | ||
} | ||
|
||
/// <summary> | ||
/// Creates a <see cref="StopWordsRemover"/> from an existing JVM object reference, | ||
/// which is handed to the base class constructor. | ||
/// </summary> | ||
/// <param name="jvmObject">Reference to the JVM object to wrap</param> | ||
internal StopWordsRemover(JvmObjectReference jvmObject) : base(jvmObject) | ||
{ | ||
} | ||
|
||
/// <summary> | ||
/// Exposes the JVM object reference held by this instance. | ||
/// </summary> | ||
JvmObjectReference IJvmObjectReferenceProvider.Reference | ||
{ | ||
    get { return _jvmObject; } | ||
} | ||
|
||
/// <summary> | ||
/// Sets the column that the <see cref="StopWordsRemover"/> should read from. | ||
/// </summary> | ||
/// <param name="value">The name of the column to use as the source</param> | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// <returns>New <see cref="StopWordsRemover"/> object</returns> | ||
public StopWordsRemover SetInputCol(string value) => | ||
WrapAsStopWordsRemover(_jvmObject.Invoke("setInputCol", value)); | ||
|
||
/// <summary> | ||
/// The <see cref="StopWordsRemover"/> will create a new column in the DataFrame; this sets | ||
/// the name of that new column. | ||
/// </summary> | ||
/// <param name="value">The name of the column to use as the target</param> | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// <returns>New <see cref="StopWordsRemover"/> object</returns> | ||
public StopWordsRemover SetOutputCol(string value) => | ||
WrapAsStopWordsRemover(_jvmObject.Invoke("setOutputCol", value)); | ||
|
||
/// <summary> | ||
/// Runs this <see cref="StopWordsRemover"/> against a <see cref="DataFrame"/> by invoking | ||
/// <c>transform</c> on the underlying JVM object. | ||
/// </summary> | ||
/// <param name="source">The <see cref="DataFrame"/> to transform</param> | ||
/// <returns> | ||
/// New <see cref="DataFrame"/> object wrapping the result of the transform | ||
/// </returns> | ||
public DataFrame Transform(DataFrame source) | ||
{ | ||
    var transformed = (JvmObjectReference)_jvmObject.Invoke("transform", source); | ||
    return new DataFrame(transformed); | ||
} | ||
|
||
/// <summary> | ||
/// Returns the name of the column this <see cref="StopWordsRemover"/> reads from. | ||
/// </summary> | ||
/// <returns>Input column name</returns> | ||
public string GetInputCol() | ||
{ | ||
    object inputCol = _jvmObject.Invoke("getInputCol"); | ||
    return (string)inputCol; | ||
} | ||
|
||
/// <summary> | ||
/// Returns the name of the new column that this <see cref="StopWordsRemover"/> creates | ||
/// in the DataFrame. | ||
/// </summary> | ||
/// <returns>The output column name</returns> | ||
public string GetOutputCol() | ||
{ | ||
    object outputCol = _jvmObject.Invoke("getOutputCol"); | ||
    return (string)outputCol; | ||
} | ||
|
||
/// <summary> | ||
/// Sets the locale for the <see cref="StopWordsRemover"/> transform. | ||
/// Refer to java.util.Locale.getAvailableLocales() for all available locales. | ||
/// </summary> | ||
/// <param name="value">Locale to be used for transform</param> | ||
/// <returns>New <see cref="StopWordsRemover"/> object</returns> | ||
public StopWordsRemover SetLocale(string value) => | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
WrapAsStopWordsRemover(_jvmObject.Invoke("setLocale", value)); | ||
|
||
/// <summary> | ||
/// Gets the locale used for the <see cref="StopWordsRemover"/> transform. | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// </summary> | ||
/// <returns>The locale</returns> | ||
public string GetLocale() => (string)_jvmObject.Invoke("getLocale"); | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
/// <summary> | ||
/// Sets whether this <see cref="StopWordsRemover"/> is case sensitive. | ||
/// </summary> | ||
/// <param name="value">true if case sensitive, false otherwise</param> | ||
/// <returns>New <see cref="StopWordsRemover"/> object</returns> | ||
public StopWordsRemover SetCaseSensitive(bool value) | ||
{ | ||
    object updated = _jvmObject.Invoke("setCaseSensitive", value); | ||
    return WrapAsStopWordsRemover(updated); | ||
} | ||
|
||
/// <summary> | ||
/// Gets whether this <see cref="StopWordsRemover"/> is case sensitive. | ||
/// </summary> | ||
/// <returns>true if case sensitive, false otherwise</returns> | ||
public bool GetCaseSensitive() | ||
{ | ||
    object caseSensitive = _jvmObject.Invoke("getCaseSensitive"); | ||
    return (bool)caseSensitive; | ||
} | ||
|
||
/// <summary> | ||
/// Sets the custom stop words used by this <see cref="StopWordsRemover"/>. | ||
/// </summary> | ||
/// <param name="values">Custom stop words</param> | ||
/// <returns>New <see cref="StopWordsRemover"/> object</returns> | ||
public StopWordsRemover SetStopWords(IEnumerable<string> values) | ||
{ | ||
    object updated = _jvmObject.Invoke("setStopWords", values); | ||
    return WrapAsStopWordsRemover(updated); | ||
} | ||
|
||
/// <summary> | ||
/// Gets the custom stop words used by this <see cref="StopWordsRemover"/>. | ||
/// </summary> | ||
/// <returns>Custom stop words</returns> | ||
public IEnumerable<string> GetStopWords() | ||
{ | ||
    object stopWords = _jvmObject.Invoke("getStopWords"); | ||
    return (IEnumerable<string>)stopWords; | ||
} | ||
|
||
/// <summary> | ||
/// Checks transform validity and derives the output schema from the input schema. | ||
/// | ||
/// The input schema is passed to the JVM side through its JSON form, <c>transformSchema</c> | ||
/// is invoked on the underlying JVM object, and the result is wrapped in a | ||
/// <see cref="StructType"/>. | ||
/// | ||
/// This checks for validity of interactions between parameters during Transform and | ||
/// raises an exception if any parameter value is invalid. | ||
/// </summary> | ||
/// <param name="value"> | ||
/// The <see cref="StructType"/> of the <see cref="DataFrame"/> which will be transformed. | ||
/// </param> | ||
/// <returns> | ||
/// The <see cref="StructType"/> of the output schema that would have been derived from the | ||
/// input schema, if Transform had been called. | ||
/// </returns> | ||
public StructType TransformSchema(StructType value) | ||
{ | ||
    var jvmSchema = DataType.FromJson(_jvmObject.Jvm, value.Json); | ||
    var outputSchema = (JvmObjectReference)_jvmObject.Invoke("transformSchema", jvmSchema); | ||
    return new StructType(outputSchema); | ||
} | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
/// <summary> | ||
/// Loads the default stop words for the given language, for use with the | ||
/// <see cref="StopWordsRemover"/> transform. | ||
/// Supported languages: danish, dutch, english, finnish, french, german, | ||
/// hungarian, italian, norwegian, portuguese, russian, spanish, swedish, turkish. | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// </summary> | ||
SARAVANA1501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// <param name="language">Name of the language whose default stop words are loaded</param> | ||
/// <returns>Default stop words for the given language</returns> | ||
public static string[] LoadDefaultStopWords(string language) => | ||
(string[])SparkEnvironment.JvmBridge.CallStaticJavaMethod( | ||
s_stopWordsRemoverClassName, "loadDefaultStopWords", language); | ||
|
||
/// <summary> | ||
/// Loads a <see cref="StopWordsRemover"/> that was previously saved using Save. | ||
/// </summary> | ||
/// <param name="path">The path the previous <see cref="StopWordsRemover"/> was saved to</param> | ||
/// <returns>New <see cref="StopWordsRemover"/> object, loaded from path</returns> | ||
public static StopWordsRemover Load(string path) | ||
{ | ||
    object loaded = SparkEnvironment.JvmBridge.CallStaticJavaMethod( | ||
        s_stopWordsRemoverClassName, "load", path); | ||
    return WrapAsStopWordsRemover(loaded); | ||
} | ||
|
||
/// <summary> | ||
/// Casts the given object to a <see cref="JvmObjectReference"/> and wraps it in a new | ||
/// <see cref="StopWordsRemover"/>. | ||
/// </summary> | ||
/// <param name="obj">Object returned from a JVM call</param> | ||
/// <returns>New <see cref="StopWordsRemover"/> object</returns> | ||
private static StopWordsRemover WrapAsStopWordsRemover(object obj) | ||
{ | ||
    var reference = (JvmObjectReference)obj; | ||
    return new StopWordsRemover(reference); | ||
} | ||
} | ||
} |
Uh oh!
There was an error while loading. Please reload this page.