diff --git a/Project.toml b/Project.toml index 3dac73d25..2f02f99f2 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +FeatureSelection = "33837fe5-dbff-4c9e-8c2f-c5612fe2b8b6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MLJBalancing = "45f359ea-796d-4f51-95a5-deb1a414c586" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" @@ -31,12 +32,13 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" CategoricalArrays = "0.8,0.9, 0.10" ComputationalResources = "0.3" Distributions = "0.21,0.22,0.23, 0.24, 0.25" +FeatureSelection = "0.1.1" MLJBalancing = "0.1" MLJBase = "1" MLJEnsembles = "0.4" MLJFlow = "0.5" MLJIteration = "0.6" -MLJModels = "0.16" +MLJModels = "0.17" MLJTestIntegration = "0.5.0" MLJTuning = "0.8" OpenML = "0.2,0.3" @@ -89,4 +91,41 @@ SymbolicRegression = "8254be44-1295-4e6a-a16d-46603ac705cb" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["BetaML", "CatBoost", "EvoLinear", "EvoTrees", "Imbalance", "InteractiveUtils", "LightGBM", "MLJClusteringInterface", "MLJDecisionTreeInterface", "MLJFlux", "MLJGLMInterface", "MLJLIBSVMInterface", "MLJLinearModels", "MLJMultivariateStatsInterface", "MLJNaiveBayesInterface", "MLJScikitLearnInterface", "MLJTSVDInterface", "MLJTestInterface", "MLJTestIntegration", "MLJText", "MLJXGBoostInterface", "Markdown", "NearestNeighborModels", "OneRule", "OutlierDetectionNeighbors", "OutlierDetectionPython", "ParallelKMeans", "PartialLeastSquaresRegressor", "PartitionedLS", "SelfOrganizingMaps", "SIRUS", "SymbolicRegression", "StableRNGs", "Suppressor", "Test"] +test = [ + "BetaML", + "CatBoost", + "EvoLinear", + "EvoTrees", + "Imbalance", + "InteractiveUtils", + "LightGBM", + "MLJClusteringInterface", + "MLJDecisionTreeInterface", + "MLJFlux", + "MLJGLMInterface", + 
"MLJLIBSVMInterface", + "MLJLinearModels", + "MLJMultivariateStatsInterface", + "MLJNaiveBayesInterface", + "MLJScikitLearnInterface", + "MLJTSVDInterface", + "MLJTestInterface", + "MLJTestIntegration", + "MLJText", + "MLJXGBoostInterface", + "Markdown", + "NearestNeighborModels", + "OneRule", + "OutlierDetectionNeighbors", + "OutlierDetectionPython", + "ParallelKMeans", + "PartialLeastSquaresRegressor", + "PartitionedLS", + "SelfOrganizingMaps", + "SIRUS", + "SymbolicRegression", + "StableRNGs", + "Suppressor", + "Test", +] + diff --git a/docs/ModelDescriptors.toml b/docs/ModelDescriptors.toml index d134e5bef..c98c0cbd5 100644 --- a/docs/ModelDescriptors.toml +++ b/docs/ModelDescriptors.toml @@ -11,7 +11,9 @@ AutoEncoder_BetaML = ["dimension_reduction"] BM25Transformer_MLJText = ["encoders", "text_analysis"] BaggingClassifier_MLJScikitLearnInterface = ["classification", "ensemble_models"] BaggingRegressor_MLJScikitLearnInterface = ["regression", "ensemble_models"] -BalancedBaggingClassifier_MLJBalancing = ["class_imbalance", "classification"] +BalancedBaggingClassifier_MLJBalancing = ["class_imbalance", "classification", "meta_algorithms"] +BinaryThresholdPredictor_MLJModels = ["meta_algorithms", "classification"] +BalancedModel_MLJBalancing = ["class_imbalance", "meta_algorithms"] BayesianLDA_MultivariateStats = ["dimension_reduction", "classification", "Bayesian_models"] BayesianLDA_MLJScikitLearnInterface = ["dimension_reduction", "classification", "Bayesian_models"] BayesianQDA_MLJScikitLearnInterface = ["dimension_reduction", "classification", "Bayesian_models"] @@ -52,6 +54,7 @@ ElasticNetCVRegressor_MLJScikitLearnInterface = ["regression"] ElasticNetRegressor_MLJLinearModels = ["regression"] ElasticNetRegressor_MLJScikitLearnInterface = ["regression"] ENNUndersampler_Imbalance = ["class_imbalance"] +EnsembleModel_MLJEnsembles = ["ensemble_models", "meta_algorithms"] EpsilonSVR_LIBSVM = ["regression"] EvoLinearRegressor_EvoLinear = ["regression"] 
EvoTreeClassifier_EvoTrees = ["classification", "ensemble_models", "iterative_models"] @@ -89,6 +92,7 @@ IForestDetector_OutlierDetectionPython = ["outlier_detection"] ImageClassifier_MLJFlux = ["classification", "image_processing", "iterative_models"] INNEDetector_OutlierDetectionPython = ["outlier_detection"] InteractionTransformer_MLJModels = ["static_models"] +IteratedModel_MLJIteration = ["iterative_models", "meta_algorithms"] KDEDetector_OutlierDetectionPython = ["outlier_detection"] KMeansClusterer_BetaML = ["clustering"] KMeans_Clustering = ["clustering", "dimension_reduction", ] @@ -104,7 +108,7 @@ KNeighborsClassifier_MLJScikitLearnInterface = ["classification"] KNeighborsRegressor_MLJScikitLearnInterface = ["regression"] KPLSRegressor_PartialLeastSquaresRegressor = ["regression"] KernelPCA_MultivariateStats = ["dimension_reduction", ] -KernelPerceptronClassifier_BetaML = ["classification"] +KernelPerceptronClassifier_BetaML = ["classification", "neural_networks"] LADRegressor_MLJLinearModels = ["regression"] LDA_MultivariateStats = ["classification", "dimension_reduction", ] LGBMClassifier_LightGBM = ["classification", "ensemble_models", "iterative_models"] @@ -146,14 +150,14 @@ MultitargetGaussianMixtureRegressor_BetaML = ["regression", "distribution_fitter MultitargetKNNClassifier_NearestNeighborModels = ["classification"] MultitargetKNNRegressor_NearestNeighborModels = ["regression"] MultitargetLinearRegressor_MultivariateStats = ["regression"] -MultitargetNeuralNetworkRegressor_BetaML = ["regression"] -MultitargetNeuralNetworkRegressor_MLJFlux = ["regression", "iterative_models"] +MultitargetNeuralNetworkRegressor_BetaML = ["regression", "neural_networks"] +MultitargetNeuralNetworkRegressor_MLJFlux = ["regression", "iterative_models", "neural_networks"] MultitargetRidgeRegressor_MultivariateStats = ["regression"] MultitargetSRRegressor_SymbolicRegression = ["regression"] -NeuralNetworkClassifier_BetaML = ["classification"]
-NeuralNetworkClassifier_MLJFlux = ["classification", "iterative_models"] -NeuralNetworkRegressor_BetaML = ["regression"] -NeuralNetworkRegressor_MLJFlux = ["regression", "iterative_models"] +NeuralNetworkClassifier_BetaML = ["classification", "neural_networks"] +NeuralNetworkClassifier_MLJFlux = ["classification", "iterative_models", "neural_networks"] +NeuralNetworkRegressor_BetaML = ["regression", "neural_networks"] +NeuralNetworkRegressor_MLJFlux = ["regression", "iterative_models", "neural_networks"] NuSVC_LIBSVM = ["classification"] NuSVR_LIBSVM = ["regression"] OCSVMDetector_OutlierDetectionPython = ["outlier_detection"] @@ -171,8 +175,9 @@ PartLS_PartitionedLS = ["regression"] PassiveAggressiveClassifier_MLJScikitLearnInterface = ["classification"] PassiveAggressiveRegressor_MLJScikitLearnInterface = ["regression"] PegasosClassifier_BetaML = ["classification"] -PerceptronClassifier_BetaML = ["classification", "iterative_models"] -PerceptronClassifier_MLJScikitLearnInterface = ["classification", "iterative_models"] +PerceptronClassifier_BetaML = ["classification", "iterative_models", "neural_networks"] +PerceptronClassifier_MLJScikitLearnInterface = ["classification", "iterative_models", "neural_networks"] +Pipeline_MLJBase = ["meta_algorithms"] ProbabilisticNuSVC_LIBSVM = ["classification"] ProbabilisticSGDClassifier_MLJScikitLearnInterface = ["classification"] ProbabilisticSVC_LIBSVM = ["classification"] @@ -190,6 +195,7 @@ RandomForestImputer_BetaML = ["missing_value_imputation", "ensemble_models", "it RandomForestRegressor_BetaML = ["regression", "ensemble_models", "iterative_models"] RandomForestRegressor_DecisionTree = ["regression", "ensemble_models", "iterative_models"] RandomForestRegressor_MLJScikitLearnInterface = ["regression", "ensemble_models", "iterative_models"] +Resampler_MLJBase = ["meta_algorithms"] RidgeCVClassifier_MLJScikitLearnInterface = ["classification"] RidgeCVRegressor_MLJScikitLearnInterface = ["classification"]
RidgeClassifier_MLJScikitLearnInterface = ["classification"] @@ -210,6 +216,7 @@ StableForestClassifier_SIRUS = ["classification"] StableForestRegressor_SIRUS = ["regression"] StableRulesClassifier_SIRUS = ["classification"] StableRulesRegressor_SIRUS = ["regression"] +Stack_MLJBase = ["meta_algorithms", "ensemble_models"] SVC_LIBSVM = ["classification"] SVMClassifier_MLJScikitLearnInterface = ["classification"] SVMLinearClassifier_MLJScikitLearnInterface = ["classification"] @@ -222,9 +229,11 @@ SpectralClustering_MLJScikitLearnInterface = ["clustering", "static_models"] Standardizer_MLJModels = ["encoders"] SubspaceLDA_MultivariateStats = ["classification", "dimension_reduction"] TomekUndersampler_Imbalance = ["class_imbalance"] +TunedModel_MLJTuning = ["meta_algorithms"] TSVDTransformer_TSVD = ["dimension_reduction"] TfidfTransformer_MLJText = ["encoders", "text_analysis"] TheilSenRegressor_MLJScikitLearnInterface = ["regression"] +TransformedTargetModel_MLJBase = ["meta_algorithms", "outlier_detection"] UnivariateBoxCoxTransformer_MLJModels = ["encoders"] UnivariateDiscretizer_MLJModels = ["encoders"] UnivariateFillImputer_MLJModels = ["missing_value_imputation"] diff --git a/docs/make.jl b/docs/make.jl index c4d7e276f..20464d806 100755 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,6 +15,7 @@ import MLJ.MLJModels import MLJ.MLJEnsembles import MLJ.ScientificTypes import MLJ.MLJBalancing +import MLJ.FeatureSelection import ScientificTypesBase import Distributions using CategoricalArrays @@ -37,7 +38,7 @@ isempty(problems) || error( # compose the individual model docstring pages: @info "Getting individual model docstrings from the registry and generating "* "pages for them, written at /docs/src/models/ ." 
-for model in models() +for model in models(wrappers=true) write_page(model) end @@ -118,6 +119,7 @@ makedocs( IterationControl, CategoricalDistributions, StatisticalMeasures, + FeatureSelection, ], pages = pages, warnonly = [:cross_references, :missing_docs], diff --git a/docs/model_docstring_tools.jl b/docs/model_docstring_tools.jl index 175d34370..743afcf84 100644 --- a/docs/model_docstring_tools.jl +++ b/docs/model_docstring_tools.jl @@ -5,7 +5,7 @@ const PATH_TO_MODEL_DOCS = joinpath(@__DIR__, "src", "models") """ remove_doc_refs(str::AbstractString) -Removes `@ref` references from `str. For example, a substring of the form +Removes `@ref` references from `str`. For example, a substring of the form "[`some.thing_like_this123!`](@ref)" is replaced with "`some.thing_like_this123!`". """ @@ -27,8 +27,8 @@ handle(model) = model.name*"_"*model.package_name **Private method.** Compose and write to file the documentation page for `model`. Here `model` is an entry in -the MLJ Model Registry, i.e., an element of `MLJModels.models()`. The file name has the -form `"ModelName_PackageName.md"`, for example, +the MLJ Model Registry, i.e., an element of `MLJModels.models(; wrappers=true)`. The file +name has the form `"ModelName_PackageName.md"`, for example, `"DecisionTreeClassifier_DecisionTree.md"`. Such a page can be referenced from any other markdown page in /docs/src/ like this: `[DecisionTreeClassifier](@ref DecisionTreeClassifier_DecisionTree)`. @@ -56,6 +56,7 @@ const DESCRIPTORS_GIVEN_HANDLE = # determined the list of all descriptors, ranked by frequency: const descriptors = vcat(values(DESCRIPTORS_GIVEN_HANDLE)...) const ranking = MLJBase.countmap(descriptors) +ranking["meta_algorithms"] = 1e10 # pin "meta_algorithms" to the front of DESCRIPTORS const DESCRIPTORS = sort(unique(descriptors), by=d -> ranking[d], rev=true) const HANDLES = keys(DESCRIPTORS_GIVEN_HANDLE) @@ -67,7 +68,7 @@ handle as key in /docs/src/ModelDescriptors.toml.
""" function models_missing_descriptors() - handles = handle.(models()) + handles = handle.(models(wrappers=true)) filter(handles) do h !(h in HANDLES) end @@ -82,7 +83,7 @@ Return the list of models with a given `descriptor`, such as "regressor", as these appear in /src/docs/ModelDescriptors.toml. """ -modelswith(descriptor) = filter(models()) do model +modelswith(descriptor) = filter(models(wrappers=true)) do model descriptor in DESCRIPTORS_GIVEN_HANDLE[handle(model)] end diff --git a/docs/src/list_of_supported_models.md b/docs/src/list_of_supported_models.md index 7fa4ea1d7..e0dde1ba7 100644 --- a/docs/src/list_of_supported_models.md +++ b/docs/src/list_of_supported_models.md @@ -40,10 +40,11 @@ independent assessment. [Imbalance.jl](https://github.com/JuliaAI/Imbalance.jl) | - | RandomOversampler, RandomWalkOversampler, ROSE, SMOTE, BorderlineSMOTE1, SMOTEN, SMOTENC, RandomUndersampler, ClusterUndersampler, ENNUndersampler, TomekUndersampler, | low | [LIBSVM.jl](https://github.com/mpastell/LIBSVM.jl) | [MLJLIBSVMInterface.jl](https://github.com/JuliaAI/MLJLIBSVMInterface.jl) | LinearSVC, SVC, NuSVC, NuSVR, EpsilonSVR, OneClassSVM | high | also via ScikitLearn.jl [LightGBM.jl](https://github.com/IQVIA-ML/LightGBM.jl) | - | LGBMClassifier, LGBMRegressor | high | +[FeatureSelection.jl](https://github.com/JuliaAI/FeatureSelection.jl) | - | FeatureSelector, RecursiveFeatureElimination | low | [Flux.jl](https://github.com/FluxML/Flux.jl) | [MLJFlux.jl](https://github.com/FluxML/MLJFlux.jl) | NeuralNetworkRegressor, NeuralNetworkClassifier, MultitargetNeuralNetworkRegressor, ImageClassifier | low | [MLJBalancing.jl](https://github.com/JuliaAI/MLJBalancing.jl) | - | BalancedBaggingClassifier | low | [MLJLinearModels.jl](https://github.com/JuliaAI/MLJLinearModels.jl) | - | LinearRegressor, RidgeRegressor, LassoRegressor, ElasticNetRegressor, QuantileRegressor, HuberRegressor, RobustRegressor, LADRegressor, LogisticClassifier, MultinomialClassifier | medium |
-[MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl) (built-in) | - | ConstantClassifier, ConstantRegressor, ContinuousEncoder, DeterministicConstantClassifier, DeterministicConstantRegressor, FeatureSelector, FillImputer, InteractionTransformer, OneHotEncoder, Standardizer, UnivariateBoxCoxTransformer, UnivariateDiscretizer, UnivariateFillImputer, UnivariateTimeTypeToContinuous, Standardizer, BinaryThreshholdPredictor | medium | +[MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl) (built-in) | - | ConstantClassifier, ConstantRegressor, ContinuousEncoder, DeterministicConstantClassifier, DeterministicConstantRegressor, FillImputer, InteractionTransformer, OneHotEncoder, Standardizer, UnivariateBoxCoxTransformer, UnivariateDiscretizer, UnivariateFillImputer, UnivariateTimeTypeToContinuous, Standardizer, BinaryThresholdPredictor | medium | [MLJText.jl](https://github.com/JuliaAI/MLJText.jl) | - | TfidfTransformer, BM25Transformer, CountTransformer | low | [MultivariateStats.jl](https://github.com/JuliaStats/MultivariateStats.jl) | [MLJMultivariateStatsInterface.jl](https://github.com/JuliaAI/MLJMultivariateStatsInterface.jl) | LinearRegressor, MultitargetLinearRegressor, RidgeRegressor, MultitargetRidgeRegressor, PCA, KernelPCA, ICA, LDA, BayesianLDA, SubspaceLDA, BayesianSubspaceLDA, FactorAnalysis, PPCA | high | [NaiveBayes.jl](https://github.com/dfdx/NaiveBayes.jl) | [MLJNaiveBayesInterface.jl](https://github.com/JuliaAI/MLJNaiveBayesInterface.jl) | GaussianNBClassifier, MultinomialNBClassifier, HybridNBClassifier | low | diff --git a/docs/src/transformers.md b/docs/src/transformers.md index 59373b9eb..8428db1df 100644 --- a/docs/src/transformers.md +++ b/docs/src/transformers.md @@ -30,7 +30,7 @@ MLJModels.OneHotEncoder MLJModels.ContinuousEncoder MLJModels.FillImputer MLJModels.UnivariateFillImputer -MLJModels.FeatureSelector +FeatureSelection.FeatureSelector MLJModels.UnivariateBoxCoxTransformer MLJModels.UnivariateDiscretizer
MLJModels.UnivariateTimeTypeToContinuous diff --git a/src/MLJ.jl b/src/MLJ.jl index e22a0c32d..ec5cbdf85 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -10,7 +10,7 @@ explicitly loaded from the model-providing package, using `@load`, for example. some common transformers, listed using `localmodels()` at startup, are immediately available, as are the following model wrappers: `Pipeline`, `TunedModel`, `EnsembleModel`, `IteratedModel`, `BalancedModel`, `TransformedTargetModel`, `BinaryThresholdPredictor`, -and `Stack`. +`Stack`, and `RecursiveFeatureElimination`. # Components @@ -35,6 +35,9 @@ and `Stack`. - MLJBalancing.jl: Incorporation of oversampling/undersampling methods in pipelines, via the `BalancedModel` wrapper +- FeatureSelection.jl: Transformers for feature selection, and the supervised model wrapper + `RecursiveFeatureElimination`. + - MLJFlow.jl: Integration with MLflow workflow tracking - OpenML.jl: Tool for grabbing datasets from OpenML.org @@ -60,6 +63,7 @@ import MLJBase.save using MLJEnsembles using MLJTuning using MLJModels +@reexport using FeatureSelection using OpenML @reexport using MLJFlow @reexport using StatisticalMeasures @@ -165,11 +169,11 @@ export Grid, RandomSearch, Explicit, TunedModel, LatinHypercube, # re-export from MLJModels: export models, localmodels, @load, @iload, load, info, doc, ConstantRegressor, ConstantClassifier, # builtins/Constant.jl - FeatureSelector, UnivariateStandardizer, # builtins/Transformers.jl + UnivariateStandardizer, Standardizer, UnivariateBoxCoxTransformer, OneHotEncoder, ContinuousEncoder, UnivariateDiscretizer, FillImputer, matching, BinaryThresholdPredictor, - UnivariateTimeTypeToContinuous, InteractionTransformer + UnivariateTimeTypeToContinuous, InteractionTransformer # builtins/Transformers.jl # re-export from MLJIteration: export MLJIteration diff --git a/test/integration.jl b/test/integration.jl index 4dbbc2124..f5b33629f 100644 --- a/test/integration.jl +++ b/test/integration.jl @@ -53,6 +53,9 @@
FILTER_GIVEN_ISSUE = Dict( "MultiTaskLassoCVRegressor", "MultiTaskLassoRegressor", - ] + ], + "https://github.com/JuliaAI/FeatureSelection.jl/issues/15" => + model -> model.package_name == "FeatureSelection" && + model.name == "RecursiveFeatureElimination" )