From 9bc3daeba00e1c736db8d02d86317b1766264082 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 21 Sep 2023 15:49:49 +1200 Subject: [PATCH 01/16] adapt /src to migration of measures MLJBase->StatisticalMeasures --- Project.toml | 2 ++ src/MLJ.jl | 11 +---------- test/exported_names.jl | 6 ++++++ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Project.toml b/Project.toml index 5a6b42914..60b6d8baf 100644 --- a/Project.toml +++ b/Project.toml @@ -21,6 +21,7 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" +StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" @@ -40,6 +41,7 @@ ProgressMeter = "1.1" Reexport = "1.2" ScientificTypes = "3" StatsBase = "0.32,0.33, 0.34" +StatisticalMeasures = "0.1" Tables = "0.2,1.0" julia = "1.6" diff --git a/src/MLJ.jl b/src/MLJ.jl index 161c975eb..6c6bc002a 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -18,6 +18,7 @@ using MLJTuning using MLJModels using OpenML @reexport using MLJFlow +@reexport using StatisticalMeasures using MLJIteration import MLJIteration.IterationControl @@ -108,16 +109,6 @@ for T in MLJBase.EXTENDED_ABSTRACT_MODEL_TYPES @eval(export $T) end -# MLJBase/measures: -# measure names: -for m in MLJBase.MEASURE_TYPES_ALIASES_AND_INSTANCES - :(export $m) |> eval -end -export measures, - aggregate, default_measure, skipinvalid, - roc_curve, roc, - no_avg, macro_avg, micro_avg - # re-export from MLJEnsembles: export EnsembleModel diff --git a/test/exported_names.jl b/test/exported_names.jl index fe95fc3e7..e49709972 100644 --- a/test/exported_names.jl +++ b/test/exported_names.jl @@ -27,4 +27,10 @@ Save() MLFlowLogger +# StatisticalMeasures + +rms +l2 +log_score + true From 98e54a0f3c07b0d062f5d4561829551b7546f5a4 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 21 Sep 2023 16:49:51 +1200 Subject: [PATCH 02/16] update the docs to reflect measure migration --- docs/src/evaluating_model_performance.md | 67 ++++---- docs/src/performance_measures.md | 192 +++++------------------ 2 files changed, 73 insertions(+), 186 deletions(-) diff --git a/docs/src/evaluating_model_performance.md b/docs/src/evaluating_model_performance.md index 63476d14f..af6e86864 100644 --- a/docs/src/evaluating_model_performance.md +++ b/docs/src/evaluating_model_performance.md @@ -37,7 +37,7 @@ data: ```@repl evaluation_of_supervised_models mach = machine(model, X, y) -evaluate!(mach, resampling=cv, measure=l2, verbosity=0) +evaluate!(mach, resampling=cv, measure=l2), verbosity=0) ``` (The latter call is a mutating call as the learned parameters stored in the @@ -45,31 +45,41 @@ machine potentially change. ) ## Multiple measures +Multiple measures are specified as a vector: + ```@repl evaluation_of_supervised_models -evaluate!(mach, - resampling=cv, - measure=[l1, rms, rmslp1], verbosity=0) +evaluate!( + mach, + resampling=cv, + measures=[l1, rms, rmslp1], + verbosity=0, +a) ``` -## Custom measures and weighted measures - -```@repl evaluation_of_supervised_models -my_loss(yhat, y) = maximum((yhat - y).^2); +[Custom measures](@ref) can also be provided. 
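The object returned by `evaluate!` can also be interrogated directly. Here is a minimal
sketch, assuming `mach` and `cv` are as defined above (the field names are those
documented under `MLJBase.PerformanceEvaluation`):

```julia
e = evaluate!(mach, resampling=cv, measure=[l1, rms], verbosity=0)
e.measure      # the measures that were applied
e.measurement  # one aggregated measurement per measure
e.per_fold     # per-fold measurements, one vector per measure
```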
-my_per_observation_loss(yhat, y) = abs.(yhat - y); -MLJ.reports_each_observation(::typeof(my_per_observation_loss)) = true; +## Specifying weights -my_weighted_score(yhat, y) = 1/mean(abs.(yhat - y)); -my_weighted_score(yhat, y, w) = 1/mean(abs.((yhat - y).^w)); -MLJ.supports_weights(::typeof(my_weighted_score)) = true; -MLJ.orientation(::typeof(my_weighted_score)) = :score; +Per-observation weights can be passed to measures. If a measure does not support weights, +the weights are ignored: +```@repl evaluation_of_supervised_models holdout = Holdout(fraction_train=0.8) weights = [1, 1, 2, 1, 1, 2, 3, 1, 1, 2, 3, 1]; -evaluate!(mach, - resampling=CV(nfolds=3), - measure=[my_loss, my_per_observation_loss, my_weighted_score, l1], - weights=weights, verbosity=0) +evaluate!( + mach, + resampling=CV(nfolds=3), + measure=[l2, rsquared], + weights=weights, +) +``` + +In classification problems, a class weight dictionary can be supplied instead. + +```@docs +MLJBase.evaluate! +MLJBase.evaluate +MLJBase.PerformanceEvaluation ``` ## User-specified train/test sets @@ -78,18 +88,20 @@ Users can either provide an explicit list of train/test pairs of row indices for ```@repl evaluation_of_supervised_models fold1 = 1:6; fold2 = 7:12; -evaluate!(mach, - resampling = [(fold1, fold2), (fold2, fold1)], - measure=[l1, l2], verbosity=0) +evaluate!( + mach, + resampling = [(fold1, fold2), (fold2, fold1)], + measures=[l1, l2], + verbosity=0, +) ``` -Or define their own re-usable `ResamplingStrategy` objects, - see -[Custom resampling strategies](@ref) below. +Or the user can define their own re-usable `ResamplingStrategy` objects, - see [Custom +resampling strategies](@ref) below. ## Built-in resampling strategies - ```@docs MLJBase.Holdout ``` @@ -159,10 +171,3 @@ function train_test_pairs(holdout::Holdout, rows) end ``` -## API - -```@docs -MLJBase.evaluate! -MLJBase.evaluate -MLJBase.PerformanceEvaluation -``` diff --git a/docs/src/performance_measures.md b/docs/src/performance_measures.md index 2d47c25e8..ab4b6f516 100644 --- a/docs/src/performance_measures.md +++ b/docs/src/performance_measures.md @@ -1,174 +1,56 @@ # Performance Measures -In MLJ loss functions, scoring rules, sensitivities, and so on, are -collectively referred to as *measures*. These include re-exported loss -functions from the -[LossFunctions.jl](https://github.com/JuliaML/LossFunctions.jl) -library, overloaded to behave the same way as the built-in measures. +**Quick link:** [List of aliases of all +measures](https://juliaai.github.io/StatisticalMeasures.jl/dev/auto_generated_list_of_measures/#aliases) -To see the list of all measures, run `measures()`. Further measures for -probabilistic predictors, such as proper scoring rules, and for -constructing multi-target product measures, are planned. If you'd like -to see a measure added to MLJ, post a comment -[here](https://github.com/JuliaAI/MLJBase.jl/issues/299).g +In MLJ loss functions, scoring rules, confusion matrices, sensitivities, etc, are +collectively referred to as *measures*. These measures are provided by the package +[StatisticalMeasures.jl](https://juliaai.github.io/StatisticalMeasures.jl/dev/). As this +package is a dependency of MLJ, and all its methods are re-exported, the measures are +immediately available to the MLJ user. 
Commonly measures are passed to MLJ meta-algorithms +(see [Uses of measures](@ref) below) but to learn how to call measures directly, see +[this](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/) +StatisticalMeasures.jl tutorial. -*Note for developers:* The measures interface and the built-in -measures described here are defined in MLJBase, but will ultimately live -in a separate package. +A list of all measures ready to use after running `using MLJ` or `using +StatisticalMeasures`, is +[here](https://juliaai.github.io/StatisticalMeasures.jl/dev/auto_generated_list_of_measures/). Alternatively, +call [`measures()`](@ref) (experimental) to generate a dictionary keyed on available +measure constructors, with measure metadata as values. -## Using built-in measures +## Custom measures -These measures all have the common calling syntax +Any measure-like object with appropriate [calling +behavior](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions) +can be used with MLJ. To quickly build custom measures, we recommend using the package +[StatisticalMeasuresBase.jl](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/), +which provides [this](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/tutorial/) +tutorial. Note, in particular, that an "atomic" measure can be transformed into a +multi-target measure using this package. -```julia -measure(ŷ, y) -``` - -or - -```julia -measure(ŷ, y, w) -``` - -where `y` iterates over observations of some target variable, and `ŷ` -iterates over predictions (`Distribution` or `Sampler` objects in the -probabilistic case). Here `w` is an optional vector of sample weights, -or a dictionary of class weights, when these are supported by the -measure. - -```@repl losses_and_scores -using MLJ -y = [1, 2, 3, 4]; -ŷ = [2, 3, 3, 3]; -w = [1, 2, 2, 1]; -rms(ŷ, y) # reports an aggregate loss -l2(ŷ, y, w) # reports per observation losses -y = coerce(["male", "female", "female"], Multiclass) -d = UnivariateFinite(["male", "female"], [0.55, 0.45], pool=y); -ŷ = [d, d, d]; -log_loss(ŷ, y) -``` - -The measures `rms`, `l2` and `log_loss` illustrated here are actually - instances of measure *types*. For, example, `l2 = LPLoss(p=2)` and -`log_loss = LogLoss() = LogLoss(tol=eps())`. Common aliases are -provided: - -```@repl losses_and_scores -cross_entropy -``` - -## Traits and custom measures +## Uses of measures -Notice that `l1` reports per-sample evaluations, while `rms` -only reports an aggregated result. This and other behavior can be -gleaned from measure *traits* which are summarized by the `info` -method: - -```@repl losses_and_scores -info(l1) -``` +In MLJ, measures are specified: -Query the doc-string for a measure using the name of its type: +- when evaluating model performance using +[`evaluate!`](@ref)/[`evaluate`](@ref) - see [Evaluating Model Performance](@ref) -```@repl losses_and_scores -rms -@doc RootMeanSquaredError # same as `?RootMeanSqauredError -``` +- when wrapping models using [`TunedModel`](@ref) - see [Tuning Models](@ref) +- when wrapping iterative models using [`IteratedModel`](@ref) - see [Controlling Iterative Models](@ref) +- when generating learning curves using [`learning_curve`](@ref) - see [Learning Curves](@ref) -Use `measures()` to list all measures, and `measures(conditions...)` to -search for measures with given traits (as you would [query -models](model_search.md)). 
The trait `instances` list the actual -callable instances of a given measure type (typically aliases for the -default instance). +and elsewhere. -```@docs -measures(conditions...) -``` +## Using LossFunctions.jl -A user-defined measure in MLJ can be passed to the `evaluate!` -method, and elsewhere in MLJ, provided it is a function or callable -object conforming to the above syntactic conventions. By default, a -custom measure is understood to: - -- be a loss function (rather than a score) - -- report an aggregated value (rather than per-sample evaluations) - -- be feature-independent - -To override this behavior one simply overloads the appropriate trait, -as shown in the following examples: - -```@repl losses_and_scores -y = [1, 2, 3, 4]; -ŷ = [2, 3, 3, 3]; -w = [1, 2, 2, 1]; -my_loss(ŷ, y) = maximum((ŷ - y).^2); -my_loss(ŷ, y) -my_per_sample_loss(ŷ, y) = abs.(ŷ - y); -MLJ.reports_each_observation(::typeof(my_per_sample_loss)) = true; -my_per_sample_loss(ŷ, y) -my_weighted_score(ŷ, y) = 1/mean(abs.(ŷ - y)); -my_weighted_score(ŷ, y, w) = 1/mean(abs.((ŷ - y).^w)); -MLJ.supports_weights(::typeof(my_weighted_score)) = true; -MLJ.orientation(::typeof(my_weighted_score)) = :score; -my_weighted_score(ŷ, y) -X = (x=rand(4), penalty=[1, 2, 3, 4]); -my_feature_dependent_loss(ŷ, X, y) = sum(abs.(ŷ - y) .* X.penalty)/sum(X.penalty); -MLJ.is_feature_dependent(::typeof(my_feature_dependent_loss)) = true -my_feature_dependent_loss(ŷ, X, y) -``` +In previous versions of MLJ, measures from LossFunctions.jl were also available. Now +measures from that package must be explicitly imported and wrapped, as described +[here](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/#Using-losses-from-LossFunctions.jl). -The possible signatures for custom measures are: `measure(ŷ, y)`, -`measure(ŷ, y, w)`, `measure(ŷ, X, y)` and `measure(ŷ, X, y, w)`, each -measure implementing one non-weighted version, and possibly a second -weighted version. +## Receiver operator characteristics -## Using measures from LossFunctions.jl - -The [LossFunctions.jl](https://github.com/JuliaML/LossFunctions.jl) -package includes "distance loss" functions for `Continuous` targets, -and "marginal loss" functions for `Finite{2}` (binary) targets. While the -LossFunctions.jl interface differs from the present one (for, example -binary observations must be +1 or -1), MLJ has overloaded instances -of the LossFunctions.jl types to behave the same as the built-in -types. - -Note that the "distance losses" in the package apply to deterministic -predictions, while the "marginal losses" apply to probabilistic -predictions. - - -## List of measures - -All measures listed below have a doc-string associated with the measure's -*type*. So, for example, do `?LPLoss` not `?l2`. - -```@setup losses_and_scores -using DataFrames -``` - -```@example losses_and_scores -ms = measures() -types = map(ms) do m - m.name -end -instance = map(ms) do m m.instances end -table = (type=types, instances=instance) -DataFrame(table) -``` - - -## Other performance-related tools - -In MLJ one computes a confusion matrix by calling an instance of the -`ConfusionMatrix` measure type on the data: - -```@docs -ConfusionMatrix -``` +A related performance evaluation tool provided by StatisticalMeasures.jl, and hence by MLJ, is the `roc_curve` method: ```@docs roc_curve From 8b780ad01bba1554c2b8f37c6be61f1f1e7f7f43 Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Thu, 21 Sep 2023 17:47:27 +1200 Subject: [PATCH 03/16] update ORGANIZATION.md --- ORGANIZATION.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ORGANIZATION.md b/ORGANIZATION.md index 2a7c690a1..fd6ec873e 100644 --- a/ORGANIZATION.md +++ b/ORGANIZATION.md @@ -40,6 +40,9 @@ its conventional use, are marked with a ⟂ symbol: readme](https://github.com/JuliaAI/MLJBase.jl) for a detailed description of MLJBase's contents. +* [StatisticalMeasures.jl](https://github.com/JuliaAI/StatisticalMeasures.jl) provifes + performance measures (metrics) such as losses and scores. + * [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl) hosts the *MLJ model registry*, which contains metadata on all the models the MLJ user can search and load from MLJ. Moreover, it From b5f01c0aae1583d687e91e081eb7a439e4869f3c Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 22 Sep 2023 11:25:34 +1200 Subject: [PATCH 04/16] add measures migration guide to docs --- docs/src/performance_measures.md | 123 +++++++++++++++++++++++++++++-- 1 file changed, 115 insertions(+), 8 deletions(-) diff --git a/docs/src/performance_measures.md b/docs/src/performance_measures.md index ab4b6f516..ba8a6a8e0 100644 --- a/docs/src/performance_measures.md +++ b/docs/src/performance_measures.md @@ -1,16 +1,21 @@ # Performance Measures -**Quick link:** [List of aliases of all -measures](https://juliaai.github.io/StatisticalMeasures.jl/dev/auto_generated_list_of_measures/#aliases) +## Quick links + +- [List of aliases of all + measures](https://juliaai.github.io/StatisticalMeasures.jl/dev/auto_generated_list_of_measures/#aliases) + +- [Changes to measures in MLJBase 1.0](@ref) + +## Introduction In MLJ loss functions, scoring rules, confusion matrices, sensitivities, etc, are collectively referred to as *measures*. These measures are provided by the package -[StatisticalMeasures.jl](https://juliaai.github.io/StatisticalMeasures.jl/dev/). As this -package is a dependency of MLJ, and all its methods are re-exported, the measures are -immediately available to the MLJ user. Commonly measures are passed to MLJ meta-algorithms -(see [Uses of measures](@ref) below) but to learn how to call measures directly, see -[this](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/) -StatisticalMeasures.jl tutorial. +[StatisticalMeasures.jl](https://juliaai.github.io/StatisticalMeasures.jl/dev/) but are +immediately available to the MLJ user. Commonly, measures are passed to MLJ +meta-algorithms (see [Uses of measures](@ref) below) but to learn how to call measures +directly, see the StatisticalMeasures.jl +[tutorial](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/). A list of all measures ready to use after running `using MLJ` or `using StatisticalMeasures`, is @@ -55,3 +60,105 @@ A related performance evaluation tool provided by StatisticalMeasures.jl, and he ```@docs roc_curve ``` + +## Changes to measures in MLJBase 1.0 + +Prior to MLJBase.jl 1.0 (respectivey, MLJ.jl version 0.19.6) measures were defined in +MLJBase.jl (a dependency of MLJ.jl) but now they are provided by MLJ.jl dependency +[StatisticalMeasures](https://juliaai.github.io/StatisticalMeasures.jl/dev/). The effects +on users is detailed below: + + +### Breaking behavior relevant to many users + +- If `using MLJBase` without MLJ, then, in Julia 1.9 or higher, `StatisticalMeasures` must + be explicitly imported to use measures that were previously part of MLJBase. 
If `using + MLJ`, then all previous measures are still available. + +- All measures return a *single* aggregated measurement. In other words, measures + previously reporting a measurement *per-observation* (previously subtyping + `Unaggregated`) no longer do so. To get per-observation measurements, use the new method + `measurements(measure, ŷ, y[, weights, class_weights])`. + +- The default measure for regression models (used in `evaluate/evaluate!` when `measures` + is unspecified) is changed from `rms` to `l2=LPLoss(2)` (mean sum of squares). + +- Measures that previously skipped `NaN` values will now (at least by default) propagate + those values. Missing value behavior is unchanged, except some measures that + previously did not support `missing` now do. + +- Aliases for measure *types* have been removed. For example `RMSE` (alias for + `RootMeanSquaredError`) is gone. Aliases for instances, such as `rms` and + `cross_entropy` persist. The exception is `precision`, for which `ppv` can + be used in its place. (This is to avoid conflict with `Base.precision`, which was + previously pirated.) + +- `info(measure)` has been decommissioned; query docstrings or access the new [measure + traits](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/methods/#Traits) + individually instead. These traits are now provided by StatisticalMeasures.jl and not + are not exported. For example, to access the orientation of the measure `rms`, do + `import StatisticalMeasures as SM; SM.orientation(rms)`. + +- Behavior of the `measures()` method, to list all measures and associated traits, has + changed. It now returns a dictionary instead of a vector of named tuples; + `measures(predicate)` is decommissioned, but `measures(needle)` is preserved. (This + method, owned by StatisticalMeasures.jl, has some other search options, but is + experimental.) + +- Measures that were wraps of losses from LossFunctions.jl are no longer exposed by + MLJBase or MLJ. To use such a loss, you must explicitly `import LossFunctions` and wrap + the loss appropriately. See [Using losses from + LossFunctions.jl](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/#Using-losses-from-LossFunctions.jl) + for examples. + +- Some user-defined measures working in previous versions of MLJBase.jl may not work + without modification, as they must conform to the new [StatisticalMeasuresBase.jl + API](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions). See + [this tutorial](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/tutorial/) on + how define new measures. + +- Measures with a "feature argument" `X`, as in `some_measure(ŷ, y, X)`, are no longer + supported. See [What is a + measure?](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions) + for allowed signatures in measures. + +## Breaking behavior likely relevant only to developers of some client packages + +- The abstract measure types `Aggregated`, `Unaggregated`, `Measure` have been + decommissioned. (A measure is now defined purely by its [calling + behavior](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions).) + +- What were previously exported as measure types are now only constructors. + +- `target_scitype(measure)` is decommissioned. Related is + `StatisticalMeasures.observation_scitype(measure)` which declares an upper bound on the + allowed scitype *of a single observation*. 
+ +- `prediction_type(measure)` is decommissioned. Instead use + `StatisticalMeasures.kind_of_proxy(measure)`. + +- The trait `reports_each_observation` is decommissioned. Related is + `StatisticalMeasures.can_report_unaggregated`; if `false` the new `measurements` method + simply returns `n` copies of the aggregated measurement, where `n` is the number of + observations provided, instead of individual observation-dependent measurements. + +- `aggregation(measure)` has been decommissioned. Instead use + `StatisticalMeasures.external_mode_of_aggregation(measure)`. + +- `instances(measure)` has been decommissioned; query docstrings for measure aliases, or + follow this example: `aliases = measures()[RootMeanSquaredError].aliases`. + +- `is_feature_dependent(measure)` has been decommissioned. Measures consuming feature data + are not longer supported; see above. + +- `distribution_type(measure)` has been decommissioned. + +- `docstring(measure)` has been decommissioned. + +- Behavior of `aggregate` [has changed](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/methods/#StatisticalMeasuresBase.aggregate). + +- The following traits, previously exported by MLJBase and MLJ, cannot be applied to + measures: `supports_weights`, `supports_class_weights`, `orientation`, + `human_name`. Instead use the traits with these names provided by + StatisticalMeausures.jl (they will need to be qualified, as in `import + StatisticalMeasures; StatisticalMeasures.orientation(measure)`). From f394bc2c60baa505d5a4db6ea34f656f3fc5803c Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 22 Sep 2023 11:41:34 +1200 Subject: [PATCH 05/16] update migration guide --- docs/src/performance_measures.md | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/docs/src/performance_measures.md b/docs/src/performance_measures.md index ba8a6a8e0..529e8c943 100644 --- a/docs/src/performance_measures.md +++ b/docs/src/performance_measures.md @@ -5,7 +5,7 @@ - [List of aliases of all measures](https://juliaai.github.io/StatisticalMeasures.jl/dev/auto_generated_list_of_measures/#aliases) -- [Changes to measures in MLJBase 1.0](@ref) +- [Migration guide for changes to measures in MLJBase 1.0](@ref) ## Introduction @@ -61,12 +61,12 @@ A related performance evaluation tool provided by StatisticalMeasures.jl, and he roc_curve ``` -## Changes to measures in MLJBase 1.0 +## Migration guide for changes to measures in MLJBase 1.0 Prior to MLJBase.jl 1.0 (respectivey, MLJ.jl version 0.19.6) measures were defined in MLJBase.jl (a dependency of MLJ.jl) but now they are provided by MLJ.jl dependency -[StatisticalMeasures](https://juliaai.github.io/StatisticalMeasures.jl/dev/). The effects -on users is detailed below: +[StatisticalMeasures](https://juliaai.github.io/StatisticalMeasures.jl/dev/). Effects +on users are detailed below: ### Breaking behavior relevant to many users @@ -121,8 +121,28 @@ on users is detailed below: supported. See [What is a measure?](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions) for allowed signatures in measures. + +### Packages implementing the MLJ model interface + +The migration of measures is not expected to require any changes to the source code in +packges providing implementations of the MLJ model interface (MLJModelInterface.jl) such +as MLJDecisionTreeInterface.jl and MLJFlux.jl, and this is confirmed by extensive +integration tests. However, some current tests will fail, if they use MLJBase +measures. 
The following should generally suffice to adapt such tests: + +- Add StatisticalMeasures as test dependency, and add `using StatisticalMeasures` to your + `runtests.jl` (and/or included submodules). + +- If measures are qualified, as in `MLJBase.rms`, then the qualification must be removed + or changed to `StatisticalMeasures.rms`, etc. + +- Be aware that the default measure used in methods such as `evaluate!`, when `measure` is + not specified, is changed from `rms` to `l2` for regression models. + +- Be aware of that all measures now report a measurement for every observation, and never + an aggregate. See second point above. -## Breaking behavior likely relevant only to developers of some client packages +### Rarely relevant breaking behavior - The abstract measure types `Aggregated`, `Unaggregated`, `Measure` have been decommissioned. (A measure is now defined purely by its [calling From 8a2471723b6b0b20b744c13e52c8e5a024a201e3 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 22 Sep 2023 14:08:47 +1200 Subject: [PATCH 06/16] improve a title in docs --- docs/src/learning_networks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/learning_networks.md b/docs/src/learning_networks.md index a8f30398b..46e688941 100644 --- a/docs/src/learning_networks.md +++ b/docs/src/learning_networks.md @@ -240,7 +240,7 @@ data). We demonstrate the process by way of examples of increasing complexity: - [Example A - Mini-pipeline](@ref) - [More on replacing models with symbols](@ref) - [Example B - Multiple operations: transform and inverse transform](@ref) -- [Example C - Exposing internal network state in reports](@ref) +- [Example C - Blending predictions and exposing internal network state in reports](@ref) - [Example D - Multiple nodes pointing to the same machine](@ref) - [Example E - Coupling component model hyper-parameters](@ref) - [More on defining new nodes](@ref) @@ -428,7 +428,7 @@ W = transform(mach, X) @assert inverse_transform(mach, W) ≈ X ``` -### Example C - Exposing internal network state in reports +### Example C - Blending predictions and exposing internal network state in reports The code below defines a new composite model type `CompositeC` that predicts by taking the weighted average of two regressors, and additionally exposes, in the model's report, a From 2178c10bd162bd0585ff6caac15c46cae27c97db Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 22 Sep 2023 14:56:53 +1200 Subject: [PATCH 07/16] add comment in manual to address comment in #1036 --- docs/src/generating_synthetic_data.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/src/generating_synthetic_data.md b/docs/src/generating_synthetic_data.md index f999d91f9..0349f0861 100644 --- a/docs/src/generating_synthetic_data.md +++ b/docs/src/generating_synthetic_data.md @@ -1,5 +1,9 @@ # Generating Synthetic Data +Here *synthetic data* means artificially generated data, with no reference to a "real +world" data set. Not to be confused "fake data" obtained by resampling from a distribution +fit to some actual real data. + MLJ has a set of functions - `make_blobs`, `make_circles`, `make_moons` and `make_regression` (closely resembling functions in [scikit-learn](https://scikit-learn.org/stable/datasets/index.html#generated-datasets) From cf21943c4973ef9797bfb4297577400144fc4fa3 Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Fri, 22 Sep 2023 15:48:09 +1200 Subject: [PATCH 08/16] more doc fixes --- docs/make.jl | 2 - docs/src/common_mlj_workflows.md | 6 +- docs/src/evaluating_model_performance.md | 2 +- docs/src/getting_started.md | 18 +- docs/src/julia_blogpost.md | 227 ----------------------- docs/src/known_issues.md | 32 ---- docs/src/weights.md | 20 +- 7 files changed, 14 insertions(+), 293 deletions(-) delete mode 100644 docs/src/julia_blogpost.md delete mode 100644 docs/src/known_issues.md diff --git a/docs/make.jl b/docs/make.jl index f646377a5..b71fb7094 100755 --- a/docs/make.jl +++ b/docs/make.jl @@ -87,9 +87,7 @@ pages = [ "Third Party Packages" => "third_party_packages.md", "Glossary" => "glossary.md", "MLJ Cheatsheet" => "mlj_cheatsheet.md", - "Known Issues" => "known_issues.md", "FAQ" => "frequently_asked_questions.md", - "Julia BlogPost" => "julia_blogpost.md", "Index of Methods" => "api.md", ] diff --git a/docs/src/common_mlj_workflows.md b/docs/src/common_mlj_workflows.md index ce66037a6..2ecc3ceda 100644 --- a/docs/src/common_mlj_workflows.md +++ b/docs/src/common_mlj_workflows.md @@ -220,7 +220,7 @@ Fit on the train data set and evaluate on the test data set: ```@example workflows fit!(mach, rows=train) yhat = predict(mach, X[test,:]) -mean(LogLoss(tol=1e-4)(yhat, y[test])) +LogLoss(tol=1e-4)(yhat, y[test]) ``` Note `LogLoss()` has aliases `log_loss` and `cross_entropy`. @@ -451,7 +451,7 @@ transformation/inverse transformation: ```@example workflows X, y = @load_reduced_ames KNN = @load KNNRegressor -knn_with_target = TransformedTargetModel(model=KNN(K=3), target=Standardizer()) +knn_with_target = TransformedTargetModel(model=KNN(K=3), transformer=Standardizer()) pipe = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> knn_with_target ``` @@ -476,7 +476,7 @@ target transformation/inverse transformation: ```@example workflows Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0 tree_with_target = TransformedTargetModel(model=Tree(), - target=y -> log.(y), + transformer=y -> log.(y), inverse = z -> exp.(z)) pipe2 = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> tree_with_target; nothing # hide diff --git a/docs/src/evaluating_model_performance.md b/docs/src/evaluating_model_performance.md index af6e86864..645962892 100644 --- a/docs/src/evaluating_model_performance.md +++ b/docs/src/evaluating_model_performance.md @@ -74,7 +74,7 @@ evaluate!( ) ``` -In classification problems, a class weight dictionary can be supplied instead. +In classification problems, use `class_weights=...` to specify a class weight dictionary. ```@docs MLJBase.evaluate! diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index bd104f182..e6a9ed3d6 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -191,7 +191,7 @@ train, test = partition(eachindex(y), 0.7); # 70:30 split fit!(mach, rows=train); yhat = predict(mach, X[test,:]); yhat[3:5] -log_loss(yhat, y[test]) |> mean +log_loss(yhat, y[test]) ``` Note that `log_loss` and `cross_entropy` are aliases for `LogLoss()` @@ -350,23 +350,11 @@ MLJ models expecting a table do not generally accept a matrix instead. 
However, a matrix can be wrapped as a table, using [`MLJ.table`](@ref): -```julia -matrix_table = MLJ.table(rand(2,3)) +```@repl doda +matrix_table = MLJ.table(rand(2,3)); schema(matrix_table) ``` -``` -┌─────────┬─────────┬────────────┐ -│ _.names │ _.types │ _.scitypes │ -├─────────┼─────────┼────────────┤ -│ x1 │ Float64 │ Continuous │ -│ x2 │ Float64 │ Continuous │ -│ x3 │ Float64 │ Continuous │ -└─────────┴─────────┴────────────┘ -_.nrows = 2 - -``` - The matrix is *not* copied, only wrapped. To manifest a table as a matrix, use [`MLJ.matrix`](@ref). diff --git a/docs/src/julia_blogpost.md b/docs/src/julia_blogpost.md deleted file mode 100644 index cad97e02a..000000000 --- a/docs/src/julia_blogpost.md +++ /dev/null @@ -1,227 +0,0 @@ -!!! warning "Old post" - - This post is quite old. For a newer overview of the design of MLJ, see [here](https://github.com/alan-turing-institute/MLJ.jl/blob/master/paper/paper.md) - - -# Beyond machine learning pipelines with MLJ - -Anthony Blaom, Diego Arenas, Franz Kiraly, Yiannis Simillides, Sebastian Vollmer - -**May 1st, 2019.** Blog post also posted on the [Julia Language Blog](https://julialang.org/blog/2019/05/beyond-ml-pipelines-with-mlj) - - - - -![](img/learningcurves.png) | ![](img/heatmap.png) -------------------------|-------------------------- -![](img/wrapped_ridge.png) | ![](img/MLPackages.png) - - -## Introducing MLJ - -[MLJ](https://github.com/alan-turing-institute/MLJ.jl) is an -open-source machine learning toolbox written in pure Julia. It -provides a uniform interface for interacting with supervised and -unsupervised learning models currently scattered in different Julia -packages. - -Building on a earlier proof-of-concept, development began in earnest -at [The Alan Turing Institute](https://www.turing.ac.uk) in -December 2018. In a short time interest grew and the project is now -the Institute's most starred software repository. - -After outlining MLJ's current functionality, this post introduces MLJ -**learning networks**, a super-charged pipelining feature for model -composition. - -**Quick links:** - -- [MLJ vs ScikitLearn.jl](https://alan-turing-institute.github.io/MLJ.jl/dev/frequently_asked_questions/) - -- Video from [London Julia User Group meetup in March 2019](https://www.youtube.com/watch?v=CfHkjNmj1eE) (skip to [demo at 21'39](https://youtu.be/CfHkjNmj1eE?t=21m39s))   - -- [Learning MLJ](@ref) - -- Implementing the MLJ interface for a [new model](https://alan-turing-institute.github.io/MLJ.jl/dev/adding_models_for_general_use/) - -- How to [contribute](https://github.com/alan-turing-institute/MLJ.jl/blob/master/CONTRIBUTE.md) - -- Julia [Slack](http://julialang.slack.com) channel: \#mlj. - -- Star'ing us to show support for [MLJ](https://github.com/alan-turing-institute/MLJ.jl) would be greatly appreciated! - - -## MLJ features - -MLJ already has substantial functionality: - -- **Learning networks.** Flexible model composition beyond traditional - pipelines (more on this below). - -- **Automatic tuning.** Automated tuning of hyperparameters, including - composite models. Tuning implemented as a model wrapper for - composition with other meta-algorithms. - -- **Homogeneous model ensembling.** - -- **Registry for model metadata.** Metadata available without loading - model code. Basis of a "task" interface and facilitates - model composition. 
- -- **Task interface.** Automatically match models to specified learning - tasks, to streamline benchmarking and model selection. - -- **Clean probabilistic API.** Improves support for Bayesian - statistics and probabilistic graphical models. - -- **Data container agnostic.** Present and manipulate data in your - favorite Tables.jl format. - -- **Universal adoption of categorical data types.** Enables model - implementations to properly account for classes seen in training but - not in evaluation. - -Enhancements planned for the near future include integration of -Flux.jl **deep learning** models, and **gradient descent tuning** of -continuous hyperparameters using automatic differentiation. - -While a relatively small number of machine learning models currently -implement the MLJ interface, work in progress aims to wrap models -supported by the popular python framework, scikit-learn, as a -temporary expedient. For a comparison of the MLJ's design with the -Julia wrap [ScitLearn.jl](https://github.com/cstjean/ScikitLearn.jl), -see this -[FAQ](https://github.com/alan-turing-institute/MLJ.jl/blob/master/docs/src/frequently_asked_questions.md). - - -## Learning networks - -MLJ's model composition interface is flexible enough to implement, for -example, the [model -stacks](https://www.kdnuggets.com/2017/02/stacking-models-imropved-predictions.html) -popular in data science competitions. To treat examples of this kind, -the interface design must account for the fact that information flow -in prediction and training modes is different. This can be seen from -the following schematic of a simple two-model stack, viewed as a -network: - -![](img/two_model_stack.png) - -## Building a simple network - -In MLJ, networks of models are built using a declarative syntax -already familiar from basic use of the package. For example, the -ordinary syntax for training a decision tree in MLJ, after one-hot -encoding the categorical features, looks like this: - -```julia -using MLJ -@load DecisionTreeRegressor - -# load some data: -task = load_reduced_ames(); -X, y = task(); - -# one-hot encode the inputs, X: -hot_model = OneHotEncoder() -hot = machine(hot_model, X) -fit!(hot) -Xt = transform(hot, X) - -# fit a decision tree to the transformed data: -tree_model = DecisionTreeRegressor() -tree = machine(tree_model, Xt, y) -fit!(tree, rows = 1:1300) -``` - -Note that a *model* in MLJ is just a struct containing -hyperparameters. Wrapping a model in data delivers a *machine* struct, -which will additionally record the results of training. - -Without a pipeline, each time we want to present new data for -prediction we must first apply one-hot encoding: - -```julia -Xnew = X[1301:1400,:]; -Xnewt = transform(hot, Xnew); -yhat = predict(tree, Xnewt); -yhat[1:3] - 3-element Array{Float64,1}: - 223956.9999999999 - 320142.85714285733 - 161227.49999999994 -``` - -To build a pipeline one simply wraps the supplied data in source nodes -and repeats similar declarations, omitting calls to -`fit!`. 
The difference now is that each "variable" (e.g., `Xt`, -`yhat`) is a node of our pipeline, instead of concrete data: - -```julia -Xs = source(X) -ys = source(y) - -hot = machine(hot_model, Xs) -Xt = transform(hot, Xs); - -tree = machine(tree_model, Xt, ys) -yhat = predict(tree, Xt) -``` - -If we like, we can think of a node as *dynamic data* - "data" because -it can be called (indexed) on rows, but "dynamic" because the result -depends on the outcome of training events, which in turn depend on -hyperparameter values. For example, after fitting the completed pipeline, -we can make new predictions like this: - -```julia -fit!(yhat, rows=1:1300) - [ Info: Training NodalMachine @ 1…51. - [ Info: Spawned 1300 sub-features to one-hot encode feature :Neighborhood. - [ Info: Spawned 1300 sub-features to one-hot encode feature :MSSubClass. - [ Info: Training NodalMachine @ 1…17. - Node @ 1…79 = predict(1…17, transform(1…51, 1…07)) - -yhat(rows=1301:1302) # to predict on rows of source node -yhat(Xnew) # to predict on new data -156-element Array{Float64,1}: - 223956.9999999999 - 320142.85714285733 - ... -``` - - -## Exporting and retraining - -Once a pipeline like this has been built and tested on sample data, it -can be exported as a stand-alone model, ready to be trained on any -dataset. For details, see the MLJ -[documentation](https://alan-turing-institute.github.io/MLJ.jl/dev/learning_networks/). In -the future, Julia macros will allow common architectures (e.g., linear -pipelines) to be built in a couple of lines. - -Finally, we mention that MLJ learning networks, and their exported -counterparts, are "smart" in the sense that changing a hyperparameter -does not trigger retraining of component models upstream of the -change: - -```julia -tree_model.max_depth = 4 -fit!(yhat, rows=1:1300) - [ Info: Not retraining NodalMachine @ 1…51. It is up-to-date. - [ Info: Updating NodalMachine @ 1…17. - Node @ 1…79 = predict(1…17, transform(1…51, 1…07)) -``` - - -## Just "Write the math!" - -Because of Julia's generic programming features, any kind of operation -you would normally apply to data (arithmetic, row selection, column -concatenation, etc) can be overloaded to work with nodes. In this way, -MLJ's network-building syntax is economical, intuitive and easy to -read. In this respect we have been inspired by [On Machine Learning -and Programming Languages](https://julialang.org/blog/2017/12/ml&pl). - -## Invitation to the community -We now invite the community to try out our newly registered packages, [MLJ](https://github.com/alan-turing-institute/MLJ.jl)alongside [MLJModels](https://github.com/JuliaAI/MLJModels.jl), and provide any feedback or suggestions you may have going forward. We are also particularly interested in hearing how you would use our package, and what features it may be lacking. diff --git a/docs/src/known_issues.md b/docs/src/known_issues.md deleted file mode 100644 index 807127dd9..000000000 --- a/docs/src/known_issues.md +++ /dev/null @@ -1,32 +0,0 @@ -# Known Issues - -Routine issues are posted -[here](https://github.com/alan-turing-institute/MLJ.jl/issues). Below -are some longer term issues and limitations. - -#### ScikitLearn/MKL issue - -For users of Mac OS using Julia 1.3 or higher, using ScikitLearn -models can lead to unexpected MKL errors due to an issue not related -to MLJ. 
See -[this Julia Discourse discussion](https://discourse.julialang.org/t/julia-1-3-1-4-on-macos-and-intel-mkl-error/36469/2) -and -[this issue](https://github.com/JuliaPackaging/BinaryBuilder.jl/issues/700) -for context. - -A temporary workaround for this issue is to force the installation of -an older version of the `OpenSpecFun_jll` library. To install an -appropriate version, activate your MLJ environment and run - -```julia - using Pkg; - Pkg.add(PackageSpec(url="https://github.com/tlienart/OpenSpecFun_jll.jl")) -``` - -#### Serialization for composite models with component models with custom serialization - -See -[here](https://github.com/alan-turing-institute/MLJ.jl/issues/678). Workaround: -Instead of `XGBoost` models (the chief known case) use models from the -pure Julia package `EvoTrees`. - diff --git a/docs/src/weights.md b/docs/src/weights.md index 202d7dd5f..3789faf21 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -43,7 +43,7 @@ The model `model` supports class weights if ## Specifying weights in performance evaluation -When calling an MLJ measure (metric) that supports weights, provide the +When calling a measure (metric) that supports weights, provide the weights as the last argument, as in ```julia @@ -53,18 +53,12 @@ w = Dict("versicolor" => 1, "setosa" => 2, "virginica"=> 3) macro_f1score(ŷ, y, w) ``` -You can use `supports_weights` and `supports_class_weights` on -measures to check weight support. For example, to list all measures -supporting per observation weights, do +Some measures also support specification of a class weight dictionary. For details see the +StatisticalMeasures.jl +[tutorial](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/). -```julia -measures() do m - m.supports_weights -end -``` +To pass weights to all the measures listed in an [`evaluate!`](@ref)/[`evaluate`](@ref) +call, use the keyword specifiers `weights=...` or `class_weights=...`. For details, see +[Evaluating Model Performance](@ref). -See also [Evaluating Model Performance](@ref). -To pass weights to all the measures listed in an `evaluate!/evaluate` -call, use the keyword specifiers `weights=...` or -`class_weights=...`. For details, see [`evaluate!`](@ref). From eb507f91f57353ba5cbcb1ac1c1f1a7ae6392c1a Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Mon, 25 Sep 2023 17:48:43 +1300 Subject: [PATCH 09/16] update doc --- docs/src/performance_measures.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/performance_measures.md b/docs/src/performance_measures.md index 529e8c943..b9647bc61 100644 --- a/docs/src/performance_measures.md +++ b/docs/src/performance_measures.md @@ -73,7 +73,8 @@ on users are detailed below: - If `using MLJBase` without MLJ, then, in Julia 1.9 or higher, `StatisticalMeasures` must be explicitly imported to use measures that were previously part of MLJBase. If `using - MLJ`, then all previous measures are still available. + MLJ`, then all previous measures are still available, with the exception of those + corresponding to LossFunctions.jl (see below). - All measures return a *single* aggregated measurement. In other words, measures previously reporting a measurement *per-observation* (previously subtyping From 6328b5b5fa5e801ede5898ffc0eb82fa5125bb56 Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Mon, 25 Sep 2023 17:49:46 +1300 Subject: [PATCH 10/16] another update --- docs/src/performance_measures.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/performance_measures.md b/docs/src/performance_measures.md index b9647bc61..a50469368 100644 --- a/docs/src/performance_measures.md +++ b/docs/src/performance_measures.md @@ -79,7 +79,7 @@ on users are detailed below: - All measures return a *single* aggregated measurement. In other words, measures previously reporting a measurement *per-observation* (previously subtyping `Unaggregated`) no longer do so. To get per-observation measurements, use the new method - `measurements(measure, ŷ, y[, weights, class_weights])`. + `StatisticalMeasures.measurements(measure, ŷ, y[, weights, class_weights])`. - The default measure for regression models (used in `evaluate/evaluate!` when `measures` is unspecified) is changed from `rms` to `l2=LPLoss(2)` (mean sum of squares). From 83db2bafc4ad53055eafc265b6a6954fb13f26db Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 26 Sep 2023 13:59:03 +1300 Subject: [PATCH 11/16] add docstring for MLJ module --- docs/src/getting_started.md | 5 ++++- src/MLJ.jl | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index e6a9ed3d6..de31ba32b 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -5,7 +5,10 @@ For an outline of MLJ's **goals** and **features**, see This page introduces some MLJ basics, assuming some familiarity with machine learning. For a complete list of other MLJ learning resources, -see [Learning MLJ](@ref). +see [Learning MLJ](@ref). + +MLJ collects together the functionality provided by mutliple packages. To learn how to +install components separately, run `using MLJ; @doc MLJ`. This section introduces only the most basic MLJ operations and concepts. It assumes MLJ has been successfully installed. See diff --git a/src/MLJ.jl b/src/MLJ.jl index 6c6bc002a..f126b1b1b 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -1,3 +1,34 @@ +""" + MLJ + +[`MLJ`](https://alan-turing-institute.github.io/MLJ.jl/dev/) is a Machine Learning toolbox +for Julia. It collects together functionality from the following packages, which can be +loaded separately: + +- MLJBase.jl: The `machine` interface, tools to `partition` and `unpack` datasets, + `evaluate`/`evaluate!` for model performance, `|>` pipeline syntax, + `TransformedTargetModel` wrapper, general model composition syntax (learning networks), + synthetic data generators, `scitype` and `schema` methods (from ScientificTypes.jl) for + checking how MLJ interprets your data + +- StatisticalMeasures.jl: MLJ-compatible measures (metrics) for machine learning, + confusion matrices, ROC curves + +- MLJModels.jl: Common transformers for data preprocessing, searching the model registry, + loading models with `@load` + +- MLJTuning.jl: Hyperparameter optimization via `TunedModel` wrapper + +- MLJIteration.jl: `IteratedModel` Wrapper for controlling iterative models + +- MLJEnsembles.jl: Homogeneous model ensembling, via the `EnsembleModel` wrapper + +- MLJBalancing.jl: Incorporation of oversampling/undersampling methods in pipelines, via + the `BalancedModel` wrapper + +- OpenML.jl: Tool for grabbing datasets from OpenML.org + +""" module MLJ From 92a278c2cf567f90004061769f3491f81dea29ca Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Tue, 26 Sep 2023 15:20:39 +1300 Subject: [PATCH 12/16] bump compat for MLJEnsembles, MLJFlow, MLJTuning, MLJIteration --- Project.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index 60b6d8baf..ba912c1b2 100644 --- a/Project.toml +++ b/Project.toml @@ -31,11 +31,11 @@ CategoricalArrays = "0.8,0.9, 0.10" ComputationalResources = "0.3" Distributions = "0.21,0.22,0.23, 0.24, 0.25" MLJBase = "0.21.14" -MLJEnsembles = "0.3" -MLJFlow = "0.1" -MLJIteration = "0.5" +MLJEnsembles = "0.4" +MLJFlow = "0.2" +MLJIteration = "0.6" MLJModels = "0.16" -MLJTuning = "0.7" +MLJTuning = "0.8" OpenML = "0.2,0.3" ProgressMeter = "1.1" Reexport = "1.2" From 185ddebb20acee2c2d1f1a625a392d2c95bf7a8a Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 26 Sep 2023 15:25:36 +1300 Subject: [PATCH 13/16] bump [compat] MLJBase = "1" --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index ba912c1b2..d65f8aca1 100644 --- a/Project.toml +++ b/Project.toml @@ -30,7 +30,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" CategoricalArrays = "0.8,0.9, 0.10" ComputationalResources = "0.3" Distributions = "0.21,0.22,0.23, 0.24, 0.25" -MLJBase = "0.21.14" +MLJBase = "1" MLJEnsembles = "0.4" MLJFlow = "0.2" MLJIteration = "0.6" From d7a4dfa7df0d2790982fec036eb478cb903f3205 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 27 Sep 2023 13:29:49 +1300 Subject: [PATCH 14/16] doc fixes, including to close #1040 --- docs/Project.toml | 12 +- docs/make.jl | 9 +- docs/model_docstring_tools.jl | 20 +++- docs/src/about_mlj.md | 2 - docs/src/common_mlj_workflows.md | 4 +- docs/src/evaluating_model_performance.md | 4 +- docs/src/getting_started.md | 2 +- docs/src/learning_curves.md | 4 +- docs/src/machines.md | 13 +-- docs/src/performance_measures.md | 8 +- docs/src/transformers.md | 135 +++++++++++++---------- docs/src/tuning_models.md | 20 ++-- 12 files changed, 131 insertions(+), 102 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index ee3cd3f2e..7c38b6b61 100755 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -8,11 +8,11 @@ EarlyStopping = "792122b4-ca99-40de-a6bc-6742525f08b6" EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" IterationControl = "b3c1a2ee-3fec-4384-bf48-272ea71de57c" -LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" MLJClusteringInterface = "d354fa79-ed1c-40d4-88ef-b8c7bd1568af" MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661" MLJEnsembles = "50ed68f4-41fd-4504-931a-ed422449fee0" +MLJFlow = "7b7b8358-b45c-48ea-a8ef-7ca328ad328f" MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c" MLJIteration = "614be32b-d00c-4edb-bd02-1eb411ab5e55" MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692" @@ -25,16 +25,10 @@ NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161" +StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" [compat] -CategoricalDistributions = "0.1" -Documenter = "0.27" -MLJEnsembles = "0.3" -MLJIteration = "0.5" -MLJModels = "0.16" -MLJTuning = "0.7" -ScientificTypes = "3" -ScientificTypesBase = "3" +Documenter = "1" julia = 
"1.6" diff --git a/docs/make.jl b/docs/make.jl index b71fb7094..b89c7fc03 100755 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,11 +14,12 @@ import MLJModels import MLJEnsembles import ScientificTypes import MLJModelInterface +import ScientificTypes import ScientificTypesBase import Distributions using CategoricalArrays -using LossFunctions import CategoricalDistributions +import StatisticalMeasures const MMI = MLJModelInterface @@ -107,12 +108,14 @@ makedocs( ScientificTypes, MLJModelInterface, ScientificTypesBase, + StatisticalMeasures, MLJIteration, EarlyStopping, IterationControl, - CategoricalDistributions], + CategoricalDistributions, + StatisticalMeasures], pages = pages, - strict = Documenter.except(:cross_references, :missing_docs), + warnonly = Documenter.except(:cross_references, :missing_docs), ) @info "`makedocs` has finished running. " diff --git a/docs/model_docstring_tools.jl b/docs/model_docstring_tools.jl index 0edf63f49..f4e399f78 100644 --- a/docs/model_docstring_tools.jl +++ b/docs/model_docstring_tools.jl @@ -2,6 +2,22 @@ const PATH_TO_MODEL_DOCS = joinpath(@__DIR__, "src", "models") +""" + remove_doc_refs(str::AbstractString) + +Removes `@ref` references from `str. For example, a substring of the form +"[`some.thing_like_this123!`](@ref)" is replaced with "`some.thing_like_this123!`". + +""" +function remove_doc_refs(page) + regex = r"\[(['\.\d`\!\_a-zA-Z]*)\]\(\@ref\)" + while contains(page, regex) + # replace the first matched regex with the captured string + page = replace(page, regex => s"\1") + end + page +end + demote_headings(str) = replace(str, "# "=>"## ") handle(model) = model.name*"_"*model.package_name @@ -25,7 +41,7 @@ function write_page(model; path=PATH_TO_MODEL_DOCS) open(pagepath, "w") do stream header = "# [$(model.name)](@id $id)\n\n" md_page = doc(model.name, pkg=model.package_name) - page = header*demote_headings(string(md_page)) + page = header*demote_headings(string(md_page)) |> remove_doc_refs write(stream, page) nothing end @@ -54,7 +70,7 @@ function models_missing_descriptors() handles = handle.(models()) filter(handles) do h !(h in HANDLES) - end + end end """ diff --git a/docs/src/about_mlj.md b/docs/src/about_mlj.md index 276cfefd5..f54896165 100755 --- a/docs/src/about_mlj.md +++ b/docs/src/about_mlj.md @@ -221,8 +221,6 @@ Bugs, suggestions, and feature requests can be posted Users are also welcome to join the `#mlj` Julia slack channel to ask questions and make suggestions. -See also, [Known Issues](@ref) - ## Installation diff --git a/docs/src/common_mlj_workflows.md b/docs/src/common_mlj_workflows.md index 2ecc3ceda..d3ce0bb3a 100644 --- a/docs/src/common_mlj_workflows.md +++ b/docs/src/common_mlj_workflows.md @@ -176,10 +176,10 @@ KNN = @load KNNRegressor knn = KNN() evaluate(knn, X, y, resampling=CV(nfolds=5), - measure=[RootMeanSquaredError(), MeanAbsoluteError()]) + measure=[RootMeanSquaredError(), LPLoss(1)]) ``` -Note `RootMeanSquaredError()` has alias `rms` and `MeanAbsoluteError()` has alias `mae`. +Note `RootMeanSquaredError()` has alias `rms` and `LPLoss(1)` has aliases `l1`, `mae`. Do `measures()` to list all losses and scores and their aliases. 
diff --git a/docs/src/evaluating_model_performance.md b/docs/src/evaluating_model_performance.md index 645962892..448283c57 100644 --- a/docs/src/evaluating_model_performance.md +++ b/docs/src/evaluating_model_performance.md @@ -37,7 +37,7 @@ data: ```@repl evaluation_of_supervised_models mach = machine(model, X, y) -evaluate!(mach, resampling=cv, measure=l2), verbosity=0) +evaluate!(mach, resampling=cv, measure=l2, verbosity=0) ``` (The latter call is a mutating call as the learned parameters stored in the @@ -53,7 +53,7 @@ evaluate!( resampling=cv, measures=[l1, rms, rmslp1], verbosity=0, -a) +) ``` [Custom measures](@ref) can also be provided. diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index de31ba32b..acda6f28b 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -344,7 +344,7 @@ scientific type `Table{K}`, where `K` depends on the scientific types of the col which can be individually inspected using `schema`: ```@repl doda -schema(X) +ScientificTypes.schema ``` #### Matrix data diff --git a/docs/src/learning_curves.md b/docs/src/learning_curves.md index d49a1f9d1..42847171a 100644 --- a/docs/src/learning_curves.md +++ b/docs/src/learning_curves.md @@ -24,7 +24,7 @@ r_lambda = range(ensemble, :(model.lambda), lower=1e-1, upper=100, scale=:log10) curve = MLJ.learning_curve(mach; range=r_lambda, resampling=CV(nfolds=3), - measure=MeanAbsoluteError()) + measure=l1) ``` ```julia using Plots @@ -52,7 +52,7 @@ atom.lambda= 7.3 r_n = range(ensemble, :n, lower=1, upper=50) curves = MLJ.learning_curve(mach; range=r_n, - measure=MeanAbsoluteError(), + measure=l1, verbosity=0, rng_name=:rng, rngs=4) diff --git a/docs/src/machines.md b/docs/src/machines.md index 8bfc9cd2c..68eb9cddc 100644 --- a/docs/src/machines.md +++ b/docs/src/machines.md @@ -93,8 +93,8 @@ report(mach) ``` ```@docs -fitted_params -report +fitted_params(::Machine) +report(::Machine) ``` ### Training losses and feature importances @@ -166,12 +166,9 @@ machine(model, X, y, cache=false) ### Constructing machines in learning networks -Instead of data `X`, `y`, etc, the `machine` constructor is provided -`Node` or `Source` objects ("dynamic data") when building a learning -network. See [Composing Models](composing_models.md) for more on this -advanced feature. One also uses `machine` to wrap a machine -around a whole learning network; see [Learning network -machines](@ref). +Instead of data `X`, `y`, etc, the `machine` constructor is provided `Node` or `Source` +objects ("dynamic data") when building a learning network. See [Learning Networks](@ref) +for more on this advanced feature. ## Saving machines diff --git a/docs/src/performance_measures.md b/docs/src/performance_measures.md index a50469368..f8d2697a7 100644 --- a/docs/src/performance_measures.md +++ b/docs/src/performance_measures.md @@ -20,8 +20,8 @@ directly, see the StatisticalMeasures.jl A list of all measures ready to use after running `using MLJ` or `using StatisticalMeasures`, is [here](https://juliaai.github.io/StatisticalMeasures.jl/dev/auto_generated_list_of_measures/). Alternatively, -call [`measures()`](@ref) (experimental) to generate a dictionary keyed on available -measure constructors, with measure metadata as values. +call [`measures()`](@ref StatisticalMeasures.measures) (experimental) to generate a +dictionary keyed on available measure constructors, with measure metadata as values. 
## Custom measures @@ -58,7 +58,7 @@ measures from that package must be explicitly imported and wrapped, as described A related performance evaluation tool provided by StatisticalMeasures.jl, and hence by MLJ, is the `roc_curve` method: ```@docs -roc_curve +StatisticalMeausures.roc_curve ``` ## Migration guide for changes to measures in MLJBase 1.0 @@ -83,6 +83,8 @@ on users are detailed below: - The default measure for regression models (used in `evaluate/evaluate!` when `measures` is unspecified) is changed from `rms` to `l2=LPLoss(2)` (mean sum of squares). + +- `MeanAbsoluteError` has been removed and instead `mae` is an alias for `LPLoss(p=1)`. - Measures that previously skipped `NaN` values will now (at least by default) propagate those values. Missing value behavior is unchanged, except some measures that diff --git a/docs/src/transformers.md b/docs/src/transformers.md index 2bdae8e0c..911fba82a 100644 --- a/docs/src/transformers.md +++ b/docs/src/transformers.md @@ -42,7 +42,7 @@ MLJModels.UnivariateTimeTypeToContinuous A *static transformer* is a model for transforming data that does not generalize to new data (does not "learn") but which nevertheless has hyperparameters. For example, the `DBSAN` clustering model from Clustering.jl can assign labels to some collection of -observations, cannot directly assign a label to some new observation. +observations, cannot directly assign a label to some new observation. The general user may define their own static models. The main use-case is insertion into a [Linear Pipelines](@ref) some parameter-dependent transformation. (If a static transformer @@ -73,86 +73,103 @@ Such static transformers with (unlearned) parameters can have arbitrarily many inputs, but only one output. In the single input case, an `inverse_transform` can also be defined. Since they have no real learned parameters, you bind a static transformer to a machine without -specifying training arguments. +specifying training arguments; there is no need to `fit!` the machine: ```@example boots -mach = machine(Averager(0.5)) |> fit! +mach = machine(Averager(0.5)) transform(mach, [1, 2, 3], [3, 2, 1]) ``` -Let's see how we can include our `Averager` in a learning network (see -[Composing Models](@ref)) to mix the predictions of two regressors, -with one-hot encoding of the inputs: +Let's see how we can include our `Averager` in a [learning network](@ref "Learning +Networks")) to mix the predictions of two regressors, with one-hot encoding of the +inputs. Here's some dummy data and component models to test our learning network: ```@example boots -X = source() -y = source() - ridge = (@load RidgeRegressor pkg=MultivariateStats)() knn = (@load KNNRegressor)() + +import Random.seed! 
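+# seed the global RNG so the dummy data below is reproducible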
+seed!(112) +X = ( + x1=coerce(rand("ab", 100), Multiclass), + x2=rand(100), +) +y = X.x2 + 0.05*rand(100) +schema(X) +``` + +And the learning network: + +```@example boots +Xs = source(X) +ys = source(y) + averager = Averager(0.5) -hotM = machine(OneHotEncoder(), X) -W = transform(hotM, X) # one-hot encode the input +mach0 = machine(OneHotEncoder(), Xs) +W = transform(mach0, Xs) # one-hot encode the input + +mach1 = machine(ridge, W, ys) +y1 = predict(mach1, W) -ridgeM = machine(ridge, W, y) -y1 = predict(ridgeM, W) +mach2 = machine(knn, W, ys) +y2 = predict(mach2, W) -knnM = machine(knn, W, y) -y2 = predict(knnM, W) +mach4= machine(averager) +yhat = transform(mach4, y1, y2) -averagerM= machine(averager) -yhat = transform(averagerM, y1, y2) +# test: +fit!(yhat) +Xnew = selectrows(X, 1:3) +yhat(Xnew) ``` -Now we export to obtain a `Deterministic` composite model and then -instantiate composite model +We next "export" the learning network as a standalone composite model type. First we need +a struct for the composite model. Since we are restricting to `Deterministic` component +regressors, the composite will also make deterministic predictions, and so gets the +supertype `DeterministicNetworkComposite`: -```julia -learning_mach = machine(Deterministic(), X, y; predict=yhat) -Machine{DeterministicSurrogate} @772 trained 0 times. - args: - 1: Source @415 ⏎ `Unknown` - 2: Source @389 ⏎ `Unknown` - - -@from_network learning_mach struct DoubleRegressor - regressor1=ridge - regressor2=knn - averager=averager - end - -composite = DoubleRegressor() -julia> composite = DoubleRegressor() -DoubleRegressor( - regressor1 = RidgeRegressor( - lambda = 1.0), - regressor2 = KNNRegressor( - K = 5, - algorithm = :kdtree, - metric = Distances.Euclidean(0.0), - leafsize = 10, - reorder = true, - weights = :uniform), - averager = Averager( - mix = 0.5)) @301 +```@example boots +mutable struct DoubleRegressor <: DeterministicNetworkComposite + regressor1 + regressor2 + averager +end +``` +As described in [Learning Networks](@ref), we next paste the learning network into a +`prefit` declaration, removing the test data and replacing the component models with +symbolic placeholders: + +```@example boots +function MLJBase.prefit(composite::DoubleRegressor, verbosity, X, y) + Xs = source() + ys = source() + + mach0 = machine(OneHotEncoder(), Xs) + W = transform(mach0, Xs) # one-hot encode the input + + mach1 = machine(:regressor1, W, ys) + y1 = predict(mach1, W) + + mach2 = machine(:regressor2, W, ys) + y2 = predict(mach2, W) + + mach4= machine(:averager) + yhat = transform(mach4, y1, y2) +end ``` -which can be can be evaluated like any other model: +The new model type can be evaluated like any other supervised model: -```julia +```@example boots +X, y = @load_reduced_ames +composite = DoubleRegressor(ridge, knn, Averager(0.5)) +``` + +```@example boots composite.averager.mix = 0.25 # adjust mix from default of 0.5 -julia> evaluate(composite, (@load_reduced_ames)..., measure=rms) -Evaluating over 6 folds: 100%[=========================] Time: 0:00:00 -┌───────────┬───────────────┬────────────────────────────────────────────────────────┐ -│ _.measure │ _.measurement │ _.per_fold │ -├───────────┼───────────────┼────────────────────────────────────────────────────────┤ -│ rms │ 26800.0 │ [21400.0, 23700.0, 26800.0, 25900.0, 30800.0, 30700.0] │ -└───────────┴───────────────┴────────────────────────────────────────────────────────┘ -_.per_observation = [missing] -_.fitted_params_per_fold = [ … ] -_.report_per_fold = [ … ] 
+evaluate(composite, X, y, measures=[l1, rmslp1])
 ```
 
 A static transformer can also expose byproducts of the transform computation in the report
diff --git a/docs/src/tuning_models.md b/docs/src/tuning_models.md
index 46c04c72e..76f234f18 100644
--- a/docs/src/tuning_models.md
+++ b/docs/src/tuning_models.md
@@ -215,23 +215,25 @@ predict(mach, rows=148:150)
 
 ### Specifying a custom measure
 
-Users may specify a custom loss or scoring function. Suppose, for
-example, we define a new scoring function `custom_accuracy` by
+Users may specify a custom loss or scoring function, so long as it complies with the
+StatisticalMeasuresBase.jl
+[API](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions)
+and implements the appropriate `orientation` trait (`Score()` or `Loss()`) from that
+package. For example, suppose we define a "new" scoring function `custom_accuracy` by
 
 ```@example goof
-custom_accuracy(y,yhat) = mean(y .== yhat);
+custom_accuracy(yhat, y) = mean(y .== yhat); # yhat - prediction, y - ground truth
 ```
 
-In tuning, scores are maximised, while losses are minimised. By
-default, a custom measure is assumed to be a loss rather than a score,
-so we must also declare
+In tuning, scores are maximised, while losses are minimised. So here we declare
 
 ```@example goof
-MLJ.orientation(::typeof(custom_accuracy)) = :score
+import StatisticalMeasuresBase as SMB
+SMB.orientation(::typeof(custom_accuracy)) = SMB.Score()
 ```
 
-For full details on constructing custom measures, see [Traits and custom
-measures](@ref).
+For full details on constructing custom measures, see
+[StatisticalMeasuresBase.jl](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/).
 
 
 ```@example goof

From 5e6c57460f13b780a352a731193906a00e9674e7 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Thu, 28 Sep 2023 11:42:34 +1300
Subject: [PATCH 15/16] more doc fixes and cleanup of model browser page generation

---
 docs/Project.toml                  |   1 +
 docs/make.jl                       |   3 +-
 docs/model_docstring_tools.jl      |   2 +-
 docs/src/common_mlj_workflows.md   |   2 +-
 docs/src/getting_started.md        |  31 ++------
 docs/src/performance_measures.md   |  53 ++++++++-----
 docs/src/target_transformations.md |  10 +--
 docs/src/transformers.md           |  21 +++--
 docs/src/tuning_models.md          | 120 ++++++++++++++++-------------
 src/MLJ.jl                         |   2 +-
 10 files changed, 129 insertions(+), 116 deletions(-)

diff --git a/docs/Project.toml b/docs/Project.toml
index 7c38b6b61..a1f37bd96 100755
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -26,6 +26,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
 ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
 StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
+StatisticalMeasuresBase = "c062fc1d-0d66-479b-b6ac-8b44719de4cc"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"
 
diff --git a/docs/make.jl b/docs/make.jl
index b89c7fc03..93a7dfd9a 100755
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -20,6 +20,7 @@ import Distributions
 using CategoricalArrays
 import CategoricalDistributions
 import StatisticalMeasures
+import StatisticalMeasuresBase
 
 const MMI = MLJModelInterface
 
@@ -115,7 +116,7 @@ makedocs(
         CategoricalDistributions,
         StatisticalMeasures],
     pages = pages,
-    warnonly = Documenter.except(:cross_references, :missing_docs),
+    warnonly = [:cross_references, :missing_docs],
 )
 
 @info "`makedocs` has finished running. "
 
diff --git a/docs/model_docstring_tools.jl b/docs/model_docstring_tools.jl
index f4e399f78..5885f7d4d 100644
--- a/docs/model_docstring_tools.jl
+++ b/docs/model_docstring_tools.jl
@@ -10,7 +10,7 @@ Removes `@ref` references from `str. For example, a substring of the form
 
 """
 function remove_doc_refs(page)
-    regex = r"\[(['\.\d`\!\_a-zA-Z]*)\]\(\@ref\)"
+    regex = r"\[([\?'\.\d`\!\_a-zA-Z]*)\]\(\@ref\)"
     while contains(page, regex)
         # replace the first matched regex with the captured string
        page = replace(page, regex => s"\1")
diff --git a/docs/src/common_mlj_workflows.md b/docs/src/common_mlj_workflows.md
index d3ce0bb3a..2b7cfaec9 100644
--- a/docs/src/common_mlj_workflows.md
+++ b/docs/src/common_mlj_workflows.md
@@ -458,7 +458,7 @@ pipe = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> knn_with_target
 Evaluating the pipeline (just as you would any other model):
 
 ```@example workflows
-pipe.one_hot_encoder.drop_last = true
+pipe.one_hot_encoder.drop_last = true # mutate a nested hyper-parameter
 evaluate(pipe, X, y, resampling=Holdout(), measure=RootMeanSquaredError(), verbosity=2)
 ```
 
diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md
index acda6f28b..5952cc6bb 100644
--- a/docs/src/getting_started.md
+++ b/docs/src/getting_started.md
@@ -268,17 +268,8 @@ evaluate!(mach, resampling=Holdout(fraction_train=0.7),
 
 ## Next steps
 
-To learn a little more about what MLJ can do, browse [Common MLJ
-Workflows](common_mlj_workflows.md) or [Data Science Tutorials in
-Julia](https://alan-turing-institute.github.io/DataScienceTutorials.jl/)
-or try the [JuliaCon2020
-Workshop](https://github.com/ablaom/MachineLearningInJulia2020) on MLJ
-(recorded
-[here](https://www.youtube.com/watch?time_continue=27&v=qSWbCn170HU&feature=emb_title))
-returning to the manual as needed.
-
-*Read at least the remainder of this page before considering serious
-use of MLJ.*
+For next steps, consult the [Learn MLJ](@ref) section. *At the least, we recommend you
+read the remainder of this page before considering serious use of MLJ.*
 
 ## Data containers and scientific types
 
@@ -438,19 +429,9 @@ are the key features of that convention:
 - The scientific types of `nothing` and `missing` are `Nothing` and
   `Missing`, native types we also regard as scientific.
 
-Use `coerce(v, OrderedFactor)` or `coerce(v, Multiclass)` to coerce a
-vector `v` of integers, strings or characters to a vector with an
-appropriate `Finite` (categorical) scitype. See [Working with
-Categorical Data](@ref).
-
-For more on scitype coercion of arrays and tables, see [`coerce`](@ref),
-[`autotype`](@ref) and [`unpack`](@ref) below and the examples at
-[ScientificTypes.jl](https://JuliaAI.github.io/ScientificTypes.jl/dev/).
+Use `coerce(v, OrderedFactor)` or `coerce(v, Multiclass)` to coerce a vector `v` of
+integers, strings or characters to a vector with an appropriate `Finite` (categorical)
+scitype. See also [Working with Categorical Data](@ref), and the
+[ScientificTypes.jl](https://JuliaAI.github.io/ScientificTypes.jl/dev/) documentation.
 
-
-```@docs
-scitype
-coerce
-autotype
-```
diff --git a/docs/src/performance_measures.md b/docs/src/performance_measures.md
index f8d2697a7..699bf261f 100644
--- a/docs/src/performance_measures.md
+++ b/docs/src/performance_measures.md
@@ -7,17 +7,28 @@
 
 - [Migration guide for changes to measures in MLJBase 1.0](@ref)
 
-## Introduction
+## Introduction
 
 In MLJ loss functions, scoring rules, confusion matrices, sensitivities, etc, are
 collectively referred to as *measures*.
These measures are provided by the package [StatisticalMeasures.jl](https://juliaai.github.io/StatisticalMeasures.jl/dev/) but are -immediately available to the MLJ user. Commonly, measures are passed to MLJ -meta-algorithms (see [Uses of measures](@ref) below) but to learn how to call measures -directly, see the StatisticalMeasures.jl +immediately available to the MLJ user. Here's a simple example of direct application of +the `log_loss` measures to compute a training loss: + +```@example measures +using MLJ +X, y = @load_iris +DecisionTreeClassifier = @load DecisionTreeClassifier pkg=DecisionTree +tree = DecisionTreeClassifier(max_depth=2) +mach = machine(tree, X, y) |> fit! +yhat = predict(mach, X) +log_loss(yhat, y) +``` + +For more examples of direct measure usage, see the StatisticalMeasures.jl [tutorial](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/). -A list of all measures ready to use after running `using MLJ` or `using +A list of all measures, ready to use after running `using MLJ` or `using StatisticalMeasures`, is [here](https://juliaai.github.io/StatisticalMeasures.jl/dev/auto_generated_list_of_measures/). Alternatively, call [`measures()`](@ref StatisticalMeasures.measures) (experimental) to generate a @@ -36,7 +47,7 @@ multi-target measure using this package. ## Uses of measures -In MLJ, measures are specified: +In MLJ, measures are specified: - when evaluating model performance using [`evaluate!`](@ref)/[`evaluate`](@ref) - see [Evaluating Model Performance](@ref) @@ -47,7 +58,7 @@ In MLJ, measures are specified: and elsewhere. -## Using LossFunctions.jl +## Using LossFunctions.jl In previous versions of MLJ, measures from LossFunctions.jl were also available. Now measures from that package must be explicitly imported and wrapped, as described @@ -58,7 +69,7 @@ measures from that package must be explicitly imported and wrapped, as described A related performance evaluation tool provided by StatisticalMeasures.jl, and hence by MLJ, is the `roc_curve` method: ```@docs -StatisticalMeausures.roc_curve +StatisticalMeasures.roc_curve ``` ## Migration guide for changes to measures in MLJBase 1.0 @@ -69,27 +80,27 @@ MLJBase.jl (a dependency of MLJ.jl) but now they are provided by MLJ.jl dependen on users are detailed below: -### Breaking behavior relevant to many users +### Breaking behavior likely relevant to many users - If `using MLJBase` without MLJ, then, in Julia 1.9 or higher, `StatisticalMeasures` must be explicitly imported to use measures that were previously part of MLJBase. If `using MLJ`, then all previous measures are still available, with the exception of those - corresponding to LossFunctions.jl (see below). + corresponding to LossFunctions.jl (see below). - All measures return a *single* aggregated measurement. In other words, measures previously reporting a measurement *per-observation* (previously subtyping `Unaggregated`) no longer do so. To get per-observation measurements, use the new method `StatisticalMeasures.measurements(measure, ŷ, y[, weights, class_weights])`. - + - The default measure for regression models (used in `evaluate/evaluate!` when `measures` is unspecified) is changed from `rms` to `l2=LPLoss(2)` (mean sum of squares). - -- `MeanAbsoluteError` has been removed and instead `mae` is an alias for `LPLoss(p=1)`. + +- `MeanAbsoluteError` has been removed and instead `mae` is an alias for `LPLoss(p=1)`. - Measures that previously skipped `NaN` values will now (at least by default) propagate those values. 
Missing value behavior is unchanged, except some measures that previously did not support `missing` now do. - + - Aliases for measure *types* have been removed. For example `RMSE` (alias for `RootMeanSquaredError`) is gone. Aliases for instances, such as `rms` and `cross_entropy` persist. The exception is `precision`, for which `ppv` can @@ -107,7 +118,7 @@ on users are detailed below: `measures(predicate)` is decommissioned, but `measures(needle)` is preserved. (This method, owned by StatisticalMeasures.jl, has some other search options, but is experimental.) - + - Measures that were wraps of losses from LossFunctions.jl are no longer exposed by MLJBase or MLJ. To use such a loss, you must explicitly `import LossFunctions` and wrap the loss appropriately. See [Using losses from @@ -124,8 +135,8 @@ on users are detailed below: supported. See [What is a measure?](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions) for allowed signatures in measures. - -### Packages implementing the MLJ model interface + +### Packages implementing the MLJ model interface The migration of measures is not expected to require any changes to the source code in packges providing implementations of the MLJ model interface (MLJModelInterface.jl) such @@ -135,17 +146,17 @@ measures. The following should generally suffice to adapt such tests: - Add StatisticalMeasures as test dependency, and add `using StatisticalMeasures` to your `runtests.jl` (and/or included submodules). - + - If measures are qualified, as in `MLJBase.rms`, then the qualification must be removed or changed to `StatisticalMeasures.rms`, etc. - Be aware that the default measure used in methods such as `evaluate!`, when `measure` is not specified, is changed from `rms` to `l2` for regression models. - + - Be aware of that all measures now report a measurement for every observation, and never an aggregate. See second point above. -### Rarely relevant breaking behavior +### Breaking behavior possibly relevant to some developers - The abstract measure types `Aggregated`, `Unaggregated`, `Measure` have been decommissioned. (A measure is now defined purely by its [calling @@ -156,7 +167,7 @@ measures. The following should generally suffice to adapt such tests: - `target_scitype(measure)` is decommissioned. Related is `StatisticalMeasures.observation_scitype(measure)` which declares an upper bound on the allowed scitype *of a single observation*. - + - `prediction_type(measure)` is decommissioned. Instead use `StatisticalMeasures.kind_of_proxy(measure)`. diff --git a/docs/src/target_transformations.md b/docs/src/target_transformations.md index d39aa0423..63203f780 100644 --- a/docs/src/target_transformations.md +++ b/docs/src/target_transformations.md @@ -53,11 +53,11 @@ meaningfully compare the corresponding mean absolute errors, which are indeed di this case. ```@example 123 -evaluate(ridge, X, y, measure=mae) +evaluate(ridge, X, y, measure=l1) ``` ```@example 123 -evaluate(ridge2, X, y, measure=mae) +evaluate(ridge2, X, y, measure=l1) ``` Ordinary functions can also be used in target transformations but an @@ -66,11 +66,11 @@ inverse must be explicitly specified: ```@example 123 ridge3 = TransformedTargetModel(ridge, transformer=y->log.(y), inverse=z->exp.(z)) X, y = @load_boston -evaluate(ridge3, X, y, measure=mae) +evaluate(ridge3, X, y, measure=l1) ``` -Without the log transform (ie, using `ridge`) we get the poorer -`mae` of 3.9. 
+Without the log transform (ie, using `ridge`) we get the poorer mean absolute error, +`l1`, of 3.9. ```@docs TransformedTargetModel diff --git a/docs/src/transformers.md b/docs/src/transformers.md index 911fba82a..f03cdb92f 100644 --- a/docs/src/transformers.md +++ b/docs/src/transformers.md @@ -81,8 +81,9 @@ transform(mach, [1, 2, 3], [3, 2, 1]) ``` Let's see how we can include our `Averager` in a [learning network](@ref "Learning -Networks")) to mix the predictions of two regressors, with one-hot encoding of the -inputs. Here's some dummy data and component models to test our learning network: +Networks") to mix the predictions of two regressors, with one-hot encoding of the +inputs. Here's two regressors for mixing, and some dummy data for testing our learning +network: ```@example boots ridge = (@load RidgeRegressor pkg=MultivariateStats)() @@ -138,13 +139,14 @@ end ``` As described in [Learning Networks](@ref), we next paste the learning network into a -`prefit` declaration, removing the test data and replacing the component models with -symbolic placeholders: +`prefit` declaration, replace the component models with symbolic placeholders, and add a +learning network "interface": ```@example boots +import MLJBase function MLJBase.prefit(composite::DoubleRegressor, verbosity, X, y) - Xs = source() - ys = source() + Xs = source(X) + ys = source(y) mach0 = machine(OneHotEncoder(), Xs) W = transform(mach0, Xs) # one-hot encode the input @@ -157,19 +159,22 @@ function MLJBase.prefit(composite::DoubleRegressor, verbosity, X, y) mach4= machine(:averager) yhat = transform(mach4, y1, y2) + + # learning network interface: + (; predict=yhat) end ``` The new model type can be evaluated like any other supervised model: ```@example boots -X, y = @load_reduced_ames +X, y = @load_reduced_ames; composite = DoubleRegressor(ridge, knn, Averager(0.5)) ``` ```@example boots composite.averager.mix = 0.25 # adjust mix from default of 0.5 -evaluate(composite, X, y, measures=[l1, rmslp1]) +evaluate(composite, X, y, measure=l1) ``` A static transformer can also expose byproducts of the transform computation in the report diff --git a/docs/src/tuning_models.md b/docs/src/tuning_models.md index 76f234f18..e0c57c961 100644 --- a/docs/src/tuning_models.md +++ b/docs/src/tuning_models.md @@ -67,11 +67,13 @@ one-dimensional range object constructed using the `range` method: ```@example goof r = range(tree, :min_purity_increase, lower=0.001, upper=1.0, scale=:log); -self_tuning_tree = TunedModel(model=tree, - resampling=CV(nfolds=3), - tuning=Grid(resolution=10), - range=r, - measure=rms); +self_tuning_tree = TunedModel( + model=tree, + resampling=CV(nfolds=3), + tuning=Grid(resolution=10), + range=r, + measure=rms +); ``` Incidentally, a grid is generated internally "over the range" by calling the @@ -171,11 +173,13 @@ deterministic measure, such as `misclassification_rate` (which means **Case (i) - probabilistic measure**: ```@example goof -self_tuning_knn = TunedModel(model=knn, - resampling = CV(nfolds=4, rng=1234), - tuning = Grid(resolution=5), - range = K_range, - measure=BrierLoss()); +self_tuning_knn = TunedModel( + model=knn, + resampling = CV(nfolds=4, rng=1234), + tuning = Grid(resolution=5), + range = K_range, + measure=BrierLoss() +); mach = machine(self_tuning_knn, X, y); fit!(mach, verbosity=0); @@ -184,11 +188,13 @@ fit!(mach, verbosity=0); **Case (ii) - deterministic measure**: ```@example goof -self_tuning_knn = TunedModel(model=knn, - resampling = CV(nfolds=4, rng=1234), - tuning = 
Grid(resolution=5), - range = K_range, - measure=MisclassificationRate()) +self_tuning_knn = TunedModel( + model=knn, + resampling = CV(nfolds=4, rng=1234), + tuning = Grid(resolution=5), + range = K_range, + measure=MisclassificationRate() +) mach = machine(self_tuning_knn, X, y); fit!(mach, verbosity=0); @@ -237,12 +243,14 @@ For full details on constructing custom measures, see ```@example goof -self_tuning_knn = TunedModel(model=knn, - resampling = CV(nfolds=4), - tuning = Grid(resolution=5), - range = K_range, - measure = [custom_accuracy, MulticlassFScore()], - operation = predict_mode); +self_tuning_knn = TunedModel( + model=knn, + resampling = CV(nfolds=4), + tuning = Grid(resolution=5), + range = K_range, + measure = [custom_accuracy, MulticlassFScore()], + operation = predict_mode +); mach = machine(self_tuning_knn, X, y) fit!(mach, verbosity=0) @@ -270,11 +278,12 @@ points: ```@example goof r1 = range(forest, :(model.n_subfeatures), lower=1, upper=9); r2 = range(forest, :bagging_fraction, lower=0.4, upper=1.0); -self_tuning_forest = TunedModel(model=forest, - tuning=Grid(goal=30), - resampling=CV(nfolds=6), - range=[r1, r2], - measure=rms); +self_tuning_forest = TunedModel( + model=forest, + tuning=Grid(goal=30), + resampling=CV(nfolds=6), + range=[r1, r2], + measure=rms); X = MLJ.table(rand(100, 10)); y = 2X.x1 - X.x2 + 0.05*rand(100); @@ -302,12 +311,14 @@ be limited to 25. ```@example goof tuning = Grid(resolution=100, shuffle=true, rng=1234) -self_tuning_forest = TunedModel(model=forest, - tuning=tuning, - resampling=CV(nfolds=6), - range=[(r1, 3), r2], - measure=rms, - n=25); +self_tuning_forest = TunedModel( + model=forest, + tuning=tuning, + resampling=CV(nfolds=6), + range=[(r1, 3), r2], + measure=rms, + n=25 +); fit!(machine(self_tuning_forest, X, y), verbosity=0); ``` @@ -323,12 +334,14 @@ distribution by default, and all others using a (truncated) normal distribution. 
```@example goof -self_tuning_forest = TunedModel(model=forest, - tuning=RandomSearch(), - resampling=CV(nfolds=6), - range=[r1, r2], - measure=rms, - n=25); +self_tuning_forest = TunedModel( + model=forest, + tuning=RandomSearch(), + resampling=CV(nfolds=6), + range=[r1, r2], + measure=rms, + n=25 +); X = MLJ.table(rand(100, 10)); y = 2X.x1 - X.x2 + 0.05*rand(100); mach = machine(self_tuning_forest, X, y); @@ -375,12 +388,14 @@ For this illustration we'll add a third, nominal, hyper-parameter: ```@example goof r3 = range(forest, :(model.post_prune), values=[true, false]); -self_tuning_forest = TunedModel(model=forest, - tuning=latin, - resampling=CV(nfolds=6), - range=[r1, r2, r3], - measure=rms, - n=25); +self_tuning_forest = TunedModel( + model=forest, + tuning=latin, + resampling=CV(nfolds=6), + range=[r1, r2, r3], + measure=rms, + n=25 +); mach = machine(self_tuning_forest, X, y); fit!(mach, verbosity=0) ``` @@ -411,10 +426,12 @@ The following model is equivalent to the best in `models` by using 3-fold cross-validation: ```@example goof -multi_model = TunedModel(models=models, - resampling=CV(nfolds=3), - measure=log_loss, - check_measure=false) +multi_model = TunedModel( + models=models, + resampling=CV(nfolds=3), + measure=log_loss, + check_measure=false +) nothing # hide ``` @@ -426,10 +443,7 @@ evaluated 2 x 3 times): ```@example goof X, y = make_blobs() -e = evaluate(multi_model, X, y, - resampling=CV(nfolds=2), - measure=log_loss, - verbosity=6) +e = evaluate(multi_model, X, y, resampling=CV(nfolds=2), measure=log_loss, verbosity=6) ``` Now, for example, we can get the best model for the first fold out of @@ -453,7 +467,7 @@ For example, for the first fold of the outer loop and the second model: e.report_per_fold[2].history[1] ``` -## API +## Reference ```@docs MLJBase.range diff --git a/src/MLJ.jl b/src/MLJ.jl index f126b1b1b..b2935eac1 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -12,7 +12,7 @@ loaded separately: checking how MLJ interprets your data - StatisticalMeasures.jl: MLJ-compatible measures (metrics) for machine learning, - confusion matrices, ROC curves + confusion matrices, ROC curves. - MLJModels.jl: Common transformers for data preprocessing, searching the model registry, loading models with `@load` From 67e905e98ad27c572df3fe73a5dfdd5a7eba782f Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 29 Sep 2023 11:09:10 +1300 Subject: [PATCH 16/16] doc tweak to Logging docs --- docs/src/logging_workflows.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/src/logging_workflows.md b/docs/src/logging_workflows.md index 7f4b468cd..8eda54f4f 100644 --- a/docs/src/logging_workflows.md +++ b/docs/src/logging_workflows.md @@ -5,8 +5,11 @@ [MLflow](https://mlflow.org) is a popular, language-agnostic, tool for externally logging the outcomes of machine learning experiments, including those carried out using MLJ. -This functionality is provided by the [MLJFlow.jl](https://github.com/JuliaAI/MLJFlow.jl) -package whose methods are automatically available to MLJ users. Refer to the package's -documentation for examples. +MLJ logging examples are given in the [MLJFlow.jl](https://github.com/JuliaAI/MLJFlow.jl) +documentation. MLJ includes and re-exports all the methods of MLJFlow.jl, so there is no +need to import MLJFlow.jl if `using MLJ`. +!!! warning + + MLJFlow.jl is a new package still under active development and should be regarded as experimental. 
At this time, breaking changes to MLJFlow.jl will not necessarily trigger new breaking releases of MLJ.jl.
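For orientation, a minimal logging sketch along the lines described above — it assumes a local MLflow tracking server is already running at the address shown, and that the `MLFlowLogger` constructor and the `logger` keyword of `evaluate` behave as in the current MLJFlow.jl and MLJBase.jl releases; consult those packages for the authoritative API:

```julia
using MLJ

# assumed local tracking server, e.g. one started beforehand with `mlflow server`
logger = MLFlowLogger("http://127.0.0.1:5000"; experiment_name="MLJ demo")

X, y = make_regression(100, 3)  # synthetic regression data
model = (@load RidgeRegressor pkg=MultivariateStats verbosity=0)()

# measurements are reported as usual and also logged to the MLflow server:
evaluate(model, X, y; resampling=CV(nfolds=3), measure=l2, logger=logger)
```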