
Commit 9438597

Merge tag 'v0.23.2' into graph-nodes

[Diff since v0.23.1](v0.23.1...v0.23.2)

**Merged pull requests:**

- Formatting overhaul (#278) (@MilesCranmer)
- Avoid julia-formatter on pre-commit.ci (#279) (@MilesCranmer)
- Make it easier to select expression from Pareto front for evaluation (#289) (@MilesCranmer)

**Closed issues:**

- Garbage collection too passive on worker processes (#237)
- How can I set the maximum number of nests? (#285)

2 parents: da938ce + e18c742

File tree: 4 files changed (+119 -71 lines)

Project.toml (1 addition & 1 deletion)

````diff
@@ -1,7 +1,7 @@
 name = "SymbolicRegression"
 uuid = "8254be44-1295-4e6a-a16d-46603ac705cb"
 authors = ["MilesCranmer <[email protected]>"]
-version = "0.23.1"
+version = "0.23.2"
 
 [deps]
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
````

README.md (17 additions & 10 deletions)

````diff
@@ -1,24 +1,24 @@
+<!-- prettier-ignore-start -->
 <div align="center">
 
 SymbolicRegression.jl searches for symbolic expressions which optimize a particular objective.
 
 https://github.com/MilesCranmer/SymbolicRegression.jl/assets/7593028/f5b68f1f-9830-497f-a197-6ae332c94ee0
 
-<!-- prettier-ignore-start -->
 | Latest release | Documentation | Forums | Paper |
 | :---: | :---: | :---: | :---: |
 | [![version](https://juliahub.com/docs/SymbolicRegression/version.svg)](https://juliahub.com/ui/Packages/SymbolicRegression/X2eIS) | [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://astroautomata.com/SymbolicRegression.jl/dev/) | [![Discussions](https://img.shields.io/badge/discussions-github-informational)](https://github.com/MilesCranmer/PySR/discussions) | [![Paper](https://img.shields.io/badge/arXiv-2305.01582-b31b1b)](https://arxiv.org/abs/2305.01582) |
 
 | Build status | Coverage |
 | :---: | :---: |
 | [![CI](https://github.com/MilesCranmer/SymbolicRegression.jl/workflows/CI/badge.svg)](.github/workflows/CI.yml) | [![Coverage Status](https://coveralls.io/repos/github/MilesCranmer/SymbolicRegression.jl/badge.svg?branch=master)](https://coveralls.io/github/MilesCranmer/SymbolicRegression.jl?branch=master)<br>[![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl) |
-<!-- prettier-ignore-end >
 
 Check out [PySR](https://github.com/MilesCranmer/PySR) for
 a Python frontend.
 [Cite this software](https://arxiv.org/abs/2305.01582)
 
 </div>
+<!-- prettier-ignore-end >
 
 **Contents**:
 
@@ -153,16 +153,20 @@ predict(mach, X)
 ```
 
 This will make predictions using the expression
-selected using the function passed to `selection_method`.
-By default this selection is made a mix of accuracy and complexity.
-For example, we can make predictions using expression 2 with:
+selected by `model.selection_method`,
+which by default is a mix of accuracy and complexity.
+
+You can override this selection and select an equation from
+the Pareto front manually with:
 
 ```julia
-mach.model.selection_method = Returns(2)
-predict(mach, X)
+predict(mach, (data=X, idx=2))
 ```
 
-For fitting multiple outputs, one can use `MultitargetSRRegressor`.
+where here we choose to evaluate the second equation.
+
+For fitting multiple outputs, one can use `MultitargetSRRegressor`
+(and pass an array of indices to `idx` in `predict` for selecting specific equations).
 For a full list of options available to each regressor, see the [API page](https://astroautomata.com/SymbolicRegression.jl/dev/api/).
@@ -223,9 +227,12 @@ The `output` array will contain the result of the tree at each of the 100 rows.
 This `did_succeed` flag detects whether an evaluation was successful, or whether
 encountered any NaNs or Infs during calculation (such as, e.g., `sqrt(-1)`).
 
-## Constructing trees
+## Constructing expressions
+
+Expressions are represented as the `Node` type which is developed
+in the [DynamicExpressions.jl](https://github.com/SymbolicML/DynamicExpressions.jl/) package.
 
-You can also manipulate and construct trees directly. For example:
+You can manipulate and construct expressions directly. For example:
 
 ```julia
 import SymbolicRegression: Options, Node, eval_tree_array
````
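As context for the README change above, here is a minimal end-to-end sketch of the new prediction workflow. The data, operator choices, and `niterations` value are illustrative, not taken from this commit:

```julia
using MLJ, SymbolicRegression

# Illustrative toy data (names and values are hypothetical).
X = (a=rand(100), b=rand(100))
y = @. 2.0 * cos(X.a * 23.5) - X.b^2

model = SRRegressor(binary_operators=[+, -, *], unary_operators=[cos], niterations=30)
mach = machine(model, X, y)
fit!(mach)

# Default: the equation picked by `selection_method` (a mix of accuracy and complexity).
yhat = predict(mach, X)

# New in v0.23.2: evaluate a specific equation from the Pareto front by index.
yhat2 = predict(mach, (data=X, idx=2))
```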

src/MLJInterface.jl (70 additions & 60 deletions)

````diff
@@ -274,22 +274,12 @@ function prediction_warn()
     @warn "Evaluation failed either due to NaNs detected or due to unfinished search. Using 0s for prediction."
 end
 
-@inline function wrap_units(v, y_units, i::Integer)
-    if y_units === nothing
-        return v
-    else
-        return (yi -> Quantity(yi, y_units[i])).(v)
-    end
-end
-@inline function wrap_units(v, y_units, ::Nothing)
-    if y_units === nothing
-        return v
-    else
-        return (yi -> Quantity(yi, y_units)).(v)
-    end
-end
+wrap_units(v, ::Nothing, ::Integer) = v
+wrap_units(v, ::Nothing, ::Nothing) = v
+wrap_units(v, y_units, i::Integer) = (yi -> Quantity(yi, y_units[i])).(v)
+wrap_units(v, y_units, ::Nothing) = (yi -> Quantity(yi, y_units)).(v)
 
-function prediction_fallback(::Type{T}, m::SRRegressor, Xnew_t, fitresult) where {T}
+function prediction_fallback(::Type{T}, ::SRRegressor, Xnew_t, fitresult, _) where {T}
     prediction_warn()
     out = fill!(similar(Xnew_t, T, axes(Xnew_t, 2)), zero(T))
     return wrap_units(out, fitresult.y_units, nothing)
````
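The `wrap_units` rewrite replaces runtime `=== nothing` checks with method dispatch on `::Nothing`, so the no-units case is selected by the method table rather than a branch. A self-contained sketch of the same pattern, with illustrative names that are not part of the package:

```julia
# Dispatch-on-Nothing pattern: the `nothing` case gets its own method,
# so no `if units === nothing` branch is needed in the hot path.
scale_by(v, ::Nothing) = v                      # no factor: pass through unchanged
scale_by(v, factor) = (x -> factor * x).(v)     # factor present: broadcast it

scale_by([1.0, 2.0], nothing)  # -> [1.0, 2.0]
scale_by([1.0, 2.0], 3.0)      # -> [3.0, 6.0]
```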
````diff
@@ -303,11 +293,11 @@ function prediction_fallback(
             fill!(similar(Xnew_t, T, axes(Xnew_t, 2)), zero(T)), fitresult.y_units, i
         ) for i in 1:(fitresult.num_targets)
     ]
-    out_matrix = reduce(hcat, out_cols)
+    out_matrix = hcat(out_cols...)
     if !fitresult.y_is_table
         return out_matrix
     else
-        return MMI.table(out_matrix; names=fitresult.y_variable_names, prototype=prototype)
+        return MMI.table(out_matrix; names=fitresult.y_variable_names, prototype)
     end
 end
````
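The `prototype=prototype` to `prototype` change in this hunk uses Julia's keyword-argument shorthand: when a local variable shares the keyword's name, the value may be omitted. A small illustration with a hypothetical stand-in for `MMI.table`:

```julia
# Hypothetical stand-in, only to demonstrate the shorthand.
make_table(m; names=nothing, prototype=nothing) = (m, names, prototype)

names = [:y1, :y2]
prototype = (y1=[0.0], y2=[0.0])

# These two calls are equivalent: `prototype` alone expands to `prototype=prototype`.
make_table([1 2; 3 4]; names, prototype)
make_table([1 2; 3 4]; names=names, prototype=prototype)
```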

````diff
@@ -344,50 +334,58 @@ function MMI.fitted_params(m::AbstractSRRegressor, fitresult)
     )
 end
 
-function MMI.predict(m::SRRegressor, fitresult, Xnew)
-    params = full_report(m, fitresult; v_with_strings=Val(false))
-    Xnew_t, variable_names, X_units = get_matrix_and_info(Xnew, m.dimensions_type)
-    T = promote_type(eltype(Xnew_t), fitresult.types.T)
-    if length(params.equations) == 0
-        return prediction_fallback(T, m, Xnew_t, fitresult)
-    end
-    X_units_clean = clean_units(X_units)
-    validate_variable_names(variable_names, fitresult)
-    validate_units(X_units_clean, fitresult.X_units)
-    eq = params.equations[params.best_idx]
-    out, completed = eval_tree_array(eq, Xnew_t, fitresult.options)
-    if !completed
-        return prediction_fallback(T, m, Xnew_t, fitresult)
+function eval_tree_mlj(
+    tree::Node, X_t, m::AbstractSRRegressor, ::Type{T}, fitresult, i, prototype
+) where {T}
+    out, completed = eval_tree_array(tree, X_t, fitresult.options)
+    if completed
+        return wrap_units(out, fitresult.y_units, i)
     else
-        return wrap_units(out, fitresult.y_units, nothing)
+        return prediction_fallback(T, m, X_t, fitresult, prototype)
     end
 end
-function MMI.predict(m::MultitargetSRRegressor, fitresult, Xnew)
+
+function MMI.predict(m::M, fitresult, Xnew; idx=nothing) where {M<:AbstractSRRegressor}
+    if Xnew isa NamedTuple && (haskey(Xnew, :idx) || haskey(Xnew, :data))
+        @assert(
+            haskey(Xnew, :idx) && haskey(Xnew, :data) && length(keys(Xnew)) == 2,
+            "If specifying an equation index during prediction, you must use a named tuple with keys `idx` and `data`."
+        )
+        return MMI.predict(m, fitresult, Xnew.data; idx=Xnew.idx)
+    end
+
     params = full_report(m, fitresult; v_with_strings=Val(false))
     prototype = MMI.istable(Xnew) ? Xnew : nothing
     Xnew_t, variable_names, X_units = get_matrix_and_info(Xnew, m.dimensions_type)
     T = promote_type(eltype(Xnew_t), fitresult.types.T)
+
+    if isempty(params.equations) || any(isempty, params.equations)
+        @warn "Equations not found. Returning 0s for prediction."
+        return prediction_fallback(T, m, Xnew_t, fitresult, prototype)
+    end
+
     X_units_clean = clean_units(X_units)
     validate_variable_names(variable_names, fitresult)
     validate_units(X_units_clean, fitresult.X_units)
-    equations = params.equations
-    if any(t -> length(t) == 0, equations)
-        return prediction_fallback(T, m, Xnew_t, fitresult, prototype)
-    end
-    best_idx = params.best_idx
-    outs = []
-    for (i, (best_i, eq)) in enumerate(zip(best_idx, equations))
-        out, completed = eval_tree_array(eq[best_i], Xnew_t, fitresult.options)
-        if !completed
-            return prediction_fallback(T, m, Xnew_t, fitresult, prototype)
+
+    idx = idx === nothing ? params.best_idx : idx
+
+    if M <: SRRegressor
+        return eval_tree_mlj(
+            params.equations[idx], Xnew_t, m, T, fitresult, nothing, prototype
+        )
+    elseif M <: MultitargetSRRegressor
+        outs = [
+            eval_tree_mlj(
+                params.equations[i][idx[i]], Xnew_t, m, T, fitresult, i, prototype
+            ) for i in eachindex(idx, params.equations)
+        ]
+        out_matrix = reduce(hcat, outs)
+        if !fitresult.y_is_table
+            return out_matrix
+        else
+            return MMI.table(out_matrix; names=fitresult.y_variable_names, prototype)
         end
-        push!(outs, wrap_units(out, fitresult.y_units, i))
-    end
-    out_matrix = reduce(hcat, outs)
-    if !fitresult.y_is_table
-        return out_matrix
-    else
-        return MMI.table(out_matrix; names=fitresult.y_variable_names, prototype=prototype)
     end
 end
````
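To make the new calling convention concrete, here is a stripped-down, runnable sketch of how the unified `predict` unwraps a `(data=..., idx=...)` named tuple and re-dispatches on itself. The helper names and `fitresult` contents are hypothetical stand-ins, not the package internals:

```julia
# Hypothetical stand-ins so the sketch runs on its own:
evaluate(eq, X) = eq.(X)
fitresult = (best_idx=3, equations=[x -> x, x -> x^2, x -> 2x])

function predict_sketch(fitresult, Xnew; idx=nothing)
    # A `(data=..., idx=...)` named tuple is unwrapped and re-dispatched,
    # mirroring the check at the top of the new `MMI.predict`:
    if Xnew isa NamedTuple && (haskey(Xnew, :idx) || haskey(Xnew, :data))
        @assert haskey(Xnew, :idx) && haskey(Xnew, :data) && length(keys(Xnew)) == 2
        return predict_sketch(fitresult, Xnew.data; idx=Xnew.idx)
    end
    # Otherwise fall back to the index chosen by the selection method:
    chosen = idx === nothing ? fitresult.best_idx : idx
    return evaluate(fitresult.equations[chosen], Xnew)
end

predict_sketch(fitresult, [1.0, 2.0])               # uses best_idx -> 2x
predict_sketch(fitresult, (data=[1.0, 2.0], idx=2)) # overrides   -> x^2
```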

````diff
@@ -508,11 +506,14 @@ function tag_with_docstring(model_name::Symbol, description::String, bottom_matt
     Note that if you pass complex data `::Complex{L}`, then the loss
     type will automatically be set to `L`.
 - `selection_method::Function`: Function to selection expression from
-    the Pareto frontier for use in `predict`. See `SymbolicRegression.MLJInterfaceModule.choose_best`
-    for an example. This function should return a single integer specifying
-    the index of the expression to use. By default, `choose_best` maximizes
+    the Pareto frontier for use in `predict`.
+    See `SymbolicRegression.MLJInterfaceModule.choose_best` for an example.
+    This function should return a single integer specifying
+    the index of the expression to use. By default, this maximizes
     the score (a pound-for-pound rating) of expressions reaching the threshold
-    of 1.5x the minimum loss. To fix the index at `5`, you could just write `Returns(5)`.
+    of 1.5x the minimum loss. To override this at prediction time, you can pass
+    a named tuple with keys `data` and `idx` to `predict`. See the Operations
+    section for details.
 - `dimensions_type::AbstractDimensions`: The type of dimensions to use when storing
     the units of the data. By default this is `DynamicQuantities.SymbolicDimensions`.
 """
@@ -523,6 +524,9 @@ function tag_with_docstring(model_name::Symbol, description::String, bottom_matt
 - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew`, which
     should have same scitype as `X` above. The expression used for prediction is defined
     by the `selection_method` function, which can be seen by viewing `report(mach).best_idx`.
+- `predict(mach, (data=Xnew, idx=i))`: Return predictions of the target given features
+    `Xnew`, which should have same scitype as `X` above. By passing a named tuple with keys
+    `data` and `idx`, you are able to specify the equation you wish to evaluate in `idx`.
 
 $(bottom_matter)
 """
@@ -583,7 +587,8 @@ eval(
     Note that unlike other regressors, symbolic regression stores a list of
     trained models. The model chosen from this list is defined by the function
     `selection_method` keyword argument, which by default balances accuracy
-    and complexity.
+    and complexity. You can override this at prediction time by passing a named
+    tuple with keys `data` and `idx`.
 
     """,
     r"^ " => "",
@@ -595,7 +600,8 @@ eval(
     The fields of `fitted_params(mach)` are:
 
     - `best_idx::Int`: The index of the best expression in the Pareto frontier,
-      as determined by the `selection_method` function.
+      as determined by the `selection_method` function. Override in `predict` by passing
+      a named tuple with keys `data` and `idx`.
     - `equations::Vector{Node{T}}`: The expressions discovered by the search, represented
       in a dominating Pareto frontier (i.e., the best expressions found for
       each complexity). `T` is equal to the element type
@@ -608,7 +614,8 @@ eval(
     The fields of `report(mach)` are:
 
     - `best_idx::Int`: The index of the best expression in the Pareto frontier,
-      as determined by the `selection_method` function.
+      as determined by the `selection_method` function. Override in `predict` by passing
+      a named tuple with keys `data` and `idx`.
     - `equations::Vector{Node{T}}`: The expressions discovered by the search, represented
       in a dominating Pareto frontier (i.e., the best expressions found for
       each complexity).
@@ -705,7 +712,8 @@ eval(
     Note that unlike other regressors, symbolic regression stores a list of lists of
     trained models. The models chosen from each of these lists is defined by the function
     `selection_method` keyword argument, which by default balances accuracy
-    and complexity.
+    and complexity. You can override this at prediction time by passing a named
+    tuple with keys `data` and `idx`.
 
     """,
     r"^ " => "",
@@ -717,7 +725,8 @@ eval(
     The fields of `fitted_params(mach)` are:
 
     - `best_idx::Vector{Int}`: The index of the best expression in each Pareto frontier,
-      as determined by the `selection_method` function.
+      as determined by the `selection_method` function. Override in `predict` by passing
+      a named tuple with keys `data` and `idx`.
     - `equations::Vector{Vector{Node{T}}}`: The expressions discovered by the search, represented
       in a dominating Pareto frontier (i.e., the best expressions found for
       each complexity). The outer vector is indexed by target variable, and the inner
@@ -731,7 +740,8 @@ eval(
     The fields of `report(mach)` are:
 
     - `best_idx::Vector{Int}`: The index of the best expression in each Pareto frontier,
-      as determined by the `selection_method` function.
+      as determined by the `selection_method` function. Override in `predict` by passing
+      a named tuple with keys `data` and `idx`.
     - `equations::Vector{Vector{Node{T}}}`: The expressions discovered by the search, represented
       in a dominating Pareto frontier (i.e., the best expressions found for
       each complexity). The outer vector is indexed by target variable, and the inner
````

test/test_mlj.jl (31 additions & 0 deletions)

````diff
@@ -43,8 +43,17 @@ end
     fit!(mach)
     rep = report(mach)
     @test occursin("a", rep.equation_strings[rep.best_idx])
+    ypred_good = predict(mach, X)
     @test sum(abs2, predict(mach, X) .- y) / length(y) < 1e-5
 
+    @testset "Check that we can choose the equation" begin
+        ypred_same = predict(mach, (data=X, idx=rep.best_idx))
+        @test ypred_good == ypred_same
+
+        ypred_bad = predict(mach, (data=X, idx=1))
+        @test ypred_good != ypred_bad
+    end
+
     @testset "Smoke test SymbolicUtils" begin
         eqn = node_to_symbolic(rep.equations[rep.best_idx], model)
         n = symbolic_to_node(eqn, model)
````
````diff
@@ -63,6 +72,28 @@ end
     @test all(
         eq -> occursin("a", eq), [rep.equation_strings[i][rep.best_idx[i]] for i in 1:3]
     )
+    ypred_good = predict(mach, X)
+
+    @testset "Test that we can choose the equation" begin
+        ypred_same = predict(mach, (data=X, idx=rep.best_idx))
+        @test ypred_good == ypred_same
+
+        ypred_bad = predict(mach, (data=X, idx=[1, 1, 1]))
+        @test ypred_good != ypred_bad
+
+        ypred_mixed = predict(mach, (data=X, idx=[rep.best_idx[1], 1, rep.best_idx[3]]))
+        @test ypred_mixed == hcat(ypred_good[:, 1], ypred_bad[:, 2], ypred_good[:, 3])
+
+        @test_throws AssertionError predict(mach, (data=X,))
+        VERSION >= v"1.8" &&
+            @test_throws "If specifying an equation index during" predict(
+                mach, (data=X,)
+            )
+        VERSION >= v"1.8" &&
+            @test_throws "If specifying an equation index during" predict(
+                mach, (X=X, idx=1)
+            )
+    end
 end
 
 @testset "Named outputs" begin
````
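A note on the `VERSION >= v"1.8"` guards in these tests: matching a message string with `@test_throws` is only supported from Julia 1.8; earlier versions can match only the exception type. A minimal illustration (the error text here is shortened):

```julia
using Test

fail() = error("If specifying an equation index during prediction, use keys `idx` and `data`.")

@test_throws ErrorException fail()                         # works on all supported Julia versions
VERSION >= v"1.8" && @test_throws "equation index" fail()  # message matching needs Julia >= 1.8
```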
