diff --git a/.cirrus.yml b/.cirrus.yml deleted file mode 100644 index 0a3671d..0000000 --- a/.cirrus.yml +++ /dev/null @@ -1,20 +0,0 @@ -freebsd_instance: - image_family: freebsd-14-0 -task: - name: FreeBSD - artifacts_cache: - folder: ~/.julia/artifacts - env: - matrix: - - JULIA_VERSION: 1.9 - - JULIA_VERSION: 1 - - JULIA_VERSION: nightly - allow_failures: $JULIA_VERSION == 'nightly' - install_script: - - sh -c "$(fetch https://raw.githubusercontent.com/ararslan/CirrusCI.jl/master/bin/install.sh -o -)" - build_script: - - cirrusjl build - test_script: - - cirrusjl test - coverage_script: - - cirrusjl coverage codecov diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..700707c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index f49313b..0cd3114 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -4,6 +4,22 @@ on: types: - created workflow_dispatch: + inputs: + lookback: + default: "3" +permissions: + actions: read + checks: read + contents: write + deployments: read + issues: read + discussions: read + packages: read + pages: read + pull-requests: read + repository-projects: read + security-events: read + statuses: read jobs: TagBot: if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ba475a2..d12f804 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,20 +1,47 @@ -name: Upload coverage reports to Codecov -on: [push, pull_request] +name: CI +on: + push: + branches: + - lumen-refactoring + tags: ['*'] + pull_request: + workflow_dispatch: +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. 
+ group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: - run: - runs-on: ubuntu-latest + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} + runs-on: ${{ matrix.os }} + timeout-minutes: 60 + permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created + actions: write + contents: read + strategy: + fail-fast: false + matrix: + version: + - '1' + - 'lts' + - 'pre' + os: + - ubuntu-latest + arch: + - x64 steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Set up Julia 1.9.0 - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: - version: "1.9.0" + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v5 with: + files: lcov.info token: ${{ secrets.CODECOV_TOKEN }} - slug: aclai-lab/SoleModels.jl - + fail_ci_if_error: false \ No newline at end of file diff --git a/Project.toml b/Project.toml index b224cf9..c927db9 100644 --- a/Project.toml +++ b/Project.toml @@ -2,33 +2,22 @@ name = "SoleModels" uuid = "4249d9c7-3290-4ddd-961c-e1d3ec2467f8" license = "MIT" authors = ["Michele GHIOTTI", "Giovanni PAGLIARINI", "Edoardo PONSANESI", "Eduard I. STAN"] -version = "0.10.0" +version = "0.10.1" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" -DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" FunctionWrappers = "069b7b12-0de2-55c6-9aab-29f3d0a68a2e" -Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" -HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" Lazy = "50d2b5c4-7a5e-59d5-8109-a42b560f39c0" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" -ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" -Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492" SoleData = "123f1ae1-6307-4526-ab5b-aab3a92a2b8c" SoleLogics = "b002da8f-3cb3-4d91-bbe3-2953433912b5" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb" -Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" ThreadSafeDicts = "4239201d-c60e-5e0a-9702-85d713665ba7" -ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" [weakdeps] DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" @@ -40,53 +29,48 @@ XGBoostExt = "XGBoost" [compat] AbstractTrees = "0.4" -BenchmarkTools = "1" -CSV = "0.10" CategoricalArrays = "0.10" -DataFrames = "1" -DataStructures = "0.18" -DecisionTree = "0.12" FillArrays = "1" FunctionWrappers = "1" -Graphs = "1.8" -HTTP = "1.9" IterTools = "1" -Lazy = "0.15.1" -MLJ = "0.19 - 0.20" -MLJBase = "1.6 - 1.7" -MLJDecisionTreeInterface = "0.4" -MLJModelInterface = "1.8" +Lazy = "0.15" PrettyTables = "2.2" -ProgressMeter = "1" -Random = "1" Reexport = "1" -Revise = "3" SoleBase = "0.13" -SoleData = "0.15, 0.16" +SoleData = "0.16" SoleLogics = "0.13" StatsBase = "0.30 - 0.34" -Suppressor = "0.2" -Tables = "1" ThreadSafeDicts = "0.1" -XGBoost = "2" -ZipFile = "0.10" julia = "1" [extras] 
-BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" -MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661" MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba" -PlutoUI = "7f904dfe-b85e-4ff6-b463-dae2292396a8" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SoleData = "123f1ae1-6307-4526-ab5b-aab3a92a2b8c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +XGBoost = "009559a3-9522-5dbb-924b-0b6ed2b22bb9" [targets] -test = ["Test", "DataFrames", "Random", "MLJ", "MLJXGBoostInterface", "MultiData", "Markdown", "InteractiveUtils", "BenchmarkTools", "MLJBase", "XGBoost", "DecisionTree", "MLJDecisionTreeInterface", "SoleData"] +test = [ + "DataFrames", + "DecisionTree", + "Distributed", + "InteractiveUtils", + "MLJ", + "MLJDecisionTreeInterface", + "MLJModelInterface", + "MLJXGBoostInterface", + "Markdown", + "MultiData", + "Random", + "Test", + "XGBoost" +] diff --git a/TODO.md b/TODO.md index e6e2383..04cfba6 100644 --- a/TODO.md +++ b/TODO.md @@ -51,6 +51,6 @@ Test: # Distribution of covered examples for consequent # Distribution of examples on which the rule was built ✔ Testing parser error @done(24-05-31 11:12) - + ☐ Add test for rule-extraction.jl Questions: - ✔ Readmetrics for CN2 statistics @done(24-05-31 11:12) \ No newline at end of file + ✔ Readmetrics for CN2 statistics @done(24-05-31 11:12) diff --git a/ext/DecisionTreeExt.jl b/ext/DecisionTreeExt.jl index de67fdb..1755fb5 100644 --- a/ext/DecisionTreeExt.jl +++ b/ext/DecisionTreeExt.jl @@ -9,7 +9,7 @@ import DecisionTree as DT function get_condition(featid, featval, featurenames) test_operator = (<) # @show fieldnames(typeof(tree)) - feature = !isnothing(featurenames) ? VariableValue(featurenames[featid]) : VariableValue(featid) + feature = isnothing(featurenames) ? VariableValue(featid) : VariableValue(featid, featurenames[featid]) return ScalarCondition(feature, test_operator, featval) end @@ -106,9 +106,16 @@ function SoleModels.solemodel( return m end -function SoleModels.solemodel(tree::DT.InfoNode; keep_condensed = false, featurenames = true, classlabels = tree.info.classlabels, kwargs...) +function SoleModels.solemodel( + tree::DT.InfoNode{T,orig_O}; + keep_condensed=false, + featurenames=true, + # classlabels=tree.info.classlabels, + kwargs... +) where {T,orig_O} # @show fieldnames(typeof(tree)) featurenames = featurenames == true ? tree.info.featurenames : featurenames + classlabels = haskey(tree.info, :classlabels) ? 
tree.info.classlabels : nothing root, info = begin if keep_condensed diff --git a/ext/MLJXGBoostInterfaceExt.jl b/ext/MLJXGBoostInterfaceExt.jl deleted file mode 100644 index caf64fe..0000000 --- a/ext/MLJXGBoostInterfaceExt.jl +++ /dev/null @@ -1,308 +0,0 @@ -module MLJXGBoostInterfaceExt - -import MLJModelInterface as MMI -import XGBoost as XGB -import Tables -using CategoricalArrays -using AbstractTrees - -import Sole: AbstractModel -import Sole: VariableValue, ScalarCondition, Atom, ConstantModel, Branch, DecisionTree - -const PKG = "MLJXGBoostInterface" - -abstract type XGBoostAbstractRegressor <: MMI.Deterministic end -abstract type XGBoostAbstractClassifier <: MMI.Probabilistic end - -const XGTypes = Union{XGBoostAbstractRegressor,XGBoostAbstractClassifier} - -struct TreePrinter{T} - tree::T - features::Vector{Symbol} -end -(c::TreePrinter)(depth) = AbstractTrees.print_tree(c.tree, depth, feature_names = c.features) -(c::TreePrinter)() = AbstractTrees.print_tree(c.tree, 5, feature_names = c.features) - -Base.show(stream::IO, c::TreePrinter) = - print(stream, "TreePrinter object (call with display depth)") - -function classes(y) - p = CategoricalArrays.pool(y) - [p[i] for i in 1:length(p)] -end - -# function modelexpr(name::Symbol, absname::Symbol, obj::AbstractString, objvalidate::Symbol) -function modelexpr(name::Symbol, absname::Symbol) - metric = absname == :XGBoostAbstractClassifier ? "mlogloss" : "rmse" - quote - MMI.@mlj_model mutable struct $name <: $absname - # MMI.@mlj_model mutable struct $name - # ref: https://xgboost.readthedocs.io/en/stable/parameter.html - # general parameters - booster::String = "gbtree" - # device::String = "cpu" - eval_metric::String = $metric - objective::Union{String, Nothing} = nothing - num_round::Int = 100::(_ ≥ 0) - early_stopping_rounds::Int = 0::(_ ≥ 0) - - - # parameters for tree booster - eta::Float64 = 0.3::(0.0 ≤ _ ≤ 1.0) - alpha::Float64 = 0::(_ ≥ 0) - gamma::Float64 = 0::(_ ≥ 0) - lambda::Float64 = 1::(_ ≥ 0) - - max_depth::Int = 6::(_ ≥ 0) - min_child_weight::Float64 = 1::(_ ≥ 0) - max_delta_step::Float64 = 0::(_ ≥ 0) - subsample::Float64 = 1::(0 < _ ≤ 1) - sampling_method::String = "uniform" - - colsample_bynode::Float64 = 1::(0 < _ ≤ 1) - colsample_bylevel::Float64 = 1::(0 < _ ≤ 1) - colsample_bytree::Float64 = 1::(0 < _ ≤ 1) - - tree_method::String = "auto" - - # scale_pos_weight::Float64 = 1.0 - end - - - # # additional parameters for dart booster - # one_drop::Union{Int,Bool} = 0::(0 ≤ _ ≤ 1) - # normalize_type::String = "tree" - # rate_drop::Float64 = 0::(0 ≤ _ ≤ 1) - # sample_type::String = "uniform" - # skip_drop::Float64 = 0::(0 ≤ _ ≤ 1) - - # # additional parameters for linear booster - # feature_selector::String = "cyclic" - # top_k::Int = 0::(_ ≥ 0) - - # # additional parameters for tweedie regression - # tweedie_variance_power::Float64 = 1.5::(1 < _ < 2) - - # # additional parameters for pseudo-huber - # # quantile_alpha TODO - - # # additional parameters for quantile loss - # # quantile_alpha TODO - - # # learning task parameters - # base_score::Float64 = 0.5 - - - # # test::Int = 1::(_ ≥ 0) - # # sketch_eps::Float64 = 0.03::(0 < _ < 1) - # # predictor::String = "cpu_predictor" - # # watchlist = nothing # if this is nothing we will not pass it so as to use default - # # importance_type::String = "gain" - # end - end -end - -# eval(modelexpr(:XGBoostClassifier, :XGBoostAbstractClassifier, "automatic", :validate_class_objective)) -# eval(modelexpr(:XGBoostCount, :XGBoostAbstractRegressor, "count:poisson", 
:validate_count_objective)) -# eval(modelexpr(:XGBoostRegressor, :XGBoostAbstractRegressor, "reg:squarederror", :validate_reg_objective)) - -eval(modelexpr(:XGBoostClassifier, :XGBoostAbstractClassifier)) -eval(modelexpr(:XGBoostCount, :XGBoostAbstractRegressor)) -eval(modelexpr(:XGBoostRegressor, :XGBoostAbstractRegressor)) - -MMI.reports_feature_importances(::Type{<:XGBoostAbstractRegressor}) = true -MMI.reports_feature_importances(::Type{<:XGBoostAbstractClassifier}) = true - -export XGBoostClassifier, XGBoostCount, XGBoostRegressor - -function MMI.fit( - m::XGBoostClassifier, - verbosity::Int, - X, - y, - features, - classes, - ) - - integers_seen = unique(y) - classes_seen = MMI.decoder(classes)(integers_seen) - - # dX = if isnothing(weight) - # XGB.DMatrix(X, y_code; feature_names=names(X)) - # # XGB.DMatrix(MMI.matrix(X), y_code) - # else - # XGB.DMatrix(X, y_code; feature_names=names(X), weight = weight) - # # XGB.DMatrix(MMI.matrix(X), y_code; feature_names=names(X), weight = weight) - # end - - # bst = xgboost(dm; kwargs(model, verbosity, objective)..., num_class...) - nclass = length(classes_seen) - if isnothing(m.objective) - m.objective = nclass == 2 ? "binary:logistic" : "multi:softprob" - end - - params = Dict((field, getfield(m, field)) for field in fieldnames(typeof(m))) - bst = XGB.xgboost((X, y.-1); verbosity=verbosity, params..., num_class=nclass) - - # imp = XGB.importancetable(bst) - ts = XGB.trees(bst) - - verbosity < 2 || AbstractTrees.print_tree(ts, m.max_depth) - - fitresult = (bst, classes_seen, integers_seen, features) - - cache = nothing - report = ( - classes_seen=nclass, - print_tree=TreePrinter(ts, features), - features=features, - ) - return fitresult, cache, report -end - -get_encoding(classes_seen) = Dict(MMI.int(c) => c for c in classes(classes_seen)) -classlabels(encoding) = [string(encoding[i]) for i in sort(keys(encoding) |> collect)] - -struct InfoXGBNode - node::XGB.Node - info::NamedTuple -end -AbstractTrees.nodevalue(n::InfoXGBNode) = n.node - -struct InfoXGBLeaf - node::XGB.Node - info::NamedTuple -end -AbstractTrees.nodevalue(l::InfoXGBLeaf) = l.node - -# struct InfoNode{S,T} <: AbstractTrees.AbstractNode{DecisionTree.Node{S,T}} -# node::DecisionTree.Node{S,T} -# info::NamedTuple -# end -# AbstractTrees.nodevalue(n::InfoNode) = n.node - -# struct InfoLeaf{T} <: AbstractTrees.AbstractNode{DecisionTree.Leaf{T}} -# leaf::DecisionTree.Leaf{T} -# info::NamedTuple -# end -# AbstractTrees.nodevalue(l::InfoLeaf) = l.leaf - -isleaf(node::XGB.Node) = isempty(node.children) ? true : false - -wrap(vecnode::Vector{<:XGB.Node}, info::NamedTuple=NamedTuple()) = MLJXGBoostInterface.wrap.(vecnode, Ref(info)) -# wrap(tree::DecisionTree.Root, info::NamedTuple=NamedTuple()) = wrap(tree.node, info) -wrap(node::XGB.Node, info::NamedTuple=NamedTuple()) = isleaf(node) ? 
InfoXGBLeaf(node, info) : InfoXGBNode(node, info) -# wrap(leaf::DecisionTree.Leaf, info::NamedTuple=NamedTuple()) = InfoLeaf(leaf, info) - -function MMI.fitted_params(::XGBoostAbstractClassifier, fitresult) - raw_tree = XGB.trees(fitresult[1]) - encoding = get_encoding(fitresult[2]) - features = fitresult[4] - classlabels = MLJXGBoostInterface.classlabels(encoding) - info = (featurenames=features, classlabels) - tree = MLJXGBoostInterface.wrap(raw_tree, info,) - (; tree, raw_tree, encoding, features) -end - -function AbstractTrees.children(node::InfoXGBNode) - (wrap(node.children[1], node.info), wrap(node.children[2], node.info)) -end -AbstractTrees.children(node::InfoXGBLeaf) = () - -# to get column names based on table access type: -_columnnames(X) = _columnnames(X, Val(Tables.columnaccess(X))) |> collect -_columnnames(X, ::Val{true}) = Tables.columnnames(Tables.columns(X)) -_columnnames(X, ::Val{false}) = Tables.columnnames(first(Tables.rows(X))) - -MMI.reformat(::XGBoostAbstractClassifier, X, y) = - (XGB.DMatrix(X), MMI.int(y), _columnnames(X), classes(y)) -# MMI.reformat(::Regressor, X, y) = -# (Tables.matrix(X), float(y), _columnnames(X)) -# MMI.selectrows(::TreeModel, I, Xmatrix, y, meta...) = -# (view(Xmatrix, I, :), view(y, I), meta...) - -split2id(str::String) = parse(Int, filter(isdigit, str)) + 1 - -function solemodel( - tree::Vector{<:InfoXGBNode}, - raw_tree::Vector{<:XGB.Node}, - encoding::Dict, - features::Vector{Symbol}; - kwargs... -) - dt = DecisionTree[] - @show encoding - for (i, t) in enumerate(tree) - idx = (i - 1) % length(encoding) + 1 - push!(dt, MLJXGBoostInterface.solemodel(t; majority=encoding[idx], kwargs...)) - end - - return dt -end -function solemodel(tree::InfoXGBNode, keep_condensed = false; majority, use_featurenames = true, kwargs...) - # @show fieldnames(typeof(tree)) - use_featurenames = use_featurenames ? tree.info.featurenames : false - root, info = begin - if keep_condensed - root = MLJXGBoostInterface.solemodel(tree.node; majority=majority, use_featurenames = use_featurenames, kwargs...) - info = (; - apply_preprocess=(y -> UInt32(findfirst(x -> x == y, tree.info.classlabels))), - apply_postprocess=(y -> tree.info.classlabels[y]), - ) - root, info - else - root = MLJXGBoostInterface.solemodel(tree.node; majority=majority, replace_classlabels = tree.info.classlabels, use_featurenames = use_featurenames, kwargs...) - info = (;) - root, info - end - end - - info = merge(info, (; - featurenames=tree.info.featurenames, - # - supporting_predictions=root.info[:supporting_predictions], - supporting_labels=root.info[:supporting_labels], - ) - ) - - return DecisionTree(root, info) -end - -function solemodel(tree::XGB.Node; majority, replace_classlabels = nothing, use_featurenames = false) - if isempty(tree.children) - # leaf - prediction = majority.ref - # labels = tree.leaf - # if !isnothing(replace_classlabels) - # prediction = replace_classlabels[prediction] - # labels = replace_classlabels[labels] - # end - # info = (; - # supporting_predictions = fill(prediction, length(labels)), - # supporting_labels = labels, - # ) - ### TODO - labels = [1,1,1,1] - info = (; - supporting_predictions = fill(prediction, length(labels)), - supporting_labels = labels, - ) - return ConstantModel(prediction, info) - else - # node - test_operator = (<) - # @show fieldnames(typeof(tree)) - feature = (use_featurenames != false) ? 
VariableValue(use_featurenames[split2id(tree.split)]) : VariableValue(split2id(tree.split)) - cond = ScalarCondition(feature, test_operator, tree.split_condition) - antecedent = Atom(cond) - lefttree = MLJXGBoostInterface.solemodel(tree.children[1]; majority=majority, replace_classlabels=replace_classlabels, use_featurenames=use_featurenames) - righttree = MLJXGBoostInterface.solemodel(tree.children[2]; majority=majority, replace_classlabels=replace_classlabels, use_featurenames=use_featurenames) - info = (; - supporting_predictions = [lefttree.info[:supporting_predictions]..., righttree.info[:supporting_predictions]...], - supporting_labels = [lefttree.info[:supporting_labels]..., righttree.info[:supporting_labels]...], - ) - return Branch(antecedent, lefttree, righttree, info) - end -end - -end \ No newline at end of file diff --git a/ext/XGBoostExt.jl b/ext/XGBoostExt.jl index ad346ae..fdf9639 100644 --- a/ext/XGBoostExt.jl +++ b/ext/XGBoostExt.jl @@ -3,10 +3,15 @@ module XGBoostExt using SoleModels using XGBoost +using CategoricalArrays + import SoleModels: alphabet, solemodel +# ---------------------------------------------------------------------------- # +# DecisionXGBoost alphabet # +# ---------------------------------------------------------------------------- # function alphabet(model::XGBoost.Booster; kwargs...) - error("TODO fix and test.") + # error("TODO fix and test.") function _alphabet!(a::Vector, model::XGBoost.Booster; kwargs...) return a end @@ -31,8 +36,8 @@ function alphabet(model::XGBoost.Booster; kwargs...) _alphabet!(Atom{ScalarCondition}[], model; kwargs...) end - # TODO fix and test. Problem: where are the tree weights? How do I write this in the multi-class case? +# leaf values are actually the weight of the tree # # Convert an XGBoost.Booster to a Sole Ensemble # function solemodel(model::XGBoost.Booster; with_stats::Bool = true, kwargs...) @@ -89,5 +94,166 @@ end # return Branch(antecedent, left_tree, right_tree, info) # end +function get_condition(featidstr, featval, featurenames; test_operator) + featid = parse(Int, featidstr[2:end]) + 1 # considering 0-based indexing in XGBoost feature ids + feature = isnothing(featurenames) ? VariableValue(featid) : VariableValue(featid, featurenames[featid]) + return ScalarCondition(feature, test_operator, featval) +end + +function get_condition(class_idx, featurenames; test_operator, featval) + feature = isnothing(featurenames) ? 
VariableValue(class_idx) : VariableValue(class_idx, featurenames[class_idx]) + return ScalarCondition(feature, test_operator, featval) +end + +get_operator(atom::Atom{<:ScalarCondition}) = atom.value.metacond.test_operator +get_i_variable(atom::Atom{<:ScalarCondition}) = atom.value.metacond.feature.i_variable +get_threshold(atom::Atom{<:ScalarCondition}) = atom.value.threshold + +function satisfies_conditions(row, formula) + all(atom -> get_operator(atom)( + row[get_i_variable(atom)], + get_threshold(atom)), formula + ) +end + +function bitmap_check_conditions(X, formula) + BitVector([satisfies_conditions(row, formula) for row in eachrow(X)]) +end + +function early_return(leaf, antecedent, clabel, classl) + info =(; + leaf_values=leaf, + supporting_predictions=clabel, + supporting_labels=[classl], + ) + + return Branch( + antecedent, + SoleModels.ConstantModel(first(clabel), info), + SoleModels.ConstantModel(first(clabel), info), + info + ) +end + +# ---------------------------------------------------------------------------- # +# DecisionXGBoost solemodel # +# ---------------------------------------------------------------------------- # +function SoleModels.solemodel( + model::Vector{<:XGBoost.Node}, + X::AbstractMatrix, + y::AbstractVector; + classlabels, + featurenames=nothing, + keep_condensed=false, + use_float32::Bool=true, + kwargs... +) + keep_condensed && error("Cannot keep condensed XGBoost.Node.") + + nclasses = length(classlabels) + + trees = map(enumerate(model)) do (i, t) + class_idx = (i - 1) % nclasses + 1 + clabels = categorical([classlabels[class_idx]]) + # xgboost trees could be composed of only one leaf, without any split + if isnothing(t.split) + antecedent = Atom(get_condition(class_idx, featurenames; test_operator=(<), featval=Inf)) + leaf = use_float32 ? Float32(t.leaf) : t.leaf + early_return(leaf, antecedent, clabels, classlabels[class_idx]) + else + SoleModels.solemodel(t, X, y; classlabels, class_idx, clabels, featurenames, use_float32, kwargs...) + end + end + + info = merge( + isnothing(featurenames) ? (;) : (;featurenames=featurenames), + (; + leaf_values = reduce(vcat, getindex.(getproperty.(trees, :info), :leaf_values)), + supporting_predictions = reduce(vcat, getindex.(getproperty.(trees, :info), :supporting_predictions)), + supporting_labels = reduce(vcat, getindex.(getproperty.(trees, :info), :supporting_labels)) + ) + ) + + return DecisionXGBoost(trees, info) +end + +""" + solemodel(tree::XGBoost.Node; fl=Formula[], fr=Formula[], classlabels=nothing, featurenames=nothing, keep_condensed=false) + +Traverses a learned XGBoost tree, collecting the path conditions for each branch. +Left paths (<) store conditions in `fl`, right paths (≥) store conditions in `fr`. +When reaching a leaf, calls `xgbleaf` with the path's collected conditions. +""" +function SoleModels.solemodel( + tree::XGBoost.Node, + X::AbstractMatrix, + y::AbstractVector; + classlabels, + class_idx, + clabels, + featurenames=nothing, + path_conditions=Formula[], + use_float32::Bool, +) +split_condition = use_float32 ? 
Float32(tree.split_condition) : tree.split_condition + antecedent = Atom(get_condition(tree.split, split_condition, featurenames; test_operator=(<))) + + # create a new path for the left branch + left_path = copy(path_conditions) + push!(left_path, Atom(get_condition(tree.split, split_condition, featurenames; test_operator=(<)))) + + # create a new path for the right branch + right_path = copy(path_conditions) + push!(right_path, Atom(get_condition(tree.split, split_condition, featurenames; test_operator=(≥)))) + + lefttree = if isnothing(tree.children[1].split) + # @show SoleModels.join_antecedents(left_path) + xgbleaf(tree.children[1], left_path, X, y; use_float32) + else + SoleModels.solemodel(tree.children[1], X, y; path_conditions=left_path, classlabels, class_idx, clabels, featurenames, use_float32) + end + + righttree = if isnothing(tree.children[2].split) + # @show SoleModels.join_antecedents(right_path) + xgbleaf(tree.children[2], right_path, X, y; use_float32) + else + SoleModels.solemodel(tree.children[2], X, y; path_conditions=right_path, classlabels, class_idx, clabels, featurenames, use_float32) + end + + info = (; + leaf_values = [lefttree.info[:leaf_values]..., righttree.info[:leaf_values]...], + supporting_predictions = [lefttree.info[:supporting_predictions]..., righttree.info[:supporting_predictions]...], + supporting_labels = [lefttree.info[:supporting_labels]..., righttree.info[:supporting_labels]...], + ) + return Branch(antecedent, lefttree, righttree, info) +end + +function xgbleaf( + leaf::XGBoost.Node, + formula::Vector{<:Formula}, + X::AbstractMatrix, + y::AbstractVector; + use_float32::Bool, +) + bitX = bitmap_check_conditions(X, formula) + + # this can happen when the split conditions don't match any instance + !any(bitX) && (bitX = trues(length(y))) + prediction = SoleModels.bestguess(y[bitX]; suppress_parity_warning=true) + + labels = unique(y) + + isnothing(prediction) && return nothing + + leaf_values = use_float32 ?
Float32(leaf.leaf) : leaf.leaf + + info = (; + leaf_values, + supporting_predictions = fill(prediction, length(labels)), + supporting_labels = labels, + ) + + return SoleModels.ConstantModel(prediction, info) +end end diff --git a/pluto-demo.jl b/pluto-demo.jl index ce978a6..095231b 100644 --- a/pluto-demo.jl +++ b/pluto-demo.jl @@ -1,13 +1,13 @@ ### A Pluto.jl notebook ### # v0.19.38 -using Markdown -using InteractiveUtils +# using Markdown +# using InteractiveUtils # ╔═╡ 7685d19e-cc98-4031-a6f9-29ecccc9f417 begin - using SoleModels - using DataFrames + # using SoleModels + # using DataFrames # Load an example time-series classification dataset as a tuple (DataFrame, Vector{String}) X_df, y = SoleModels.load_arff_dataset("NATOPS") @@ -32,7 +32,7 @@ end # ╔═╡ 1ccda54b-1b70-4353-ace6-fe277e5bf67f begin - using MultiData + # using MultiData # Construct a logiset from a DataFrame logiset = scalarlogiset(X_df, features) diff --git a/src/SoleModels.jl b/src/SoleModels.jl index 6ed47a8..2b6e3de 100644 --- a/src/SoleModels.jl +++ b/src/SoleModels.jl @@ -61,6 +61,7 @@ export height export DecisionEnsemble, models export DecisionForest, trees export DecisionSet, rules, nrules +export DecisionXGBoost export MixedModel @@ -92,6 +93,9 @@ export subtreeheight include("symbolic-utils.jl") export PlainRuleExtractor + + + export extractrules, listrules, joinrules include("rule-extraction.jl") diff --git a/src/deprecate.jl b/src/deprecate.jl index a1f16d8..3f944f1 100644 --- a/src/deprecate.jl +++ b/src/deprecate.jl @@ -3,7 +3,7 @@ const MixedSymbolicModel = MixedModel const List = DecisionList const Tree = DecisionTree const Forest = DecisionForest - +const modalextractrules = extractrules; export modalextractrules @inline function apply( diff --git a/src/evaluate.jl b/src/evaluate.jl index a460498..cc459ef 100644 --- a/src/evaluate.jl +++ b/src/evaluate.jl @@ -322,8 +322,20 @@ function evaluaterule( classmask = (Y .== outcome(consequent(rule))) checkmask, explanations = begin if compute_explanations + + ### from Perry's SoleModels fix for SolePostHoc # Note: This is kind of quick and dirty. - disjs = SoleLogics.disjuncts(SoleLogics.LeftmostDisjunctiveForm(antecedent(rule))) + # disjs = SoleLogics.disjuncts(SoleLogics.LeftmostDisjunctiveForm(antecedent(rule))) + ante = antecedent(rule) + if (ante isa SyntaxBranch) + # Disjunctive root: convert to disjunctive form, then extract the disjuncts + dnf = SoleLogics.LeftmostDisjunctiveForm(ante) + disjs = SoleLogics.disjuncts(dnf) + else + # No OR at the root → a single disjunct + disjs = [ante] + end + checkmatrix = hcat([check(disj, X; kwargs...) for disj in disjs]...) # @show checkmatrix checkmask = map(any, eachrow(checkmatrix)) @@ -337,11 +349,22 @@ end pos_checkmask = checkmask[classmask] neg_checkmask = checkmask[(!).(classmask)] + + ### from Perry's SoleModels fix for SolePostHoc + # Guard against empty arrays + sensitivity = length(pos_checkmask) > 0 ? sum(pos_checkmask)/length(pos_checkmask) : 0.0 + specificity = length(neg_checkmask) > 0 ?
1-(sum(neg_checkmask)/length(neg_checkmask)) : 1.0 + out = (; classmask = classmask, checkmask = checkmask, - sensitivity = sum(pos_checkmask)/length(pos_checkmask), - specificity = 1-(sum(neg_checkmask)/length(neg_checkmask)), + + ### from Perry's SoleModels fix for SolePostHoc + # sensitivity = sum(pos_checkmask)/length(pos_checkmask), + # specificity = 1-(sum(neg_checkmask)/length(neg_checkmask)), + sensitivity = sensitivity, + specificity = specificity, + explanations = explanations, ) return out diff --git a/src/print.jl b/src/print.jl index b111210..efc4e13 100644 --- a/src/print.jl +++ b/src/print.jl @@ -521,7 +521,7 @@ end function printmodel( io::IO, - m::DecisionEnsemble; + m::Union{DecisionEnsemble, DecisionXGBoost}; header = DEFAULT_HEADER, indentation_str = "", indentation = default_indentation, diff --git a/src/utils/models/ensembles.jl b/src/utils/models/ensembles.jl index 7f1d6cb..7d5c5aa 100644 --- a/src/utils/models/ensembles.jl +++ b/src/utils/models/ensembles.jl @@ -95,7 +95,6 @@ struct DecisionEnsemble{O,T<:AbstractModel,A<:Base.Callable,W<:Union{Nothing,Abs O = Union{outcometype.(models)...} DecisionEnsemble{O}(models, args...; kwargs...) end - end @@ -105,6 +104,8 @@ modelstype(m::DecisionEnsemble{O,T}) where {O,T} = T models(m::DecisionEnsemble) = m.models nmodels(m::DecisionEnsemble) = length(models(m)) +iscomplete(m::DecisionEnsemble) = any(iscomplete.(models(m))) + aggregation(m::DecisionEnsemble) = m.aggregation weights(m::DecisionEnsemble) = m.weights # Returns the aggregation function, patched by weights if the model has them. @@ -252,149 +253,279 @@ function ntrees(m::DecisionForest) length(trees(m)) end +# """ +# A `MaxDecisionBag` is an ensemble of models, weighted by a set of other models. + +# See also [`DecisionForest`](@ref), [`DecisionTree`](@ref), [`DecisionEnsemble`](@ref), [`MaxDecisionBag`](@ref). +# """ +# struct DecisionBag{O,TO<:AbstractModel,TU<:AbstractModel +# # ,A<:Base.Callable +# # ,W<:Union{Nothing,AbstractVector} +# } <: AbstractDecisionEnsemble{O} +# output_producing_models::Vector{TO} +# weight_producing_models::Vector{TU} +# # aggregation::A +# # weights::W +# info::NamedTuple + +# function DecisionBag{O}( +# output_producing_models::Vector, +# weight_producing_models::Vector, +# # aggregation::Union{Nothing,Base.Callable}, +# # weights::Union{Nothing,AbstractVector}, +# info::NamedTuple = (;); +# suppress_parity_warning = nothing, +# ) where {O} +# @assert length(output_producing_models) > 0 "Cannot instantiate empty bagoutput-producing models!" +# @assert length(weight_producing_models) > 0 "Cannot instantiate empty bagweight-producing models!" +# @assert length(output_producing_models) == length(weight_producing_models) "Cannot instantiate bag with different numbers of output and weight producing models: $(length(output_producing_models)) != $(length(weight_producing_models))." +# output_producing_models = wrap.(output_producing_models) +# weight_producing_models = wrap.(weight_producing_models) +# # if isnothing(aggregation) +# # # if a suppress_parity_warning parameter is provided, then the aggregation's suppress_parity_warning defaults to it; +# # # otherwise, it defaults to bestguess's suppress_parity_warning +# # if isnothing(suppress_parity_warning) +# # aggregation = function (args...; kwargs...) bestguess(args...; kwargs...) end +# # else +# # aggregation = function (args...; suppress_parity_warning = suppress_parity_warning, kwargs...) bestguess(args...; suppress_parity_warning, kwargs...) 
end +# # end +# # else +# # isnothing(suppress_parity_warning) || @warn "Unexpected value for suppress_parity_warning: $(suppress_parity_warning)." +# # end +# TO = typeof(output_producing_models) +# TU = typeof(weight_producing_models) +# # W = typeof(weights) +# # A = typeof(aggregation) +# new{O,TO,TU}(output_producing_models, weight_producing_models, aggregation, info) # , weights +# end + +# function MaxDecisionBag( +# output_producing_models::Vector, +# weight_producing_models::Vector, +# args...; kwargs... +# ) +# @assert length(output_producing_models) > 0 "Cannot instantiate empty bagoutput-producing models!" +# @assert length(weight_producing_models) > 0 "Cannot instantiate empty bagweight-producing models!" +# @assert length(output_producing_models) == length(weight_producing_models) "Cannot instantiate bag with different numbers of output and weight producing models: $(length(output_producing_models)) != $(length(weight_producing_models))." +# output_producing_models = wrap.(output_producing_models) +# weight_producing_models = wrap.(weight_producing_models) +# O = Union{outcometype.(output_producing_models)...} +# MaxDecisionBag{O}(output_producing_models, weight_producing_models, args...; kwargs...) +# end +# end + +# isensemble(m::MaxDecisionBag) = true + +# function apply(m::MaxDecisionBag, d::AbstractInterpretation; suppress_parity_warning = false, kwargs...) +# weights = [apply(wm, d; suppress_parity_warning, kwargs...) for wm in m.weight_producing_models] +# om = m.output_producing_models[argmax(weights)] +# pred = apply(om, d; suppress_parity_warning, kwargs...) +# # preds = [apply(om, d; suppress_parity_warning, kwargs...) for om in m.output_producing_models] +# # pred = aggregation(m)(preds, weights; suppress_parity_warning) +# pred +# end +# # TODO Add a keyword argument that toggles the soft or hard behavior. The hard behavior is one where you first find the bestguess among the weights, and then perform the apply only on the first + +# # TODO parallelize +# function apply( +# m::MaxDecisionBag, +# d::AbstractInterpretationSet; +# suppress_parity_warning = false, +# kwargs... +# ) +# weights = hcat([apply(wm, d; suppress_parity_warning, kwargs...) for wm in m.weight_producing_models]...) +# preds = __apply_post(m, preds) +# preds = [ +# apply(m.output_producing_models[im], d; suppress_parity_warning, kwargs...) +# for im in argmax(weights; dims=2) +# ] +# preds = __apply_pre(m, d, preds) +# return preds +# end + +# function apply!(m::MaxDecisionBag, d::AbstractInterpretationSet, y::AbstractVector; mode = :replace, leavesonly = false, suppress_parity_warning = false, kwargs...) +# y = __apply_pre(m, d, y) +# weights = hcat([apply!(wm, d, y; mode, leavesonly, suppress_parity_warning, kwargs...) for wm in m.weight_producing_models]...) +# preds = __apply_post(m, preds) +# preds = [ +# apply!(m.output_producing_models[im], d, y; mode, leavesonly, suppress_parity_warning, kwargs...) +# for im in argmax(weights; dims=2) +# ] +# preds = __apply_pre(m, d, preds) +# return __apply!(m, mode, preds, y, leavesonly) +# end + +# """ +# TODO explain. The output of XGBoost via the strategy "multi:softmax". 
+# """ +# const MaxTreeBag{O,W<:RLabel,A<:typeof(+),WW<:RLabel} = MaxDecisionBag{O,ConstantModel{O},DecisionEnsemble{W,DecisionTree,A,WW}} + +# function unique_with_indices(x) +# unique_vals = unique(x) +# indices = [findall(==(val), x) for val in unique_vals] +# return unique_vals, indices +# end +# function apply!( +# dbag::SoleModels.DecisionBag, +# X, +# y, +# classlabels +# ) +# n_stumps = length(dbag.output_producing_models) +# values, idx_groups = unique_with_indices(dbag.info.supporting_predictions) +# count_trees = Dict() +# for i in 1:n_stumps +# prediction = DecisionTree.apply_tree(dbag.output_producing_models[i], classlabels) +# count_trees[prediction] = get(count_trees, prediction, 0.0) + weights[i] +# end +# top_prediction = dbag.output_producing_models[1].left.majority +# top_count = -Inf +# for (k, v) in count_trees +# if v > top_count +# top_prediction = k +# top_count = v +# end +# end +# return top_prediction + +# dbag.info.supporting_labels = y +# dbag.info.supporting_predictions = top_prediction +# end + +# ---------------------------------------------------------------------------- # +# DecisionXGBoost struct # +# ---------------------------------------------------------------------------- # """ -A `MaxDecisionBag` is an ensemble of models, weighted by a set of other models. -In this simplified implementation, only the model with the highest (`max`) weight is responsible for the outcome. +A `DecisionXGBoost` is an ensemble of models, weighted by leaf values, exp.summed during apply. See also [`DecisionForest`](@ref), [`DecisionTree`](@ref), [`DecisionEnsemble`](@ref), [`MaxDecisionBag`](@ref). """ -struct MaxDecisionBag{O,TO<:AbstractModel,TU<:AbstractModel - # ,A<:Base.Callable - # ,W<:Union{Nothing,AbstractVector} - } <: AbstractDecisionEnsemble{O} - output_producing_models::Vector{TO} - weight_producing_models::Vector{TU} - # aggregation::A - # weights::W +struct DecisionXGBoost{O,T<:AbstractModel,A<:Base.Callable} <: AbstractDecisionEnsemble{O} + models::Vector{T} + aggregation::A info::NamedTuple - function MaxDecisionBag{O}( - output_producing_models::Vector, - weight_producing_models::Vector, - # aggregation::Union{Nothing,Base.Callable}, - # weights::Union{Nothing,AbstractVector}, + function DecisionXGBoost{O}( + models::AbstractVector{T}, + aggregation::Union{Nothing,Base.Callable}, info::NamedTuple = (;); - suppress_parity_warning = nothing, - ) where {O} - @assert length(output_producing_models) > 0 "Cannot instantiate empty bagoutput-producing models!" - @assert length(weight_producing_models) > 0 "Cannot instantiate empty bagweight-producing models!" - @assert length(output_producing_models) == length(weight_producing_models) "Cannot instantiate bag with different numbers of output and weight producing models: $(length(output_producing_models)) != $(length(weight_producing_models))." - output_producing_models = wrap.(output_producing_models) - weight_producing_models = wrap.(weight_producing_models) - # if isnothing(aggregation) - # # if a suppress_parity_warning parameter is provided, then the aggregation's suppress_parity_warning defaults to it; - # # otherwise, it defaults to bestguess's suppress_parity_warning - # if isnothing(suppress_parity_warning) - # aggregation = function (args...; kwargs...) bestguess(args...; kwargs...) end - # else - # aggregation = function (args...; suppress_parity_warning = suppress_parity_warning, kwargs...) bestguess(args...; suppress_parity_warning, kwargs...) 
end - # end - # else - # isnothing(suppress_parity_warning) || @warn "Unexpected value for suppress_parity_warning: $(suppress_parity_warning)." - # end - TO = typeof(output_producing_models) - TU = typeof(weight_producing_models) - # W = typeof(weights) - # A = typeof(aggregation) - new{O,TO,TU}(output_producing_models, weight_producing_models, aggregation, info) # , weights + return_sum::Bool=false + ) where {O,T<:AbstractModel} + @assert length(models) > 0 "Cannot instantiate empty ensemble!" + models = wrap.(models) + + if isnothing(aggregation) + aggregation = function(args...; return_sum=false) bestguess(args...; return_sum) end + end + + A = typeof(aggregation) + new{O,T,A}(collect(models), aggregation, info) end - function MaxDecisionBag( - output_producing_models::Vector, - weight_producing_models::Vector, + function DecisionXGBoost{O}( + models::AbstractVector; + kwargs... + ) where {O} + info = (;) + DecisionXGBoost{O}(models, nothing, info; kwargs...) + end + + function DecisionXGBoost{O}( + models::AbstractVector, + info::NamedTuple; + kwargs... + ) where {O} + DecisionXGBoost{O}(models, nothing, info; kwargs...) + end + + function DecisionXGBoost( + models::AbstractVector, args...; kwargs... ) - @assert length(output_producing_models) > 0 "Cannot instantiate empty bagoutput-producing models!" - @assert length(weight_producing_models) > 0 "Cannot instantiate empty bagweight-producing models!" - @assert length(output_producing_models) == length(weight_producing_models) "Cannot instantiate bag with different numbers of output and weight producing models: $(length(output_producing_models)) != $(length(weight_producing_models))." - output_producing_models = wrap.(output_producing_models) - weight_producing_models = wrap.(weight_producing_models) - O = Union{outcometype.(output_producing_models)...} - MaxDecisionBag{O}(output_producing_models, weight_producing_models, args...; kwargs...) + @assert length(models) > 0 "Cannot instantiate empty ensemble!" + models = wrap.(models) + O = Union{outcometype.(models)...} + DecisionXGBoost{O}(models, args...; kwargs...) end end -isensemble(m::MaxDecisionBag) = true +isensemble(m::DecisionXGBoost) = true -function apply(m::MaxDecisionBag, d::AbstractInterpretation; suppress_parity_warning = false, kwargs...) - weights = [apply(wm, d; suppress_parity_warning, kwargs...) for wm in m.weight_producing_models] - om = m.output_producing_models[argmax(weights)] - pred = apply(om, d; suppress_parity_warning, kwargs...) - # preds = [apply(om, d; suppress_parity_warning, kwargs...) for om in m.output_producing_models] - # pred = aggregation(m)(preds, weights; suppress_parity_warning) - pred -end +modelstype(m::DecisionXGBoost{O,T}) where {O,T} = T +models(m::DecisionXGBoost) = m.models +nmodels(m::DecisionXGBoost) = length(models(m)) -# TODO Add a keyword argument that toggles the soft or hard behavior. The hard behavior is one where you first find the bestguess among the weights, and then perform the apply only on the first +aggregation(m::DecisionXGBoost) = m.aggregation +scored_aggregation(m::DecisionXGBoost) = aggregation(m) + +""" + function height(m::DecisionXGBoost) + +Return the maximum height across all the [`DecisionTree`](@ref)s within `m`. + +See also [`DecisionXGBoost`](@ref), `DecisionForest`](@ref), [`DecisionTree`](@ref). +""" +height(m::DecisionXGBoost) = subtreeheight(m) + +immediatesubmodels(m::DecisionXGBoost) = trees(m) +nimmediatesubmodels(m::DecisionXGBoost) = length(trees(m)) +listimmediaterules(m::DecisionXGBoost; kwargs...) 
= error("TODO implement") + +# ---------------------------------------------------------------------------- # +# DecisionXGBoost apply # +# ---------------------------------------------------------------------------- # +function apply( + m::DecisionXGBoost, + d::AbstractInterpretation; + suppress_parity_warning=false, + kwargs... +) + preds = [apply_leaf_scores(subm, d; suppress_parity_warning, kwargs...) for subm in models(m)] + preds = __apply_post(m, preds) + scored_aggregation(m)(preds, sort(unique(m.info.supporting_labels)); suppress_parity_warning) +end # TODO parallelize function apply( - m::MaxDecisionBag, + m::DecisionXGBoost, d::AbstractInterpretationSet; - suppress_parity_warning = false, + suppress_parity_warning=false, kwargs... ) - weights = hcat([apply(wm, d; suppress_parity_warning, kwargs...) for wm in m.weight_producing_models]...) + # we expect classlabels * nrounds trees, because at every round + # XGBoost creates a tree for every classlabel. + # So, in every subm model, we'll find as many trees as classlabels. + preds = hcat([apply_leaf_scores(subm, d; suppress_parity_warning, kwargs...) for subm in models(m)]...) preds = __apply_post(m, preds) preds = [ - apply(m.output_producing_models[im], d; suppress_parity_warning, kwargs...) - for im in argmax(weights; dims=2) + scored_aggregation(m)(pred, sort(unique(m.info.supporting_labels))) + for pred in eachrow(preds) ] - preds = __apply_pre(m, d, preds) return preds end -function apply!(m::MaxDecisionBag, d::AbstractInterpretationSet, y::AbstractVector; mode = :replace, leavesonly = false, suppress_parity_warning = false, kwargs...) +# TODO parallelize +function apply!( + m::DecisionXGBoost, + d::AbstractInterpretationSet, + y::AbstractVector; + mode::Symbol=:replace, + leavesonly::Bool=false, + suppress_parity_warning::Bool=true, + kwargs... +) y = __apply_pre(m, d, y) - weights = hcat([apply!(wm, d, y; mode, leavesonly, suppress_parity_warning, kwargs...) for wm in m.weight_producing_models]...) + + preds = hcat([apply_leaf_scores(subm, d; suppress_parity_warning, kwargs...) for subm in models(m)]...) preds = __apply_post(m, preds) preds = [ - apply!(m.output_producing_models[im], d, y; mode, leavesonly, suppress_parity_warning, kwargs...) - for im in argmax(weights; dims=2) + scored_aggregation(m)(pred, sort(unique(m.info.supporting_labels))) + for pred in eachrow(preds) ] preds = __apply_pre(m, d, preds) - return __apply!(m, mode, preds, y, leavesonly) -end -""" -TODO explain. The output of XGBoost via the strategy "multi:softmax".
-""" -const MaxTreeBag{O,W<:RLabel,A<:typeof(+),WW<:RLabel} = MaxDecisionBag{O,ConstantModel{O},DecisionEnsemble{W,DecisionTree,A,WW}} - -function unique_with_indices(x) - unique_vals = unique(x) - indices = [findall(==(val), x) for val in unique_vals] - return unique_vals, indices + return __apply!(m, mode, preds, y, leavesonly) end - -# function apply!( -# dbag::SoleModels.DecisionBag, -# X, -# y, -# classlabels -# ) -# n_stumps = length(dbag.output_producing_models) -# values, idx_groups = unique_with_indices(dbag.info.supporting_predictions) - -# count_trees = Dict() -# for i in 1:n_stumps -# prediction = DecisionTree.apply_tree(dbag.output_producing_models[i], classlabels) -# count_trees[prediction] = get(count_trees, prediction, 0.0) + weights[i] -# end -# top_prediction = dbag.output_producing_models[1].left.majority -# top_count = -Inf -# for (k, v) in count_trees -# if v > top_count -# top_prediction = k -# top_count = v -# end -# end -# return top_prediction - -# dbag.info.supporting_labels = y -# dbag.info.supporting_predictions = top_prediction -# end - diff --git a/src/utils/models/leaf.jl b/src/utils/models/leaf.jl index 599f91a..d49101d 100644 --- a/src/utils/models/leaf.jl +++ b/src/utils/models/leaf.jl @@ -95,6 +95,43 @@ end convert(::Type{ConstantModel{O}}, o::O) where {O} = ConstantModel{O}(o) convert(::Type{<:AbstractModel{F}}, m::ConstantModel) where {F} = ConstantModel{F}(m) +# ---------------------------------------------------------------------------- # +# DecisionXGBoost apply # +# ---------------------------------------------------------------------------- # +outcome_leaf_value(m::ConstantModel) = m.info.leaf_values + +apply_leaf_scores(m::ConstantModel, i::AbstractInterpretation; kwargs...) = outcome(m) +apply_leaf_scores( + m::ConstantModel, + d::AbstractInterpretationSet, + i_instance::Integer; + kwargs... +) = (outcome(m), outcome_leaf_value(m)) +apply_leaf_scores( + m::ConstantModel, + d::AbstractInterpretationSet; + kwargs... +) = Fill((outcome(m), outcome_leaf_value(m)), ninstances(d)) + +function apply_leaf_scores!( + m::ConstantModel, + d::AbstractInterpretationSet, + y::AbstractVector; + mode = :replace, + leavesonly = false, + kwargs... +) + # @assert length(y) == ninstances(d) "$(length(y)) == $(ninstances(d))" + if mode == :replace + recursivelyemptysupports!(m, leavesonly) + mode = :append + end + + preds = fill((outcome(m), outcome_leaf_value(m)), ninstances(d)) + + return __apply!(m, mode, preds, y, leavesonly) +end + ############################################################################################ ################################### FunctionModel ########################################## ############################################################################################ diff --git a/src/utils/models/rule-and-branch.jl b/src/utils/models/rule-and-branch.jl index b845e31..e45df00 100644 --- a/src/utils/models/rule-and-branch.jl +++ b/src/utils/models/rule-and-branch.jl @@ -347,6 +347,35 @@ function apply( preds end +# ---------------------------------------------------------------------------- # +# DecisionXGBoost apply # +# ---------------------------------------------------------------------------- # +function apply_leaf_scores( + m::Branch, + d::AbstractInterpretationSet; + check_args::Tuple = (), + check_kwargs::NamedTuple = (;), + kwargs... +) + checkmask = checkantecedent(m, d, check_args...; check_kwargs...) 
+ preds = Vector(undef,length(checkmask)) + preds[checkmask] .= apply_leaf_scores( + posconsequent(m), + slicedataset(d, checkmask; return_view = true, allow_no_instances = true); + check_args = check_args, + check_kwargs = check_kwargs, + kwargs... + ) + preds[(!).(checkmask)] .= apply_leaf_scores( + negconsequent(m), + slicedataset(d, (!).(checkmask); return_view = true, allow_no_instances = true); + check_args = check_args, + check_kwargs = check_kwargs, + kwargs... + ) + preds +end + function apply!( m::Branch, d::AbstractInterpretationSet, diff --git a/test/DecisionTreeExt/adaboost.jl b/test/DecisionTreeExt/adaboost.jl new file mode 100644 index 0000000..445b8a7 --- /dev/null +++ b/test/DecisionTreeExt/adaboost.jl @@ -0,0 +1,132 @@ +X, y = @load_iris +X = DataFrame(X) + +train_ratio = 0.7 +rng = Xoshiro(1) + +train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) +X_train, y_train = X[train, :], y[train] +X_test, y_test = X[test, :], y[test] + +println("Training set size: ", size(X_train), " - ", size(y_train)) +println("Test set size: ", size(X_test), " - ", size(y_test)) +println("Training set type: ", typeof(X_train), " - ", typeof(y_train)) +println("Test set type: ", typeof(X_test), " - ", typeof(y_test)) + +# ---------------------------------------------------------------------------- # +# AdaBoost solemodel # +# ---------------------------------------------------------------------------- # +Stump = MLJ.@load AdaBoostStumpClassifier pkg=DecisionTree + +model = Stump(; + n_iter=10, + feature_importance=:impurity, + rng +) + +# Bind the model and data into a machine +mach = machine(model, X_train, y_train) +# Fit the model +fit!(mach, verbosity=0) + +weights = mach.fitresult[2] +classlabels = sort(mach.fitresult[3]) +featurenames = MLJ.report(mach).features + +solem = solemodel(MLJ.fitted_params(mach).stumps; weights, classlabels, featurenames) +solem = solemodel(MLJ.fitted_params(mach).stumps; weights, classlabels, featurenames, keep_condensed = false) + +@test SoleData.scalarlogiset(X_test; allow_propositional = true) isa PropositionalLogiset + +# Make test instances flow into the model +preds = apply(solem, X_test) +preds2 = apply!(solem, X_test, y_test) + +@test preds == preds2 + +# apply!(solem, X_test, y_test, mode = :append) + +printmodel(solem; max_depth = 7, show_intermediate_finals = true, show_metrics = true) + +# @test_broken printmodel.(listrules(solem, min_lift = 1.0, min_ninstances = 0); show_metrics = true); + +# ---------------------------------------------------------------------------- # +# AdaBoost decisiontree # +# ---------------------------------------------------------------------------- # +# train adaptive-boosted stumps, using 10 iterations +dt_model, dt_coeffs = DT.build_adaboost_stumps(y_train, Matrix(X_train), 10) +# apply learned model +dt_preds = DT.apply_adaboost_stumps(dt_model, dt_coeffs, Matrix(X_test)) +# get the probability of each label +dt_proba = DT.apply_adaboost_stumps_proba(dt_model, dt_coeffs, Matrix(X_test), classlabels) + +@test preds == dt_preds + +# ---------------------------------------------------------------------------- # +# Accuracy # +# ---------------------------------------------------------------------------- # +ada_accuracy = sum(preds .== y_test)/length(y_test) +# @test accuracy >= 0.8 + +# decision tree +Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree +dt_model = Tree(max_depth=-1, min_samples_leaf=1, min_samples_split=2) +dt_mach = machine(dt_model, X_train, y_train) +fit!(dt_mach, verbosity=0) 
+dt_solem = solemodel(fitted_params(dt_mach).tree) +dt_preds = apply(dt_solem, X_test) +dt_accuracy = sum(dt_preds .== y_test)/length(y_test) + +# random forest +Forest = MLJ.@load RandomForestClassifier pkg=DecisionTree +rm_model = Forest(; max_depth=3, min_samples_leaf=1, min_samples_split=2, n_trees=10, rng) +rm_mach = machine(rm_model, X_train, y_train) +fit!(rm_mach, verbosity=0) +classlabels = (rm_mach).fitresult[2] +classlabels = classlabels[sortperm((rm_mach).fitresult[3])] +featurenames = report(rm_mach).features +rm_solem = solemodel(fitted_params(rm_mach).forest; classlabels, featurenames) +rm_preds = apply(rm_solem, X_test) +rm_accuracy = sum(rm_preds .== y_test)/length(y_test) + +println("AdaBoost accuracy: ", ada_accuracy) +println("DecisionTree accuracy: ", dt_accuracy) +println("RandomForest accuracy: ", rm_accuracy) + +@test ada_accuracy ≥ rm_accuracy ≥ dt_accuracy + +# ---------------------------------------------------------------------------- # +# Data Validation # +# ---------------------------------------------------------------------------- # +@testset "data validation" begin + Stump = MLJ.@load AdaBoostStumpClassifier pkg=DecisionTree + + for train_ratio in 0.5:0.1:0.9 + for seed in 1:40 + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + + for n_iter in 10:10:100 + # solemodel + model = Stump(; n_iter, rng=Xoshiro(seed)) + mach = machine(model, X_train, y_train) + fit!(mach, verbosity=0) + weights = mach.fitresult[2] + classlabels = sort(mach.fitresult[3]) + featurenames = MLJ.report(mach).features + solem = solemodel(MLJ.fitted_params(mach).stumps; weights, classlabels, featurenames) + preds = apply(solem, X_test) + + # decisiontree + yl_train = CategoricalArrays.levelcode.(y_train) + dt_model, dt_coeffs = DT.build_adaboost_stumps(yl_train, Matrix(X_train), n_iter; rng=Xoshiro(seed)) + dt_preds = DT.apply_adaboost_stumps(dt_model, dt_coeffs, Matrix(X_test)) + + code_preds = CategoricalArrays.levelcode.(preds) + @test code_preds == dt_preds + end + end + end +end + diff --git a/test/DecisionTreeExt/forest.jl b/test/DecisionTreeExt/forest.jl index 4e0d1b5..d4577aa 100644 --- a/test/DecisionTreeExt/forest.jl +++ b/test/DecisionTreeExt/forest.jl @@ -1,20 +1,10 @@ -using Test - -using MLJ -using MLJBase -using DataFrames - -using MLJDecisionTreeInterface -using SoleModels - -import DecisionTree as DT - X, y = @load_iris X = DataFrame(X) train_ratio = 0.8 +rng = Xoshiro(11) -train, test = partition(eachindex(y), train_ratio, shuffle=true) +train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) X_train, y_train = X[train, :], y[train] X_test, y_test = X[test, :], y[test] @@ -52,10 +42,43 @@ preds = apply(solem, X_test) preds2 = apply!(solem, X_test, y_test) @test preds == preds2 -@test sum(preds .== y_test)/length(y_test) >= 0.8 +accuracy = sum(preds .== y_test)/length(y_test) +@test accuracy >= 0.8 # apply!(solem, X_test, y_test, mode = :append) printmodel(solem; max_depth = 7, show_intermediate_finals = true, show_metrics = true) -@test_broken printmodel.(listrules(solem, min_lift = 1.0, min_ninstances = 0); show_metrics = true); +# @test_broken printmodel.(listrules(solem, min_lift = 1.0, min_ninstances = 0); show_metrics = true); + +# ---------------------------------------------------------------------------- # +# Data Validation # +# ---------------------------------------------------------------------------- # +@testset "data 
validation" begin + Forest = MLJ.@load RandomForestClassifier pkg=DecisionTree + + for train_ratio in 0.5:0.1:0.9 + for seed in 1:40 + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + + for n_trees in 10:10:60 + # solemodel + model = Forest(; n_trees, rng=Xoshiro(seed)) + mach = machine(model, X_train, y_train) + fit!(mach, verbosity=0) + classlabels = (mach).fitresult[2][sortperm((mach).fitresult[3])] + featurenames = MLJ.report(mach).features + solem = solemodel(MLJ.fitted_params(mach).forest; classlabels, featurenames) + preds = apply!(solem, X_test, y_test) + + # decisiontree + rf_model = DT.build_forest(y_train, Matrix(X_train), -1, n_trees; rng=Xoshiro(seed)) + rf_preds = DT.apply_forest(rf_model, Matrix(X_test)) + + @test preds == rf_preds + end + end + end +end diff --git a/test/DecisionTreeExt/tree.jl b/test/DecisionTreeExt/tree.jl index 6936d00..2d864a0 100644 --- a/test/DecisionTreeExt/tree.jl +++ b/test/DecisionTreeExt/tree.jl @@ -1,20 +1,10 @@ -using Test - -using MLJ -using MLJBase -using DataFrames - -using MLJDecisionTreeInterface -using SoleModels - -import DecisionTree as DT - X, y = @load_iris X = DataFrame(X) train_ratio = 0.8 +rng = Xoshiro(11) -train, test = partition(eachindex(y), train_ratio, shuffle=true) +train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) X_train, y_train = X[train, :], y[train] X_test, y_test = X[test, :], y[test] @@ -47,7 +37,8 @@ preds = apply(solem, X_test) preds2 = apply!(solem, X_test, y_test) @test preds == preds2 -@test sum(preds .== y_test)/length(y_test) > 0.7 +accuracy = sum(preds .== y_test)/length(y_test) +@test accuracy > 0.7 # apply!(solem, X_test, y_test, mode = :append) @@ -72,3 +63,35 @@ printmodel.(sort(interesting_rules, by = readmetrics); show_metrics = (; round_d @test length(joinrules(interesting_rules)) == 3 @test (natoms.((interesting_rules)) |> sum) == (natoms.(joinrules(interesting_rules)) |> sum) + +# ---------------------------------------------------------------------------- # +# Data Validation # +# ---------------------------------------------------------------------------- # +@testset "data validation" begin + Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree + + for train_ratio in 0.5:0.1:0.9 + for seed in 1:40 + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + + for max_depth in 2:1:6 + # solemodel + model = Tree(; max_depth, rng=Xoshiro(seed)) + mach = machine(model, X_train, y_train) + fit!(mach, verbosity=0) + solem = solemodel(MLJ.fitted_params(mach).tree) + preds = apply!(solem, X_test, y_test) + + # decisiontree + y_coded_train = @. 
CategoricalArrays.levelcode(y_train) + dt_model = DT.build_tree(y_coded_train, Matrix(X_train), 0, max_depth; rng=Xoshiro(seed)) + dt_preds = DT.apply_tree(dt_model, Matrix(X_test)) + + preds_coded = CategoricalArrays.levelcode.(CategoricalArray(preds)) + @test preds_coded == dt_preds + end + end + end +end diff --git a/test/XGBoostExt.jl b/test/XGBoostExt.jl deleted file mode 100644 index 1ae6c42..0000000 --- a/test/XGBoostExt.jl +++ /dev/null @@ -1,47 +0,0 @@ - -# Import necessary libraries -using MLJ -using DataFrames - -# Load the Iris dataset -X, y = @load_iris -X = DataFrame(X) - -# Convert the target variable to categorical -y = coerce(y, Multiclass) - -# Split the dataset into training and testing sets -train, test = partition(eachindex(y), 0.8, shuffle=true) -X_train, X_test = X[train, :], X[test, :] -y_train, y_test = y[train], y[test] - -# Load the XGBoost classifier -XGBoostClassifier = @load XGBoostClassifier pkg=XGBoost - -# Create the model and set hyperparameters -mljmodel = XGBoostClassifier() - -# Wrap the model with the data -mach = machine(mljmodel, X_train, y_train) - -# Train the model -fit!(mach) - -# Make predictions -y_pred = predict(mach, X_test) - -# Evaluate test accuracy -acc = mean(mode.(y_pred) .== y_test) - -# Print the test accuracy -println("Test Accuracy: $acc") - - - -using SoleModels - -@test_nowarn alphabet(fitted_params(mach).fitresult[1]) - -model = fitted_params(mach).fitresult[1] - -@test_broken solemodel(model) diff --git a/test/XGBoostExt/xgboost_classifier.jl b/test/XGBoostExt/xgboost_classifier.jl new file mode 100644 index 0000000..ad04f0a --- /dev/null +++ b/test/XGBoostExt/xgboost_classifier.jl @@ -0,0 +1,143 @@ +X, y = @load_iris +X = DataFrame(X) + +train_ratio = 0.7 +rng = Xoshiro(11) + +train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) +X_train, y_train = X[train, :], y[train] +X_test, y_test = X[test, :], y[test] + +println("Training set size: ", size(X_train), " - ", size(y_train)) +println("Test set size: ", size(X_test), " - ", size(y_test)) +println("Training set type: ", typeof(X_train), " - ", typeof(y_train)) +println("Test set type: ", typeof(X_test), " - ", typeof(y_test)) + +# ---------------------------------------------------------------------------- # +# XGBoost solemodel # +# ---------------------------------------------------------------------------- # +XGTrees = MLJ.@load XGBoostClassifier pkg=XGBoost + +model = XGTrees(; + num_round=10, + tree_method="exact", + objective="multi:softmax" +) + +# Bind the model and data into a machine +mach = machine(model, X_train, y_train) +# Fit the model +fit!(mach; verbosity=0) + +get_encoding(classes_seen) = Dict(MMI.int(c) => c for c in MMI.classes(classes_seen)) +get_classlabels(encoding) = [string(encoding[i]) for i in sort(keys(encoding) |> collect)] +trees = XGB.trees(mach.fitresult[1]) +encoding = get_encoding(mach.fitresult[2]) +classlabels = get_classlabels(encoding) +featurenames = mach.report.vals[1].features + +solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames) +solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, keep_condensed = false) + +@test SoleData.scalarlogiset(X_test; allow_propositional = true) isa PropositionalLogiset + +# Make test instances flow into the model +X_test_f32 = mapcols(col -> Float32.(col), X_test) +preds = apply(solem, X_test_f32) +predsl = CategoricalArrays.levelcode.(CategoricalArrays.categorical(preds)) .- 1 + +apply!(solem, X_test, y_test) +@test 
solem.info.supporting_predictions == preds +@test solem.info.supporting_labels == y_test + +# ---------------------------------------------------------------------------- # +# julia XGBoost # +# ---------------------------------------------------------------------------- # +yl_train = CategoricalArrays.levelcode.(CategoricalArrays.categorical(y_train)) .- 1 +# create and train a gradient boosted tree model of 5 trees +bst = XGB.xgboost( + (X_train, yl_train), + num_round=10, + num_class=3, + tree_method="exact", + objective="multi:softmax" +) +# obtain model predictions +xg_preds = XGB.predict(bst, X_test) + +@test predsl == xg_preds + +# ---------------------------------------------------------------------------- # +# Accuracy # +# ---------------------------------------------------------------------------- # +xg_accuracy = sum(preds .== y_test)/length(y_test) +# @test accuracy >= 0.8 + +# decision tree +Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree +dt_model = Tree(max_depth=-1, min_samples_leaf=1, min_samples_split=2) +dt_mach = machine(dt_model, X_train, y_train) +fit!(dt_mach, verbosity=0) +dt_solem = solemodel(fitted_params(dt_mach).tree) +dt_preds = apply(dt_solem, X_test) +dt_accuracy = sum(dt_preds .== y_test)/length(y_test) + +# random forest +Forest = MLJ.@load RandomForestClassifier pkg=DecisionTree +rm_model = Forest(;max_depth=3, min_samples_leaf=1, min_samples_split=2, n_trees=10, rng) +rm_mach = machine(rm_model, X_train, y_train) +fit!(rm_mach, verbosity=0) +classlabels = (rm_mach).fitresult[2] +classlabels = classlabels[sortperm((rm_mach).fitresult[3])] +featurenames = report(rm_mach).features +rm_solem = solemodel(fitted_params(rm_mach).forest; classlabels, featurenames) +rm_preds = apply(rm_solem, X_test) +rm_accuracy = sum(rm_preds .== y_test)/length(y_test) + +println("XGBoost accuracy: ", xg_accuracy) +println("DecisionTree accuracy: ", dt_accuracy) +println("RandomForest accuracy: ", rm_accuracy) + +@test xg_accuracy ≥ rm_accuracy ≥ dt_accuracy + +# ---------------------------------------------------------------------------- # +# XGBoost Alphabet # +# ---------------------------------------------------------------------------- # +@test_nowarn alphabet(fitted_params(mach).fitresult[1]) + +# ---------------------------------------------------------------------------- # +# Data Validation # +# ---------------------------------------------------------------------------- # +@testset "data validation" begin + XGTrees = MLJ.@load XGBoostClassifier pkg=XGBoost + + for train_ratio in 0.5:0.1:0.9 + for seed in 1:40 + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + + for num_round in 10:10:50 + for eta in 0.1:0.1:0.6 + model = XGTrees(; num_round, eta, objective="multi:softmax") + mach = machine(model, X_train, y_train) + fit!(mach, verbosity=0) + trees = XGB.trees(mach.fitresult[1]) + encoding = get_encoding(mach.fitresult[2]) + classlabels = get_classlabels(encoding) + featurenames = mach.report.vals[1].features + solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames) + X_test_f32 = mapcols(col -> Float32.(col), X_test) + preds = apply!(solem, X_test_f32, y_test) + predsl = CategoricalArrays.levelcode.(CategoricalArrays.categorical(preds)) .- 1 + + yl_train = CategoricalArrays.levelcode.(CategoricalArrays.categorical(y_train)) .- 1 + bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, 
objective="multi:softmax") + xg_preds = XGB.predict(bst, X_test) + + @test predsl == xg_preds + end + end + end + end +end diff --git a/test/base.jl b/test/base.jl index 5dd8767..f9c98dc 100644 --- a/test/base.jl +++ b/test/base.jl @@ -1,10 +1,3 @@ -using SoleModels -using SoleLogics -using FunctionWrappers: FunctionWrapper -using SoleModels: AbstractModel -using SoleModels: ConstantModel, LeafModel -using Test - # base.jl io = IOBuffer() diff --git a/test/juliacon2024.jl b/test/juliacon2024.jl index 93f135c..ab50e19 100644 --- a/test/juliacon2024.jl +++ b/test/juliacon2024.jl @@ -1,12 +1,12 @@ # JuliaCon2024 demo # Load packages -begin - using MLJ - using MLJDecisionTreeInterface - using DataFrames - using Random -end +# begin +# using MLJ +# using MLJDecisionTreeInterface +# using DataFrames +# using Random +# end # Load dataset X, y = begin diff --git a/test/linear-form-utilities.jl b/test/linear-form-utilities.jl index d55fcb2..09fbb9b 100644 --- a/test/linear-form-utilities.jl +++ b/test/linear-form-utilities.jl @@ -1,8 +1,3 @@ -using Test -using SoleLogics -using SoleModels - - b = Branch(LeftmostConjunctiveForm((@atoms p q r s)), "YES", "NO") @test_nowarn b[1:3] diff --git a/test/misc.jl b/test/misc.jl index cbafaf2..a4594a4 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -1,17 +1,5 @@ -using Revise - -using Reexport -using FunctionWrappers: FunctionWrapper -using Test -using SoleLogics -using SoleModels -using SoleModels: AbstractModel -using SoleModels: ConstantModel, LeafModel -using SoleModels: listrules, displaymodel, submodels - io = IOBuffer() -# parse_other_kind_of_formula = SoleLogics.parsebaseformula parse_other_kind_of_formula = SoleLogics.parseformula ################################### LeafModel ############################################# diff --git a/test/parse.jl b/test/parse.jl index a4e905e..5d1a019 100644 --- a/test/parse.jl +++ b/test/parse.jl @@ -1,10 +1,3 @@ -using Test -using SoleModels -using SoleData -using SoleData: AbstractUnivariateFeature, Feature -using SoleData: ScalarCondition -using SoleData: feature - ############################################################################################ ############################ Orange parser ################################################# ############################################################################################ diff --git a/test/runtests.jl b/test/runtests.jl index 8d95df5..a9b6a96 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,8 +1,30 @@ -# using Revise -using SoleModels -using SoleLogics -using Test -using Random +using Distributed +addprocs(2) + +@everywhere begin + using SoleModels + using SoleModels: AbstractModel + using SoleModels: ConstantModel, LeafModel + using SoleModels: listrules, displaymodel, submodels + using SoleData + using SoleData: AbstractUnivariateFeature, Feature + using SoleData: ScalarCondition + using SoleData: feature + using SoleLogics + using CategoricalArrays + using Markdown + using MultiData + using InteractiveUtils + using MLJ + using MLJDecisionTreeInterface + import DecisionTree as DT + import MLJModelInterface as MMI + import XGBoost as XGB + using DataFrames + using Test + using Random + using FunctionWrappers: FunctionWrapper +end function run_tests(list) println("\n" * ("#"^50)) @@ -15,14 +37,14 @@ end println("Julia version: ", VERSION) test_suites = [ - ("Models", ["base.jl", ]), + ("Models", ["base.jl", "test_tree.jl"]), ("Miscellaneous", ["misc.jl", ]), ("Parse", ["parse.jl", ]), ("Rules", ["juliacon2024.jl", ]), ("Linear forms", 
["linear-form-utilities.jl", ]),
     ("Pluto Demo", ["$(dirname(dirname(pathof(SoleModels))))/pluto-demo.jl", ]),
-    ("DecisionTreeExt", ["DecisionTreeExt/tree.jl", "DecisionTreeExt/forest.jl"]),
-    ("XGBoostExt", ["XGBoostExt.jl"]),
+    ("DecisionTreeExt", ["DecisionTreeExt/tree.jl", "DecisionTreeExt/forest.jl", "DecisionTreeExt/adaboost.jl"]),
+    ("XGBoostExt", ["XGBoostExt/xgboost_classifier.jl"]),
 ]
 
 @testset "SoleModels.jl" begin
diff --git a/test/test_tree.jl b/test/test_tree.jl
new file mode 100644
index 0000000..38c8450
--- /dev/null
+++ b/test/test_tree.jl
@@ -0,0 +1,27 @@
+##################################################
+#                  p
+#        ┌─────────┴─────────┐
+#        q                   r
+#    ┌───┴───┐           ┌───┴───┐
+# "yes"    "no"          s     "yes"
+#                    ┌───┴───┐
+#                 "yes"    "no"
+##################################################
+
+formula_p = SoleLogics.parseformula("p")
+formula_q = SoleLogics.parseformula("q")
+formula_r = SoleLogics.parseformula("r")
+formula_s = SoleLogics.parseformula("s")
+
+branch_q = Branch(formula_q,("yes","no"),(;))
+branch_s = Branch(formula_s,("yes","no"),(;))
+branch_r = Branch(formula_r,(branch_s,"yes"),(;))
+
+#dt_q = DecisionTree(branch_r,(;))
+
+
+# Possible root-to-leaf paths
+path_all = [formula_p,formula_q,formula_s,formula_r,"yes"]
+path_2 = [formula_p,formula_q,"yes"]
+path_1 = [formula_p,"yes"]
+path_0 = ["yes"]
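+
+# Sketch of a possible root for the pictured tree (illustrative names
+# `branch_p`/`dt_p`; same constructor pattern as `branch_r` above and as the
+# commented-out `dt_q` wrapper):
+# branch_p = Branch(formula_p, (branch_q, branch_r), (;))
+# dt_p = DecisionTree(branch_p, (;))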