@@ -95,11 +95,15 @@
 # %%
 # Accuracy of the Model
 # ---------------------
-# Prior to inspecting the feature importances, it is important to check that
-# the model predictive performance is high enough. Indeed there would be little
-# interest of inspecting the important features of a non-predictive model.
-#
-# Here one can observe that the train accuracy is very high (the forest model
+# Before inspecting the feature importances, it is important to check that
+# the model predictive performance is high enough. Indeed, there would be little
+# interest in inspecting the important features of a non-predictive model.
+
+print(f"RF train accuracy: {rf.score(X_train, y_train):.3f}")
+print(f"RF test accuracy: {rf.score(X_test, y_test):.3f}")
+
+# %%
+# Here, one can observe that the train accuracy is very high (the forest model
 # has enough capacity to completely memorize the training set) but it can still
 # generalize well enough to the test set thanks to the built-in bagging of
 # random forests.
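The two `print` calls added in this hunk assume a fitted classifier `rf` and a train/test split (`X_train`, `X_test`, `y_train`, `y_test`) that are defined earlier in the example, outside this diff. As a rough, self-contained stand-in on synthetic data (not the example's actual dataset or preprocessing pipeline), the sanity check might look like:

    # Hypothetical stand-in for the example's setup: synthetic data instead of
    # the real dataset, and a plain RandomForestClassifier instead of the
    # example's full preprocessing pipeline.
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=1_000, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

    # A large gap between these two numbers would signal overfitting.
    print(f"RF train accuracy: {rf.score(X_train, y_train):.3f}")
    print(f"RF test accuracy: {rf.score(X_test, y_test):.3f}")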
@@ -110,12 +114,9 @@
 # ``min_samples_leaf=10``) so as to limit overfitting while not introducing too
 # much underfitting.
 #
-# However let's keep our high capacity random forest model for now so as to
-# illustrate some pitfalls with feature importance on variables with many
+# However, let us keep our high capacity random forest model for now so that we can
+# illustrate some pitfalls about feature importance on variables with many
 # unique values.
-print(f"RF train accuracy: {rf.score(X_train, y_train):.3f}")
-print(f"RF test accuracy: {rf.score(X_test, y_test):.3f}")
-
 
 # %%
 # Tree's Feature Importance from Mean Decrease in Impurity (MDI)
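The capacity-limited alternative mentioned in this hunk (``min_samples_leaf=10``) is not shown in the diff itself. A minimal sketch on the same kind of synthetic stand-in data (the names and values here are illustrative, not taken from the example) could be:

    # Illustrative only: a forest whose leaves must contain at least 10 samples
    # overfits less, usually at the cost of a slightly lower train accuracy.
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=1_000, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    limited_rf = RandomForestClassifier(min_samples_leaf=10, random_state=42)
    limited_rf.fit(X_train, y_train)
    print(f"Limited RF train accuracy: {limited_rf.score(X_train, y_train):.3f}")
    print(f"Limited RF test accuracy: {limited_rf.score(X_test, y_test):.3f}")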
@@ -135,7 +136,7 @@
 #
 # The bias towards high cardinality features explains why the `random_num` has
 # a really large importance in comparison with `random_cat` while we would
-# expect both random features to have a null importance.
+# expect that both random features have a null importance.
 #
 # The fact that we use training set statistics explains why both the
 # `random_num` and `random_cat` features have a non-null importance.
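MDI importances are read directly from the fitted forest through scikit-learn's `feature_importances_` attribute. A small self-contained illustration, using synthetic data and generic feature names rather than the example's `random_num`/`random_cat` setup:

    # MDI importances come from the fitted trees themselves (impurity decrease
    # accumulated during training), so they reflect training-set statistics only.
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=500, n_features=5, random_state=0)
    forest = RandomForestClassifier(random_state=0).fit(X, y)

    mdi_importances = pd.Series(
        forest.feature_importances_,
        index=[f"feature_{i}" for i in range(X.shape[1])],
    ).sort_values(ascending=False)
    print(mdi_importances)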
@@ -155,11 +156,11 @@
 # %%
 # As an alternative, the permutation importances of ``rf`` are computed on a
 # held out test set. This shows that the low cardinality categorical feature,
-# `sex` and `pclass` are the most important feature. Indeed, permuting the
-# values of these features will lead to most decrease in accuracy score of the
+# `sex` and `pclass` are the most important features. Indeed, permuting the
+# values of these features will lead to the most decrease in accuracy score of the
 # model on the test set.
 #
-# Also note that both random features have very low importances (close to 0) as
+# Also, note that both random features have very low importances (close to 0) as
 # expected.
 from sklearn.inspection import permutation_importance
 
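This hunk ends just before the `permutation_importance` call itself. As a hedged, self-contained sketch of how that computation typically looks on a held-out test set (the synthetic data, `n_repeats`, and `random_state` values are illustrative, not the example's actual ones):

    # Permutation importance: shuffle one feature at a time on the test set and
    # measure how much the model's score drops.
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.inspection import permutation_importance
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=1_000, n_features=8, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    model = RandomForestClassifier(random_state=0).fit(X_train, y_train)

    result = permutation_importance(
        model, X_test, y_test, n_repeats=10, random_state=0, n_jobs=2
    )
    for i in result.importances_mean.argsort()[::-1]:
        print(
            f"feature_{i}: "
            f"{result.importances_mean[i]:.3f} +/- {result.importances_std[i]:.3f}"
        )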