1
1
# ' Tree Segment Function
2
2
# '
3
- # ' Runs decision tree optimisation on the data to segment customers .
3
+ # ' Runs decision tree optimisation on the data to segment ids .
4
4
# ' @param data data.frame, the data to segment
5
5
# ' @param hyperparameters list, list of hyperparameters to pass. They include
6
6
# ' segmentation_variables: a vector or list with variable names that will be used as segmentation variables;
7
7
# ' dependent_variable: a string with the name of the dependent variable that is used in the clustering;
8
8
# ' min_segmentation_fraction: integer, the minimum segment size as a proportion of the total data set;
9
- # ' number_of_personas : integer, number of leaves you want the decision tree to have.
9
+ # ' number_of_segments : integer, number of leaves you want the decision tree to have.
10
10
# ' @importFrom dplyr mutate_all left_join select %>%
11
11
# ' @importFrom treeClust rpart.predict.leaves
12
12
# ' @importFrom rpart.plot rpart.plot
16
16
tree_segment <- function (data , hyperparameters , verbose = TRUE ){
17
17
18
18
if (is.null(hyperparameters $ segmentation_variables )){
19
- segmentation_variables <- colnames(data )[colnames(data )!= hyperparameters $ dependent_variable & colnames(data )!= ' customerid ' ]
19
+ segmentation_variables <- colnames(data )[colnames(data )!= hyperparameters $ dependent_variable & colnames(data )!= ' id ' ]
20
20
}else {
21
21
segmentation_variables <- hyperparameters $ segmentation_variables
22
22
}
23
23
inputs_params <- list (segmentation_variables = segmentation_variables ,
24
24
dependent_variable = hyperparameters $ dependent_variable ,
25
25
min_segmentation_fraction = hyperparameters $ min_segmentation_fraction ,
26
- number_of_personas = hyperparameters $ number_of_personas )
26
+ number_of_segments = hyperparameters $ number_of_segments )
27
27
28
28
int_colnames <- names(data )[unname(sapply(data , typeof )) == ' integer' ]
29
29
@@ -42,22 +42,22 @@ tree_segment <- function(data, hyperparameters, verbose = TRUE){
42
42
segmentation_variables = segmentation_variables ,
43
43
dependent_variable = hyperparameters $ dependent_variable ,
44
44
min_segmentation_fraction = hyperparameters $ min_segmentation_fraction ,
45
- number_of_leafs = hyperparameters $ number_of_personas )
45
+ number_of_leafs = hyperparameters $ number_of_segments )
46
46
47
47
if (nrow(first_tree $ frame )== 1 ){print(' Only 1 segment. Change parameters or inputs!' )}else {
48
- persona_table <- tree_table.make(first_tree , int_colnames )
49
- persona_tree <- persona_tree .make(first_tree )
50
- persona_tree_df <- persona_tree $ df
51
- persona_tree <- persona_tree $ tree
52
- persona_predicted <- data.frame (customerid = data $ customerid , orig_row = as.numeric(rpart.predict.leaves(persona_tree , data , type = " where" )))
53
- persona_predicted <- left_join(persona_predicted , persona_tree_df %> % select(.data $ orig_row ,.data $ persona ), by = " orig_row" ) %> % select(.data $ customerid , .data $ persona )
48
+ segment_table <- tree_table.make(first_tree , int_colnames )
49
+ segment_tree <- segment_tree .make(first_tree )
50
+ segment_tree_df <- segment_tree $ df
51
+ segment_tree <- segment_tree $ tree
52
+ segment_predicted <- data.frame (id = data $ id , orig_row = as.numeric(rpart.predict.leaves(segment_tree , data , type = " where" )))
53
+ segment_predicted <- left_join(segment_predicted , segment_tree_df %> % select(.data $ orig_row ,.data $ segment ), by = " orig_row" ) %> % select(.data $ id , .data $ segment )
54
54
55
- if (hyperparameters $ print_plot & (hyperparameters $ number_of_personas < hyperparameters $ print_safety_check )){rpart.plot(first_tree )}
55
+ if (hyperparameters $ print_plot & (hyperparameters $ number_of_segments < hyperparameters $ print_safety_check )){rpart.plot(first_tree )}
56
56
57
57
return (
58
- list (persona_model = persona_tree ,
59
- persona_table = persona_table ,
60
- persona_predicted = persona_predicted ,
58
+ list (segment_model = segment_tree ,
59
+ segment_table = segment_table ,
60
+ segment_predicted = segment_predicted ,
61
61
model_inputs = inputs_params )
62
62
)
63
63
}
@@ -76,7 +76,7 @@ decision_tree_user_defined_leafs.make <- function(df,segmentation_variables,depe
76
76
tree <- rpart(f ,data = df ,method = ' anova' ,control = control )
77
77
78
78
if (nrow(tree $ frame %> % filter(.data $ var == ' <leaf>' ))< number_of_leafs ){
79
- print(' WARNING: Output number of personas is less than than the requested amount. Reduce the minimum segmentation fraction, increase the number of segmentation variables, get more data etc.' )
79
+ print(' WARNING: Output number of segments is less than than the requested amount. Reduce the minimum segmentation fraction, increase the number of segmentation variables, get more data etc.' )
80
80
pruned_tree <- tree
81
81
} else {
82
82
cp_adjusted_tree <- tree
@@ -120,7 +120,7 @@ tree_table.make <- function(tree, integer_columns){
120
120
df1 <- rownames_to_column(tree $ frame ) %> % arrange(as.numeric(.data $ rowname )) %> %
121
121
bind_cols(tibble(rules = unlist(rpart.rules(tree ))) %> % filter(nchar(.data $ rules )> 0 )) %> %
122
122
filter(.data $ var == ' <leaf>' ) %> %
123
- transmute(persona = row_number(),n ,.data $ yval ,.data $ rules )
123
+ transmute(segment = row_number(),n ,.data $ yval ,.data $ rules )
124
124
var_names <- tree $ frame %> % filter(.data $ var != ' <leaf>' ) %> % select(.data $ var ) %> % unique()
125
125
df2 <- df1 %> % bind_cols(as.data.frame(matrix (data = NA ,nrow = nrow(df1 ),ncol = nrow(var_names ),dimnames = list (c(),var_names $ var ))))
126
126
@@ -156,15 +156,15 @@ tree_table.make <- function(tree, integer_columns){
156
156
}
157
157
}
158
158
159
- df3 <- df2 %> % mutate(percentage = n / sum(n )* 100 ) %> % select(.data $ persona ,.data $ yval ,.data $ percentage ,everything()) %> %
159
+ df3 <- df2 %> % mutate(percentage = n / sum(n )* 100 ) %> % select(.data $ segment ,.data $ yval ,.data $ percentage ,everything()) %> %
160
160
rename(mean_value = .data $ yval )%> % select(- .data $ rules )
161
161
df3 [,5 : ncol(df3 )][is.na( df3 [,5 : ncol(df3 )])] <- ' All'
162
162
163
163
# Ensures that the conditions for integer columns in the table remain formatted as integers.
164
164
# Without this step, a condition for an integer column could be, e.g., > 1.5.
165
165
# With this step, this condition gets changed to >= 2.
166
166
167
- # Select the columns in the persona table that are integers in the raw DF
167
+ # Select the columns in the segment table that are integers in the raw DF
168
168
169
169
if (sum(names(df3 ) %in% integer_columns ) == 1 ) {
170
170
df_to_change <- data.frame (df3 [, names(df3 ) %in% integer_columns ], stringsAsFactors = FALSE )
@@ -209,13 +209,13 @@ tree_table.make <- function(tree, integer_columns){
209
209
# ' @importFrom dplyr mutate row_number arrange bind_cols filter transmute %>%
210
210
# ' @importFrom rpart.utils rpart.rules
211
211
# ' @importFrom rlang .data
212
- persona_tree .make <- function (tree ){
212
+ segment_tree .make <- function (tree ){
213
213
214
214
df1 <- rownames_to_column(tree $ frame ) %> % mutate(orig_row = row_number()) %> % arrange(as.numeric(.data $ rowname )) %> %
215
215
bind_cols(tibble(rules = unlist(rpart.rules(tree ))) %> % filter(nchar(.data $ rules )> 0 )) %> %
216
216
filter(.data $ var == ' <leaf>' ) %> %
217
- transmute(persona = row_number(),n ,.data $ yval ,.data $ rules ,.data $ orig_row ) %> % arrange(.data $ orig_row )
218
- tree $ frame $ yval [tree $ frame $ var == ' <leaf>' ] <- df1 $ persona
217
+ transmute(segment = row_number(),n ,.data $ yval ,.data $ rules ,.data $ orig_row ) %> % arrange(.data $ orig_row )
218
+ tree $ frame $ yval [tree $ frame $ var == ' <leaf>' ] <- df1 $ segment
219
219
return (list (tree = tree ,
220
220
df = df1 ))
221
221
}
@@ -408,11 +408,11 @@ rpart.plot_pretty <- function(model,main="",sub,caption,palettes,type=2,fontfami
408
408
# ' @export
409
409
tree_segment_prettify <- function (tree , char_length = 20 , print_plot = F ){
410
410
411
- if (print_plot ){rpart.plot_pretty(tree $ persona_model )}
411
+ if (print_plot ){rpart.plot_pretty(tree $ segment_model )}
412
412
413
- features_used <- names(tree $ persona_table )
414
- features_used <- features_used [! features_used %in% c(" persona " ," mean_value" ," percentage" ," n" )]
415
- split_data <- tree $ persona_table %> % select(features_used )
413
+ features_used <- names(tree $ segment_table )
414
+ features_used <- features_used [! features_used %in% c(" segment " ," mean_value" ," percentage" ," n" )]
415
+ split_data <- tree $ segment_table %> % select(features_used )
416
416
417
417
character_check <- function (x ){
418
418
words <- unique(x )
@@ -436,7 +436,7 @@ tree_segment_prettify <- function(tree, char_length = 20, print_plot = F){
436
436
split_data [,col_number ] <- sapply(split_data [,col_number ],dynamic_binning )
437
437
}
438
438
439
- tree $ persona_table [,features_used ] <- split_data
439
+ tree $ segment_table [,features_used ] <- split_data
440
440
441
441
return (tree )
442
442
}
@@ -451,10 +451,10 @@ tree_abstract <- function(model, inputdata){
451
451
# TODO: add performance statistics
452
452
# tree_performance()
453
453
structure(
454
- list (persona_model = model $ persona_model ,
454
+ list (segment_model = model $ segment_model ,
455
455
model_hyperparameters = model $ model_inputs ,
456
- persona_table = model $ persona_table ,
457
- predicted_values = model $ persona_predicted ,
456
+ segment_table = model $ segment_table ,
457
+ predicted_values = model $ segment_predicted ,
458
458
input_data = inputdata ),
459
459
460
460
class = " tree_model" )
0 commit comments