Skip to content

Commit 41faff1

Browse files
Merge pull request #43 from rebekahyates-peak/id_rename
Renaming customerid to id and persona to segment
2 parents 59f73be + 92361f2 commit 41faff1

14 files changed

+109
-111
lines changed

R/model_management.R

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ model_management <- function(model,hyperparameters){
1717
directory_path <- paste0('~/segmentationoutputs/',format(Sys.time(),format = '%Y-%m-%d-%H-%M-%S'))
1818
dir.create(directory_path)
1919
#Save model
20-
persona_model <- model$persona_model
21-
save(persona_model,
22-
file=paste0(directory_path,'/persona_model.RData'), ascii=TRUE)
20+
segment_model <- model$segment_model
21+
save(segment_model,
22+
file=paste0(directory_path,'/segment_model.RData'), ascii=TRUE)
2323
#Save hyperparameters
2424
model_hyperparameters <- model$model_hyperparameters
2525
save(model_hyperparameters,
@@ -46,10 +46,10 @@ model_management <- function(model,hyperparameters){
4646
}
4747
}
4848
#Bespoke management layers - if(class(model) == 'abc'){...}
49-
#TODO: Save persona_table?
49+
#TODO: Save segment_table?
5050
# if(class(model) == 'abc'){
51-
# save(model$persona_table,
52-
# file=paste0(directory_path,'/persona_table.RData'), ascii=TRUE)
51+
# save(model$segment_table,
52+
# file=paste0(directory_path,'/segment_table.RData'), ascii=TRUE)
5353
# }
5454
if(class(model) == 'k-clusters'){
5555
outliers <- model$outliers_table
@@ -62,7 +62,7 @@ model_management <- function(model,hyperparameters){
6262
#Save rpart.plot
6363
if(class(model) == 'tree_model'){
6464
pdf(paste0(directory_path,'/tree.pdf'))
65-
rpart.plot_pretty(persona_model)
65+
rpart.plot_pretty(segment_model)
6666
dev.off()
6767
}
6868
}

R/output_table.R

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,29 +2,29 @@
22
#'
33
#' Generates the output table for model and data
44
#' @param data A dataframe generated from the pre-processing step
5-
#' @param model A model object used to classify customers with, generated from the model selection layer
5+
#' @param model A model object used to classify ids with, generated from the model selection layer
66
#' @importFrom dplyr left_join select mutate group_by summarise summarise_each funs
77
#' @importFrom rlang .data
88
#' @export
99
output_table <- function(data, model) {
1010
#TODO: Add summary stats for the predictors
11-
output <- data.frame(segment = model$predicted_values$persona,
12-
customerid = as.character(model$predicted_values$customerid),
11+
output <- data.frame(segment = model$predicted_values$segment,
12+
id = as.character(model$predicted_values$id),
1313
stringsAsFactors = FALSE)
1414
if(!is.null(model$model_hyperparameters$dependent_variable)) {
1515
response <- model$model_hyperparameters$dependent_variable
1616
} else {
1717
response <- "response"
1818
}
1919

20-
df <- left_join(data, output, by = 'customerid')
20+
df <- left_join(data, output, by = 'id')
2121

2222

2323
segmentation_vars <- model$model_hyperparameters$segmentation_variables
2424

2525
if(is.null(segmentation_vars)){
2626
allcolumnnames <- colnames(df)
27-
segmentation_vars <- allcolumnnames[!allcolumnnames %in% c('customerid', response , 'segment')]
27+
segmentation_vars <- allcolumnnames[!allcolumnnames %in% c('id', response , 'segment')]
2828
}
2929

3030
df_agg <- df %>% select(c('segment',model$model_hyperparameters$segmentation_variables))

R/plotting.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77

88
citrus_pair_plot <- function(model,vars = NULL) {
99

10-
segments <- model$predicted_values$persona
10+
segments <- model$predicted_values$segment
1111
data <- model$input_data
12-
data <- data[ , -which(names(data) == "customerid")]
12+
data <- data[ , -which(names(data) == "id")]
1313
if(!is.null(vars)){
1414
data <- data[,vars]
1515
}

R/preprocess.R

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#' Preprocess Function
22
#'
3-
#' Transforms a transactional table into a customer aggregated table with custom options for aggregation methods for numeric and categorical columns.
3+
#' Transforms a transactional table into an id aggregated table with custom options for aggregation methods for numeric and categorical columns.
44
#' @param df data.frame, the data to preprocess
5-
#' @param samplesize numeric, the fraction of customers used to create a sub-sample of the input df
5+
#' @param samplesize numeric, the fraction of ids used to create a sub-sample of the input df
66
#' @param numeric_operation_list list, a list of the aggregation functions to apply to numeric columns
77
#' @param categories list, a list of the categorical columns to aggregate
88
#' @param target character, the column to use as a response variable for supervised learning
@@ -20,14 +20,14 @@ preprocess <- function(df,
2020

2121
# Warning: Rename data
2222
print('Please ensure columns are renamed accordingly:')
23-
print('Customer Identifier: customerid')
23+
print('Unique Identifier: id')
2424
print('Transaction Identifier: transactionid')
2525
print('Transaction Date: orderdate')
2626
print('Value Column: transactionvalue')
2727
print(paste0('Target column: ', target, ' (', target_agg, ')'))
2828

2929
# Column name check
30-
need_to_have <- c('customerid', 'transactionid', 'orderdate', 'transactionvalue')
30+
need_to_have <- c('id', 'transactionid', 'orderdate', 'transactionvalue')
3131
if (!all(need_to_have %in% names(df))) {
3232
stop('Missing need to haves')
3333
}
@@ -51,13 +51,13 @@ preprocess <- function(df,
5151

5252
# Standard column formatting
5353
df$orderdate <- as.Date(df$orderdate)
54-
df$customerid <- as.character(df$customerid)
54+
df$id <- as.character(df$id)
5555
df$transactionvalue <- as.numeric(df$transactionvalue)
5656

5757
# RFM aggregations
5858
latest_date <- max(df$orderdate)
5959
final_df <- df %>%
60-
group_by(.data$customerid) %>%
60+
group_by(.data$id) %>%
6161
summarise(recency = as.integer(latest_date - max(.data$orderdate, na.rm = TRUE)),
6262
frequency = n_distinct(.data$transactionid),
6363
monetary = sum(.data$transactionvalue, na.rm = TRUE)) %>%
@@ -72,45 +72,43 @@ preprocess <- function(df,
7272
if(!is.na(target)) {
7373
numeric_df <- df %>%
7474
select(-target) %>%
75-
group_by(.data$customerid) %>%
75+
group_by(.data$id) %>%
7676
summarise_if(is.numeric, function_vector) %>%
7777
ungroup()
7878
} else {
7979
numeric_df <- df %>%
80-
group_by(.data$customerid) %>%
80+
group_by(.data$id) %>%
8181
summarise_if(is.numeric, function_vector) %>%
8282
ungroup()
8383
}
84-
8584

8685
if (is.na(target)) {
87-
evaluated_columns <- names(df)[sapply(df, is.numeric) & names(df) != 'customerid']
86+
evaluated_columns <- names(df)[sapply(df, is.numeric) & names(df) != 'id']
8887
} else {
89-
evaluated_columns <- names(df)[sapply(df, is.numeric) & names(df) != 'customerid' & names(df) != target]
88+
evaluated_columns <- names(df)[sapply(df, is.numeric) & names(df) != 'id' & names(df) != target]
9089
}
9190

9291

9392
if (length(evaluated_columns) == 1) {
94-
adjusted_name <- paste0(evaluated_columns, '_', names(numeric_df)[!(names(numeric_df) %in% c('customerid', target))])
95-
names(numeric_df) <- c('customerid', adjusted_name)
93+
adjusted_name <- paste0(evaluated_columns, '_', names(numeric_df)[!(names(numeric_df) %in% c('id', target))])
94+
names(numeric_df) <- c('id', adjusted_name)
9695
}
9796

9897
# Filters categorical columns and grabs the top n category for each
9998
# categorical column
100-
final_df <- inner_join(final_df, numeric_df, by = 'customerid')
99+
final_df <- inner_join(final_df, numeric_df, by = 'id')
101100
}
102101

103102

104103
if (!is.null(categories)) {
105104
for (col_name in categories) {
106-
107105
if(!is.na(target)) {
108106
temp_df <- df %>%
109107
select(-target) %>%
110-
group_by(.data$customerid, !!as.symbol(col_name)) %>%
108+
group_by(.data$id, !!as.symbol(col_name)) %>%
111109
summarise(n = n()) %>%
112110
ungroup() %>%
113-
group_by(.data$customerid) %>%
111+
group_by(.data$id) %>%
114112
arrange(desc(n)) %>%
115113
filter(row_number() == 1) %>%
116114
ungroup() %>%
@@ -120,18 +118,18 @@ preprocess <- function(df,
120118

121119
} else {
122120
temp_df <- df %>%
123-
group_by(.data$customerid, !!as.symbol(col_name)) %>%
121+
group_by(.data$id, !!as.symbol(col_name)) %>%
124122
summarise(n = n()) %>%
125123
ungroup() %>%
126-
group_by(.data$customerid) %>%
124+
group_by(.data$id) %>%
127125
arrange(desc(n)) %>%
128126
filter(row_number() == 1) %>%
129127
ungroup() %>%
130128
select(-n)
131129
var <- paste0('top_', col_name)
132130
temp_df[var] <- temp_df[col_name]
133131
}
134-
final_df <- inner_join(final_df, temp_df, by = 'customerid')
132+
final_df <- inner_join(final_df, temp_df, by = 'id')
135133
}
136134

137135
final_df <- select(final_df, -categories)
@@ -141,11 +139,11 @@ preprocess <- function(df,
141139
if (!is.na(target)) {
142140
if(verbose == TRUE) {message('Calculating target values')}
143141
target_df <- df %>%
144-
group_by(.data$customerid) %>%
142+
group_by(.data$id) %>%
145143
summarise(response = get(target_agg)(!!as.symbol(target), na.rm = TRUE)) %>%
146144
ungroup()
147145

148-
final_df <- left_join(final_df, target_df, by = 'customerid')
146+
final_df <- left_join(final_df, target_df, by = 'id')
149147
}
150148

151149
return(final_df)

R/segment.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ segment <- function(data,
5353
# Default hyperparameters
5454
default_hyperparameters = list(dependent_variable = 'response',
5555
min_segmentation_fraction = 0.05,
56-
number_of_personas = 6,
56+
number_of_segments = 6,
5757
print_plot = ifelse(prettify == FALSE, print_plot, FALSE),
5858
print_safety_check=20)
5959
if(is.null(hyperparameters)){
@@ -67,7 +67,7 @@ segment <- function(data,
6767

6868
if(verbose == TRUE) {message('Training model')}
6969
model = tree_segment(data, hyperparameters, verbose = verbose)
70-
if(verbose == TRUE) {message('Number of segments: ', paste0(max(model$persona_table$persona, '\n')))}
70+
if(verbose == TRUE) {message('Number of segments: ', paste0(max(model$segment_table$segment, '\n')))}
7171

7272
# Prettify layer
7373
if(prettify == T){

R/tree_segment.R

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
#' Tree Segment Function
22
#'
3-
#' Runs decision tree optimisation on the data to segment customers.
3+
#' Runs decision tree optimisation on the data to segment ids.
44
#' @param data data.frame, the data to segment
55
#' @param hyperparameters list, list of hyperparameters to pass. They include
66
#' segmentation_variables: a vector or list with variable names that will be used as segmentation variables;
77
#' dependent_variable: a string with the name of the dependent variable that is used in the clustering;
88
#' min_segmentation_fraction: integer, the minimum segment size as a proportion of the total data set;
9-
#' number_of_personas: integer, number of leaves you want the decision tree to have.
9+
#' number_of_segments: integer, number of leaves you want the decision tree to have.
1010
#' @importFrom dplyr mutate_all left_join select %>%
1111
#' @importFrom treeClust rpart.predict.leaves
1212
#' @importFrom rpart.plot rpart.plot
@@ -16,14 +16,14 @@
1616
tree_segment <- function(data, hyperparameters, verbose = TRUE){
1717

1818
if(is.null(hyperparameters$segmentation_variables)){
19-
segmentation_variables <- colnames(data)[colnames(data)!= hyperparameters$dependent_variable & colnames(data)!='customerid']
19+
segmentation_variables <- colnames(data)[colnames(data)!= hyperparameters$dependent_variable & colnames(data)!='id']
2020
}else{
2121
segmentation_variables <- hyperparameters$segmentation_variables
2222
}
2323
inputs_params <- list(segmentation_variables=segmentation_variables,
2424
dependent_variable=hyperparameters$dependent_variable,
2525
min_segmentation_fraction=hyperparameters$min_segmentation_fraction,
26-
number_of_personas=hyperparameters$number_of_personas)
26+
number_of_segments=hyperparameters$number_of_segments)
2727

2828
int_colnames <- names(data)[unname(sapply(data, typeof)) == 'integer']
2929

@@ -42,22 +42,22 @@ tree_segment <- function(data, hyperparameters, verbose = TRUE){
4242
segmentation_variables=segmentation_variables,
4343
dependent_variable=hyperparameters$dependent_variable,
4444
min_segmentation_fraction=hyperparameters$min_segmentation_fraction,
45-
number_of_leafs=hyperparameters$number_of_personas)
45+
number_of_leafs=hyperparameters$number_of_segments)
4646

4747
if(nrow(first_tree$frame)==1){print('Only 1 segment. Change parameters or inputs!')}else{
48-
persona_table <- tree_table.make(first_tree, int_colnames)
49-
persona_tree <- persona_tree.make(first_tree)
50-
persona_tree_df <- persona_tree$df
51-
persona_tree <- persona_tree$tree
52-
persona_predicted <- data.frame(customerid = data$customerid, orig_row=as.numeric(rpart.predict.leaves(persona_tree, data, type = "where")))
53-
persona_predicted <- left_join(persona_predicted,persona_tree_df %>% select(.data$orig_row,.data$persona), by = "orig_row") %>% select(.data$customerid, .data$persona)
48+
segment_table <- tree_table.make(first_tree, int_colnames)
49+
segment_tree <- segment_tree.make(first_tree)
50+
segment_tree_df <- segment_tree$df
51+
segment_tree <- segment_tree$tree
52+
segment_predicted <- data.frame(id = data$id, orig_row=as.numeric(rpart.predict.leaves(segment_tree, data, type = "where")))
53+
segment_predicted <- left_join(segment_predicted,segment_tree_df %>% select(.data$orig_row,.data$segment), by = "orig_row") %>% select(.data$id, .data$segment)
5454

55-
if(hyperparameters$print_plot&(hyperparameters$number_of_personas<hyperparameters$print_safety_check)){rpart.plot(first_tree)}
55+
if(hyperparameters$print_plot&(hyperparameters$number_of_segments<hyperparameters$print_safety_check)){rpart.plot(first_tree)}
5656

5757
return(
58-
list(persona_model = persona_tree,
59-
persona_table = persona_table,
60-
persona_predicted = persona_predicted,
58+
list(segment_model = segment_tree,
59+
segment_table = segment_table,
60+
segment_predicted = segment_predicted,
6161
model_inputs = inputs_params)
6262
)
6363
}
@@ -76,7 +76,7 @@ decision_tree_user_defined_leafs.make <- function(df,segmentation_variables,depe
7676
tree <- rpart(f,data=df,method='anova',control = control)
7777

7878
if(nrow(tree$frame %>% filter(.data$var=='<leaf>'))<number_of_leafs){
79-
print('WARNING: Output number of personas is less than than the requested amount. Reduce the minimum segmentation fraction, increase the number of segmentation variables, get more data etc.')
79+
print('WARNING: Output number of segments is less than than the requested amount. Reduce the minimum segmentation fraction, increase the number of segmentation variables, get more data etc.')
8080
pruned_tree <- tree
8181
} else{
8282
cp_adjusted_tree <- tree
@@ -120,7 +120,7 @@ tree_table.make <- function(tree, integer_columns){
120120
df1 <- rownames_to_column(tree$frame) %>% arrange(as.numeric(.data$rowname)) %>%
121121
bind_cols(tibble(rules=unlist(rpart.rules(tree))) %>% filter(nchar(.data$rules)>0)) %>%
122122
filter(.data$var=='<leaf>') %>%
123-
transmute(persona=row_number(),n,.data$yval,.data$rules)
123+
transmute(segment=row_number(),n,.data$yval,.data$rules)
124124
var_names <- tree$frame %>% filter(.data$var!='<leaf>') %>% select(.data$var) %>% unique()
125125
df2 <- df1 %>% bind_cols(as.data.frame(matrix(data=NA,nrow = nrow(df1),ncol = nrow(var_names),dimnames = list(c(),var_names$var))))
126126

@@ -156,15 +156,15 @@ tree_table.make <- function(tree, integer_columns){
156156
}
157157
}
158158

159-
df3 <- df2 %>% mutate(percentage=n/sum(n)*100) %>% select(.data$persona,.data$yval,.data$percentage,everything()) %>%
159+
df3 <- df2 %>% mutate(percentage=n/sum(n)*100) %>% select(.data$segment,.data$yval,.data$percentage,everything()) %>%
160160
rename(mean_value=.data$yval)%>% select(-.data$rules)
161161
df3[,5:ncol(df3)][is.na( df3[,5:ncol(df3)])] <- 'All'
162162

163163
# Ensures that the conditions for integer columns in the table remain formatted as integers.
164164
# Without this step, a condition for an integer column could be, e.g., > 1.5.
165165
# With this step, this condition gets changed to >= 2.
166166

167-
# Select the columns in the persona table that are integers in the raw DF
167+
# Select the columns in the segment table that are integers in the raw DF
168168

169169
if (sum(names(df3) %in% integer_columns) == 1) {
170170
df_to_change <- data.frame(df3[, names(df3) %in% integer_columns], stringsAsFactors = FALSE)
@@ -209,13 +209,13 @@ tree_table.make <- function(tree, integer_columns){
209209
#' @importFrom dplyr mutate row_number arrange bind_cols filter transmute %>%
210210
#' @importFrom rpart.utils rpart.rules
211211
#' @importFrom rlang .data
212-
persona_tree.make <- function(tree){
212+
segment_tree.make <- function(tree){
213213

214214
df1 <- rownames_to_column(tree$frame) %>% mutate(orig_row=row_number()) %>% arrange(as.numeric(.data$rowname)) %>%
215215
bind_cols(tibble(rules=unlist(rpart.rules(tree))) %>% filter(nchar(.data$rules)>0)) %>%
216216
filter(.data$var=='<leaf>') %>%
217-
transmute(persona=row_number(),n,.data$yval,.data$rules,.data$orig_row) %>% arrange(.data$orig_row)
218-
tree$frame$yval[tree$frame$var=='<leaf>'] <- df1$persona
217+
transmute(segment=row_number(),n,.data$yval,.data$rules,.data$orig_row) %>% arrange(.data$orig_row)
218+
tree$frame$yval[tree$frame$var=='<leaf>'] <- df1$segment
219219
return(list(tree = tree,
220220
df = df1))
221221
}
@@ -408,11 +408,11 @@ rpart.plot_pretty <- function(model,main="",sub,caption,palettes,type=2,fontfami
408408
#' @export
409409
tree_segment_prettify <- function(tree, char_length = 20, print_plot = F){
410410

411-
if(print_plot){rpart.plot_pretty(tree$persona_model)}
411+
if(print_plot){rpart.plot_pretty(tree$segment_model)}
412412

413-
features_used <- names(tree$persona_table)
414-
features_used <- features_used[!features_used %in% c("persona","mean_value","percentage","n")]
415-
split_data <- tree$persona_table %>% select(features_used)
413+
features_used <- names(tree$segment_table)
414+
features_used <- features_used[!features_used %in% c("segment","mean_value","percentage","n")]
415+
split_data <- tree$segment_table %>% select(features_used)
416416

417417
character_check <- function(x){
418418
words <- unique(x)
@@ -436,7 +436,7 @@ tree_segment_prettify <- function(tree, char_length = 20, print_plot = F){
436436
split_data[,col_number] <- sapply(split_data[,col_number],dynamic_binning)
437437
}
438438

439-
tree$persona_table[,features_used] <- split_data
439+
tree$segment_table[,features_used] <- split_data
440440

441441
return(tree)
442442
}
@@ -451,10 +451,10 @@ tree_abstract <- function(model, inputdata){
451451
#TODO: add performance statistics
452452
#tree_performance()
453453
structure(
454-
list(persona_model = model$persona_model,
454+
list(segment_model = model$segment_model,
455455
model_hyperparameters = model$model_inputs,
456-
persona_table = model$persona_table,
457-
predicted_values = model$persona_predicted,
456+
segment_table = model$segment_table,
457+
predicted_values = model$segment_predicted,
458458
input_data = inputdata),
459459

460460
class = "tree_model")

0 commit comments

Comments
 (0)