peak-ai
diff --git a/R/model_management.R b/R/model_management.R
Lines changed: 7 additions & 7 deletions
diff --git a/R/output_table.R b/R/output_table.R
Lines changed: 5 additions & 5 deletions
diff --git a/R/plotting.R b/R/plotting.R
Lines changed: 2 additions & 2 deletions
diff --git a/R/preprocess.R b/R/preprocess.R
Lines changed: 20 additions & 22 deletions
diff --git a/R/segment.R b/R/segment.R
Lines changed: 2 additions & 2 deletions
diff --git a/R/tree_segment.R b/R/tree_segment.R
Lines changed: 30 additions & 30 deletions
@@ -17,9 +17,9 @@ model_management <- function(model,hyperparameters){
         directory_path <- paste0('~/segmentationoutputs/',format(Sys.time(),format = '%Y-%m-%d-%H-%M-%S'))
         dir.create(directory_path)
         #Save model
-        persona_model <- model$persona_model
-        save(persona_model,
-             file=paste0(directory_path,'/persona_model.RData'), ascii=TRUE)
+        segment_model <- model$segment_model
+        save(segment_model,
+             file=paste0(directory_path,'/segment_model.RData'), ascii=TRUE)
         #Save hyperparameters
         model_hyperparameters <- model$model_hyperparameters
         save(model_hyperparameters,
@@ -46,10 +46,10 @@ model_management <- function(model,hyperparameters){
           }
         }
         #Bespoke management layers - if(class(model) == 'abc'){...}
-        #TODO: Save persona_table?
+        #TODO: Save segment_table?
         # if(class(model) == 'abc'){
-        #   save(model$persona_table,
-        #        file=paste0(directory_path,'/persona_table.RData'), ascii=TRUE)
+        #   save(model$segment_table,
+        #        file=paste0(directory_path,'/segment_table.RData'), ascii=TRUE)
         # }
         if(class(model) == 'k-clusters'){
           outliers <- model$outliers_table
@@ -62,7 +62,7 @@ model_management <- function(model,hyperparameters){
         #Save rpart.plot
         if(class(model) == 'tree_model'){
            pdf(paste0(directory_path,'/tree.pdf'))
-           rpart.plot_pretty(persona_model)
+           rpart.plot_pretty(segment_model)
            dev.off()
         }
       }
 
@@ -2,29 +2,29 @@
 #'
 #' Generates the output table for model and data
 #' @param data A dataframe generated from the pre-processing step
-#' @param model A model object used to classify customers with, generated from the model selection layer
+#' @param model A model object used to classify ids, generated from the model selection layer
 #' @importFrom dplyr left_join select mutate group_by summarise summarise_each funs
 #' @importFrom rlang .data
 #' @export
 output_table <- function(data, model) {
   #TODO: Add summary stats for the predictors
-  output <- data.frame(segment = model$predicted_values$persona,
-                       customerid = as.character(model$predicted_values$customerid), 
+  output <- data.frame(segment = model$predicted_values$segment,
+                       id = as.character(model$predicted_values$id), 
                        stringsAsFactors = FALSE)
   if(!is.null(model$model_hyperparameters$dependent_variable)) {
     response <- model$model_hyperparameters$dependent_variable
   } else {
     response <- "response"
   }
 
-  df <- left_join(data, output, by = 'customerid')
+  df <- left_join(data, output, by = 'id')
 
 
   segmentation_vars <- model$model_hyperparameters$segmentation_variables
 
   if(is.null(segmentation_vars)){
     allcolumnnames <- colnames(df)
-    segmentation_vars <- allcolumnnames[!allcolumnnames %in% c('customerid', response , 'segment')]  
+    segmentation_vars <- allcolumnnames[!allcolumnnames %in% c('id', response , 'segment')]  
   }
 
   df_agg <- df %>% select(c('segment',model$model_hyperparameters$segmentation_variables)) 
 
@@ -7,9 +7,9 @@
 
 citrus_pair_plot <- function(model,vars = NULL) {
 
-  segments <- model$predicted_values$persona
+  segments <- model$predicted_values$segment
   data <- model$input_data
-  data <- data[ , -which(names(data) == "customerid")]
+  data <- data[ , -which(names(data) == "id")]
   if(!is.null(vars)){
     data <- data[,vars]
   }
 
@@ -1,8 +1,8 @@
 #' Preprocess Function
 #'
-#' Transforms a transactional table into a customer aggregated table with custom options for aggregation methods for numeric and categorical columns.
+#' Transforms a transactional table into an id-aggregated table with custom options for aggregation methods for numeric and categorical columns.
 #' @param df data.frame, the data to preprocess
-#' @param samplesize numeric, the fraction of customers used to create a sub-sample of the input df
+#' @param samplesize numeric, the fraction of ids used to create a sub-sample of the input df
 #' @param numeric_operation_list list, a list of the aggregation functions to apply to numeric columns
 #' @param categories list, a list of the categorical columns to aggregate
 #' @param target character, the column to use as a response variable for supervised learning
@@ -20,14 +20,14 @@ preprocess <- function(df,
 
   # Warning: Rename data
   print('Please ensure columns are renamed accordingly:')
-  print('Customer Identifier: customerid')
+  print('Unique Identifier: id')
   print('Transaction Identifier: transactionid')
   print('Transaction Date: orderdate')
   print('Value Column: transactionvalue')
   print(paste0('Target column: ', target, ' (', target_agg, ')'))
 
   # Column name check
-  need_to_have <- c('customerid', 'transactionid', 'orderdate', 'transactionvalue')
+  need_to_have <- c('id', 'transactionid', 'orderdate', 'transactionvalue')
   if (!all(need_to_have %in% names(df))) {
     stop('Missing need to haves')
   }
@@ -51,13 +51,13 @@ preprocess <- function(df,
 
   # Standard column formatting
   df$orderdate <- as.Date(df$orderdate)
-  df$customerid <- as.character(df$customerid)
+  df$id <- as.character(df$id)
   df$transactionvalue <- as.numeric(df$transactionvalue)
 
   # RFM aggregations
   latest_date <- max(df$orderdate)
   final_df <- df %>%
-    group_by(.data$customerid) %>%
+    group_by(.data$id) %>%
     summarise(recency = as.integer(latest_date - max(.data$orderdate, na.rm = TRUE)),
               frequency = n_distinct(.data$transactionid),
               monetary = sum(.data$transactionvalue, na.rm = TRUE)) %>%
@@ -72,45 +72,43 @@ preprocess <- function(df,
     if(!is.na(target)) {
       numeric_df <- df %>% 
         select(-target) %>%
-        group_by(.data$customerid) %>% 
+        group_by(.data$id) %>% 
         summarise_if(is.numeric, function_vector) %>% 
         ungroup()
     } else {
       numeric_df <- df %>% 
-        group_by(.data$customerid) %>% 
+        group_by(.data$id) %>% 
         summarise_if(is.numeric, function_vector) %>% 
         ungroup()
     }
-    
 
     if (is.na(target)) {
-      evaluated_columns <- names(df)[sapply(df, is.numeric) & names(df) != 'customerid']
+      evaluated_columns <- names(df)[sapply(df, is.numeric) & names(df) != 'id']
     } else {
-      evaluated_columns <- names(df)[sapply(df, is.numeric) & names(df) != 'customerid' & names(df) != target]
+      evaluated_columns <- names(df)[sapply(df, is.numeric) & names(df) != 'id' & names(df) != target]
     }
 
 
     if (length(evaluated_columns) == 1) {
-      adjusted_name <- paste0(evaluated_columns, '_', names(numeric_df)[!(names(numeric_df) %in% c('customerid', target))])
-      names(numeric_df) <- c('customerid', adjusted_name)
+      adjusted_name <- paste0(evaluated_columns, '_', names(numeric_df)[!(names(numeric_df) %in% c('id', target))])
+      names(numeric_df) <- c('id', adjusted_name)
     }
 
     # Filters categorical columns and grabs the top n category for each
     # categorical column
-    final_df <- inner_join(final_df, numeric_df, by = 'customerid')
+    final_df <- inner_join(final_df, numeric_df, by = 'id')
   }
 
 
   if (!is.null(categories)) {
     for (col_name in categories) {
-      
       if(!is.na(target)) {
         temp_df <- df %>%
           select(-target) %>%
-          group_by(.data$customerid, !!as.symbol(col_name)) %>%
+          group_by(.data$id, !!as.symbol(col_name)) %>%
           summarise(n = n()) %>%
           ungroup() %>%
-          group_by(.data$customerid) %>%
+          group_by(.data$id) %>%
           arrange(desc(n)) %>%
           filter(row_number() == 1) %>%
           ungroup() %>%
@@ -120,18 +118,18 @@ preprocess <- function(df,
 
       } else {
         temp_df <- df %>%
-          group_by(.data$customerid, !!as.symbol(col_name)) %>%
+          group_by(.data$id, !!as.symbol(col_name)) %>%
           summarise(n = n()) %>%
           ungroup() %>%
-          group_by(.data$customerid) %>%
+          group_by(.data$id) %>%
           arrange(desc(n)) %>%
           filter(row_number() == 1) %>%
           ungroup() %>%
           select(-n)
         var <- paste0('top_', col_name)
         temp_df[var] <- temp_df[col_name]
       }      
-      final_df <- inner_join(final_df, temp_df, by = 'customerid')
+      final_df <- inner_join(final_df, temp_df, by = 'id')
     }
 
     final_df <- select(final_df, -categories)
@@ -141,11 +139,11 @@ preprocess <- function(df,
   if (!is.na(target)) {
     if(verbose == TRUE) {message('Calculating target values')}
     target_df <- df %>%
-      group_by(.data$customerid) %>%
+      group_by(.data$id) %>%
       summarise(response = get(target_agg)(!!as.symbol(target), na.rm = TRUE)) %>%
       ungroup()
 
-    final_df <- left_join(final_df, target_df, by = 'customerid')
+    final_df <- left_join(final_df, target_df, by = 'id')
   }
 
   return(final_df)
 
@@ -53,7 +53,7 @@ segment <- function(data,
         # Default hyperparameters
         default_hyperparameters = list(dependent_variable = 'response',
                                        min_segmentation_fraction = 0.05,
-                                       number_of_personas = 6,
+                                       number_of_segments = 6,
                                        print_plot = ifelse(prettify == FALSE, print_plot, FALSE),
                                        print_safety_check=20)
         if(is.null(hyperparameters)){
@@ -67,7 +67,7 @@ segment <- function(data,
 
         if(verbose == TRUE) {message('Training model')}
         model = tree_segment(data, hyperparameters, verbose = verbose)
-        if(verbose == TRUE) {message('Number of segments: ', paste0(max(model$persona_table$persona, '\n')))}
+        if(verbose == TRUE) {message('Number of segments: ', paste0(max(model$segment_table$segment, '\n')))}
 
         # Prettify layer
         if(prettify == T){
 
@@ -1,12 +1,12 @@
 #' Tree Segment Function
 #'
-#' Runs decision tree optimisation on the data to segment customers.
+#' Runs decision tree optimisation on the data to segment ids.
 #' @param data data.frame, the data to segment
 #' @param hyperparameters list, list of hyperparameters to pass. They include
 #' segmentation_variables: a vector or list with variable names that will be used as segmentation variables; 
 #' dependent_variable: a string with the name of the dependent variable that is used in the clustering;
 #' min_segmentation_fraction: integer, the minimum segment size as a proportion of the total data set;
-#' number_of_personas: integer, number of leaves you want the decision tree to have.
+#' number_of_segments: integer, number of leaves you want the decision tree to have.
 #' @importFrom dplyr mutate_all left_join select %>%
 #' @importFrom treeClust rpart.predict.leaves 
 #' @importFrom rpart.plot rpart.plot
@@ -16,14 +16,14 @@
 tree_segment <- function(data, hyperparameters, verbose = TRUE){
 
   if(is.null(hyperparameters$segmentation_variables)){
-    segmentation_variables <- colnames(data)[colnames(data)!= hyperparameters$dependent_variable & colnames(data)!='customerid']
+    segmentation_variables <- colnames(data)[colnames(data)!= hyperparameters$dependent_variable & colnames(data)!='id']
   }else{
     segmentation_variables <- hyperparameters$segmentation_variables
   }
   inputs_params <- list(segmentation_variables=segmentation_variables,
                         dependent_variable=hyperparameters$dependent_variable,
                         min_segmentation_fraction=hyperparameters$min_segmentation_fraction,
-                        number_of_personas=hyperparameters$number_of_personas)
+                        number_of_segments=hyperparameters$number_of_segments)
 
   int_colnames <- names(data)[unname(sapply(data, typeof)) == 'integer']
 
@@ -42,22 +42,22 @@ tree_segment <- function(data, hyperparameters, verbose = TRUE){
                                                        segmentation_variables=segmentation_variables,
                                                        dependent_variable=hyperparameters$dependent_variable,
                                                        min_segmentation_fraction=hyperparameters$min_segmentation_fraction,
-                                                       number_of_leafs=hyperparameters$number_of_personas)
+                                                       number_of_leafs=hyperparameters$number_of_segments)
 
   if(nrow(first_tree$frame)==1){print('Only 1 segment. Change parameters or inputs!')}else{
-    persona_table <- tree_table.make(first_tree, int_colnames)
-    persona_tree  <- persona_tree.make(first_tree)
-    persona_tree_df <- persona_tree$df
-    persona_tree <- persona_tree$tree
-    persona_predicted <- data.frame(customerid = data$customerid, orig_row=as.numeric(rpart.predict.leaves(persona_tree, data, type = "where")))
-    persona_predicted <- left_join(persona_predicted,persona_tree_df %>% select(.data$orig_row,.data$persona), by = "orig_row") %>% select(.data$customerid, .data$persona)
+    segment_table <- tree_table.make(first_tree, int_colnames)
+    segment_tree  <- segment_tree.make(first_tree)
+    segment_tree_df <- segment_tree$df
+    segment_tree <- segment_tree$tree
+    segment_predicted <- data.frame(id = data$id, orig_row=as.numeric(rpart.predict.leaves(segment_tree, data, type = "where")))
+    segment_predicted <- left_join(segment_predicted,segment_tree_df %>% select(.data$orig_row,.data$segment), by = "orig_row") %>% select(.data$id, .data$segment)
 
-    if(hyperparameters$print_plot&(hyperparameters$number_of_personas<hyperparameters$print_safety_check)){rpart.plot(first_tree)}
+    if(hyperparameters$print_plot&(hyperparameters$number_of_segments<hyperparameters$print_safety_check)){rpart.plot(first_tree)}
 
     return(
-      list(persona_model = persona_tree,
-           persona_table = persona_table,
-           persona_predicted = persona_predicted,
+      list(segment_model = segment_tree,
+           segment_table = segment_table,
+           segment_predicted = segment_predicted,
            model_inputs = inputs_params)
     )
   }
@@ -76,7 +76,7 @@ decision_tree_user_defined_leafs.make <- function(df,segmentation_variables,depe
   tree <- rpart(f,data=df,method='anova',control = control)
 
   if(nrow(tree$frame %>% filter(.data$var=='<leaf>'))<number_of_leafs){
-    print('WARNING: Output number of personas is less than than the requested amount. Reduce the minimum segmentation fraction, increase the number of segmentation variables, get more data etc.')
+    print('WARNING: Output number of segments is less than than the requested amount. Reduce the minimum segmentation fraction, increase the number of segmentation variables, get more data etc.')
     pruned_tree <- tree
   } else{
     cp_adjusted_tree <- tree
@@ -120,7 +120,7 @@ tree_table.make <- function(tree, integer_columns){
   df1 <- rownames_to_column(tree$frame) %>% arrange(as.numeric(.data$rowname)) %>%
     bind_cols(tibble(rules=unlist(rpart.rules(tree))) %>% filter(nchar(.data$rules)>0)) %>%
     filter(.data$var=='<leaf>') %>%
-    transmute(persona=row_number(),n,.data$yval,.data$rules)
+    transmute(segment=row_number(),n,.data$yval,.data$rules)
   var_names <- tree$frame %>% filter(.data$var!='<leaf>') %>% select(.data$var) %>% unique()
   df2 <- df1 %>% bind_cols(as.data.frame(matrix(data=NA,nrow = nrow(df1),ncol = nrow(var_names),dimnames = list(c(),var_names$var))))
 
@@ -156,15 +156,15 @@ tree_table.make <- function(tree, integer_columns){
       }
     }
 
-    df3 <- df2 %>% mutate(percentage=n/sum(n)*100) %>% select(.data$persona,.data$yval,.data$percentage,everything()) %>%
+    df3 <- df2 %>% mutate(percentage=n/sum(n)*100) %>% select(.data$segment,.data$yval,.data$percentage,everything()) %>%
       rename(mean_value=.data$yval)%>% select(-.data$rules)
     df3[,5:ncol(df3)][is.na( df3[,5:ncol(df3)])] <- 'All'
 
     # Ensures that the conditions for integer columns in the table remain formatted as integers.
     # Without this step, a condition for an integer column could be, e.g., > 1.5.
     # With this step, this condition gets changed to >= 2.
 
-    # Select the columns in the persona table that are integers in the raw DF
+    # Select the columns in the segment table that are integers in the raw DF
 
     if (sum(names(df3) %in% integer_columns) == 1) {
       df_to_change <- data.frame(df3[, names(df3) %in% integer_columns], stringsAsFactors = FALSE)
@@ -209,13 +209,13 @@ tree_table.make <- function(tree, integer_columns){
 #' @importFrom dplyr mutate row_number arrange bind_cols filter transmute %>%
 #' @importFrom rpart.utils rpart.rules
 #' @importFrom rlang .data
-persona_tree.make <- function(tree){
+segment_tree.make <- function(tree){
 
   df1 <- rownames_to_column(tree$frame) %>% mutate(orig_row=row_number()) %>% arrange(as.numeric(.data$rowname)) %>%
     bind_cols(tibble(rules=unlist(rpart.rules(tree))) %>% filter(nchar(.data$rules)>0)) %>%
     filter(.data$var=='<leaf>') %>%
-    transmute(persona=row_number(),n,.data$yval,.data$rules,.data$orig_row) %>% arrange(.data$orig_row)
-  tree$frame$yval[tree$frame$var=='<leaf>'] <- df1$persona
+    transmute(segment=row_number(),n,.data$yval,.data$rules,.data$orig_row) %>% arrange(.data$orig_row)
+  tree$frame$yval[tree$frame$var=='<leaf>'] <- df1$segment
   return(list(tree = tree,
               df = df1))
 }
@@ -408,11 +408,11 @@ rpart.plot_pretty <- function(model,main="",sub,caption,palettes,type=2,fontfami
 #' @export
 tree_segment_prettify <- function(tree, char_length = 20, print_plot = F){
 
-  if(print_plot){rpart.plot_pretty(tree$persona_model)}
+  if(print_plot){rpart.plot_pretty(tree$segment_model)}
 
-  features_used <- names(tree$persona_table)
-  features_used <- features_used[!features_used %in% c("persona","mean_value","percentage","n")]
-  split_data <- tree$persona_table %>% select(features_used)
+  features_used <- names(tree$segment_table)
+  features_used <- features_used[!features_used %in% c("segment","mean_value","percentage","n")]
+  split_data <- tree$segment_table %>% select(features_used)
 
   character_check <- function(x){
     words <- unique(x)
@@ -436,7 +436,7 @@ tree_segment_prettify <- function(tree, char_length = 20, print_plot = F){
     split_data[,col_number] <- sapply(split_data[,col_number],dynamic_binning)
   }
 
-  tree$persona_table[,features_used] <- split_data
+  tree$segment_table[,features_used] <- split_data
 
   return(tree)
 }
@@ -451,10 +451,10 @@ tree_abstract <- function(model, inputdata){
   #TODO: add performance statistics
   #tree_performance()
   structure(
-    list(persona_model = model$persona_model,
+    list(segment_model = model$segment_model,
          model_hyperparameters = model$model_inputs,
-         persona_table = model$persona_table,
-         predicted_values = model$persona_predicted,
+         segment_table = model$segment_table,
+         predicted_values = model$segment_predicted,
          input_data = inputdata),
 
     class = "tree_model")