Hierarchical clustering based on PCs

Introduction

Hierarchical clustering based on principal components. Here, factor analysis of mixed data (FAMD) is applied to the cleaned Telco customer churn dataset, and hierarchical clustering (HCPC) is then performed on the resulting components to group the customers.

## Import libraries
library(FactoMineR)
library(factoextra)
library(plyr)
library(dplyr)
library(arulesCBA) 

## Import data
df <- read.csv("https://github.com/nchelaru/data-prep/raw/master/telco_cleaned_renamed.csv")

## Drop TotalCharges variable, as it is a product of MonthlyCharges and Tenure
df <- within(df, rm('TotalCharges'))

## Discretize "MonthlyCharges" with respect to "Churn"/"No Churn" label and assign to new column in dataframe
df$Binned_MonthlyCharges <- discretizeDF.supervised(Churn ~ ., df[, c('MonthlyCharges', 'Churn')], method='mdlp')$MonthlyCharges

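## (Optional check) The MDLP cut points are data-dependent, so inspect the
## generated level names before hardcoding the relabelling below
levels(df$Binned_MonthlyCharges)
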
## Rename the levels based on knowledge of min/max monthly charges
df$Binned_MonthlyCharges = revalue(df$Binned_MonthlyCharges, 
                                   c("[-Inf,29.4)"="$0-29.4", 
                                     "[29.4,56)"="$29.4-56", 
                                     "[56,68.8)"="$56-68.8", 
                                     "[68.8,107)"="$68.8-107", 
                                     "[107, Inf]" = "$107-118.75"))

## Discretize "Tenure" with respect to "Churn"/"No Churn" label and assign to new column in dataframe
df$Binned_Tenure <- discretizeDF.supervised(Churn ~ ., 
                                            df[, c('Tenure', 'Churn')], 
                                            method='mdlp')$Tenure

## Rename the levels based on knowledge of min/max tenures
df$Binned_Tenure = revalue(df$Binned_Tenure, 
                           c("[-Inf,1.5)"="1-1.5m", 
                             "[1.5,5.5)"="1.5-5.5m",
                             "[5.5,17.5)"="5.5-17.5m",
                             "[17.5,43.5)"="17.5-43.5m",
                             "[43.5,59.5)"="43.5-59.5m",
                             "[59.5,70.5)"="59.5-70.5m",
                             "[70.5, Inf]"="70.5-72m"))

## FAMD
res.famd <- FAMD(df, 
                 sup.var = c(19), 
                 graph = FALSE)
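
## (Optional check) Variance captured by the leading FAMD dimensions,
## on which the clustering below is performed
head(res.famd$eig)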

## Hierarchical clustering
res.hcpc <- HCPC(res.famd, nb.clust=4, graph = FALSE)

## Rename column
names(res.hcpc$data.clust)[names(res.hcpc$data.clust) == 'clust'] <- 'Cluster'
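
## Plot the dendrogram, with rectangles marking the four clusters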
plot(res.hcpc, axes=c(1,2), choice="tree", rect=TRUE, 
  draw.tree=TRUE, ind.names=FALSE, t.level="all", title=NULL,
  new.plot=FALSE, max.plot=15, tree.barplot=FALSE)
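
FactoMineR's HCPC result also includes built-in cluster descriptions that complement the visual comparison below; a minimal sketch of inspecting them (using the standard fields of the HCPC output):

## Categorical variables most associated with the clustering (chi-square tests)
res.hcpc$desc.var$test.chi2

## Categories over- or under-represented in each cluster
res.hcpc$desc.var$category

## Principal dimensions that best characterize each cluster
res.hcpc$desc.axes$quanti

## Paragon individuals (closest to each cluster center)
res.hcpc$desc.ind$para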

Comparison of clusters

Visualize

## Import libraries
library(autoEDA)
library(cowplot) 

## autoEDA 
autoEDA_results <- autoEDA(res.hcpc$data.clust, 
                           y = "Cluster", returnPlotList = TRUE,
                           outcomeType = "automatic", removeConstant = TRUE, 
                           removeZeroSpread = TRUE, removeMajorityMissing = TRUE, 
                           imputeMissing = TRUE, clipOutliers = FALSE, 
                           minLevelPercentage = 0.025, predictivePower = TRUE, 
                           outlierMethod = "tukey", lowPercentile = 0.01, 
                           upPercentile = 0.99, plotCategorical = "groupedBar", 
                           plotContinuous = "histogram", bins = 30, 
                           rotateLabels = TRUE, color = "#26A69A", 
                           verbose = FALSE) 

## Plot figures in a grid
p <- plot_grid(plotlist = autoEDA_results$plots, ncol = 3)

ggsave("/results/grid.svg", width=12, height=24)

Which variables are most predictive of cluster label?

"When the outcome feature is continuous of nature or is a regression problem, correlation calculations are performed. When the outcome feature is categorical of nature or is a classification problem, the Kolmogorov Smirnov distance measure is used to determine predictive power. For multi-class classification outcomes, a one vs all approach is taken which is then averaged to arrive at the mean KS distance measure. The predictive power is sensitive towards the manner in which the data has been prepared and will differ should the manner in which the data has been prepared changes." [Documentation]

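To make the quoted approach concrete, below is a minimal sketch of the one-vs-all Kolmogorov-Smirnov distance for a single continuous feature; the helper ks_one_vs_all is purely illustrative, and autoEDA's internal computation may differ in its details.

## KS distance of a continuous feature for each cluster vs. all others,
## averaged across clusters (one-vs-all, as described in the quote above)
ks_one_vs_all <- function(x, cluster) {
  sapply(levels(cluster), function(cl) {
    suppressWarnings(ks.test(x[cluster == cl], x[cluster != cl])$statistic)
  })
}

## Example: mean KS distance of MonthlyCharges across the clusters
mean(ks_one_vs_all(res.hcpc$data.clust$MonthlyCharges, res.hcpc$data.clust$Cluster))
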
res <- autoEDA_results$overview[autoEDA_results$overview$PredictivePower %in% c("High", "Medium"),]

res[, c('Feature', 'PredictivePowerPercentage', 'PredictivePower')]

Factor map

## Import libraries
library(FactoMineR, quietly=T)
library(factoextra, quietly=T)
library(plotly, quietly=T)

## MCA on the clustered data, to examine relationships between the levels of the categorical variables
res.mca <- MCA(res.hcpc$data.clust, 
               quanti.sup=c(5, 18), 
               quali.sup=c(19, 22), 
               graph = FALSE)
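
## (Optional check) Confirm which columns the positional indices above refer to
colnames(res.hcpc$data.clust)[c(5, 18, 19, 22)]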

p <- fviz_mca_var(res.mca, col.var = "cos2", 
                  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
                  labelsize = 4, repel=TRUE) +
             xlim(-2, 2) + ylim (-1.25, 1.25)

ggsave("./results/MCA_facto_map.svg", width=8, height=8)
## Plot individuals on the MCA factor map, colored by Churn
fviz_mca_ind(res.mca,
             habillage = "Churn", # color by groups
             palette = c("#00AFBB", "#FC4E07"),
             repel = TRUE, alpha.ind = 0.2, label = 'none') +
    xlim(-1.25, 1.25) + ylim(-1.25, 1.25)

## Plot individuals on the MCA factor map, colored by Cluster
fviz_mca_ind(res.mca,
             habillage = "Cluster", # color by groups
             palette = c("#00AFBB", "#FC4E07", "#FFA500", '#800080'),
             repel = TRUE, alpha.ind = 0.2, label = 'none') +
    xlim(-1.25, 1.25) + ylim(-1.25, 1.25)