Principal component methods / Aug 19 2019
Hierarchical clustering based on PCs
Introduction
Hierarchical clustering based on principal components
## Import libraries
library(FactoMineR)
library(factoextra)
library(plyr)
library(dplyr)
library(arulesCBA)

## Import data
df <- read.csv("https://github.com/nchelaru/data-prep/raw/master/telco_cleaned_renamed.csv")

## Drop the TotalCharges variable, as it is a product of MonthlyCharges and Tenure
df <- within(df, rm('TotalCharges'))

## Discretize "MonthlyCharges" with respect to the "Churn"/"No Churn" label
## and assign to a new column in the dataframe
df$Binned_MonthlyCharges <- discretizeDF.supervised(Churn ~ .,
                                                    df[, c('MonthlyCharges', 'Churn')],
                                                    method = 'mdlp')$MonthlyCharges

## Rename the levels based on knowledge of min/max monthly charges
df$Binned_MonthlyCharges <- revalue(df$Binned_MonthlyCharges,
                                    c("[-Inf,29.4)" = "$0-29.4",
                                      "[29.4,56)"   = "$29.4-56",
                                      "[56,68.8)"   = "$56-68.8",
                                      "[68.8,107)"  = "$68.8-107",
                                      "[107, Inf]"  = "$107-118.75"))

## Discretize "Tenure" with respect to the "Churn"/"No Churn" label
## and assign to a new column in the dataframe
df$Binned_Tenure <- discretizeDF.supervised(Churn ~ .,
                                            df[, c('Tenure', 'Churn')],
                                            method = 'mdlp')$Tenure

## Rename the levels based on knowledge of min/max tenures
df$Binned_Tenure <- revalue(df$Binned_Tenure,
                            c("[-Inf,1.5)"  = "1-1.5m",
                              "[1.5,5.5)"   = "1.5-5.5m",
                              "[5.5,17.5)"  = "5.5-17.5m",
                              "[17.5,43.5)" = "17.5-43.5m",
                              "[43.5,59.5)" = "43.5-59.5m",
                              "[59.5,70.5)" = "59.5-70.5m",
                              "[70.5, Inf]" = "70.5-72m"))

## Factor analysis of mixed data (FAMD), with column 19 as a supplementary variable
res.famd <- FAMD(df, sup.var = c(19), graph = FALSE)

## Hierarchical clustering on the FAMD results, cut into 4 clusters
res.hcpc <- HCPC(res.famd, nb.clust = 4, graph = FALSE)

## Rename the cluster assignment column
names(res.hcpc$data.clust)[names(res.hcpc$data.clust) == 'clust'] <- 'Cluster'
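The number of clusters is fixed at four above. As a quick sanity check, a minimal sketch (reusing the res.famd object; nb.clust = -1 asks HCPC to cut the tree at its suggested level) can compare that choice against the automatic suggestion and the variance captured by each FAMD dimension:

## Let HCPC suggest the cut level instead of fixing it
res.auto <- HCPC(res.famd, nb.clust = -1, graph = FALSE)

## Size of each suggested cluster
table(res.auto$data.clust$clust)

## Variance explained by each FAMD dimension
get_eigenvalue(res.famd)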
plot(res.hcpc, axes = c(1, 2), choice = "tree", rect = TRUE, draw.tree = TRUE,
     ind.names = FALSE, t.level = "all", title = NULL, new.plot = FALSE,
     max.plot = 15, tree.barplot = FALSE)
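The dendrogram above shows where the tree is cut into four clusters. Assuming the same res.hcpc object, factoextra can also project the resulting clusters onto the first two principal dimensions (a sketch, not part of the original analysis):

## Show the clusters on the first two principal dimensions
fviz_cluster(res.hcpc, repel = TRUE, show.clust.cent = TRUE,
             main = "Clusters on the factor map")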
Comparison of clusters
Visualize
## Import libraries
library(autoEDA)
library(cowplot)

## Automated exploratory analysis of every variable, split by cluster
autoEDA_results <- autoEDA(res.hcpc$data.clust, y = "Cluster",
                           returnPlotList = TRUE, outcomeType = "automatic",
                           removeConstant = TRUE, removeZeroSpread = TRUE,
                           removeMajorityMissing = TRUE, imputeMissing = TRUE,
                           clipOutliers = FALSE, minLevelPercentage = 0.025,
                           predictivePower = TRUE, outlierMethod = "tukey",
                           lowPercentile = 0.01, upPercentile = 0.99,
                           plotCategorical = "groupedBar", plotContinuous = "histogram",
                           bins = 30, rotateLabels = TRUE, color = "#26A69A",
                           verbose = FALSE)

## Plot the figures in a grid and save
p <- plot_grid(plotlist = autoEDA_results$plots, ncol = 3)

ggsave("/results/grid.svg", plot = p, width = 12, height = 24)
Which variables are most predictive of cluster label?
"When the outcome feature is continuous of nature or is a regression problem, correlation calculations are performed. When the outcome feature is categorical of nature or is a classification problem, the Kolmogorov Smirnov distance measure is used to determine predictive power. For multi-class classification outcomes, a one vs all approach is taken which is then averaged to arrive at the mean KS distance measure. The predictive power is sensitive towards the manner in which the data has been prepared and will differ should the manner in which the data has been prepared changes." [Documentation]
## Keep only the features rated "High" or "Medium" in predictive power
res <- autoEDA_results$overview[autoEDA_results$overview$PredictivePower %in% c("High", "Medium"), ]

res[, c('Feature', 'PredictivePowerPercentage', 'PredictivePower')]
(Output: an empty table; no variable reaches "High" or "Medium" predictive power here.)
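Since the filter comes back empty here, the full ranking can still be inspected from the same overview data frame:

## Predictive power of all retained features, regardless of rating
autoEDA_results$overview[, c('Feature', 'PredictivePowerPercentage', 'PredictivePower')]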
Factor map
## Import libraries
library(FactoMineR, quietly = T)
library(factoextra, quietly = T)
library(plotly, quietly = T)

## Plot the relationships between the levels of the categorical variables obtained from MCA
res.mca <- MCA(res.hcpc$data.clust, quanti.sup = c(5, 18),
               quali.sup = c(19, 22), graph = FALSE)

p <- fviz_mca_var(res.mca, col.var = "cos2",
                  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
                  labelsize = 4, repel = TRUE) +
  xlim(-2, 2) + ylim(-1.25, 1.25)

ggsave("/results/MCA_facto_map.svg", plot = p, width = 8, height = 8)
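The cos2 coloring above indicates how well each category is represented on the first two dimensions; the same quantity is often easier to read as a bar chart (a quick sketch reusing res.mca):

## Quality of representation (cos2) of the variable categories on dimensions 1-2
fviz_cos2(res.mca, choice = "var", axes = 1:2, top = 20)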
## Plot the individuals on the MCA factor map, colored by churn status
fviz_mca_ind(res.mca,
             habillage = "Churn",  # color by groups
             palette = c("#00AFBB", "#FC4E07"),
             repel = TRUE, alpha.ind = 0.2, label = 'none') +
  xlim(-1.25, 1.25) + ylim(-1.25, 1.25)
## Plot the individuals on the MCA factor map, colored by cluster assignment
fviz_mca_ind(res.mca,
             habillage = "Cluster",  # color by groups
             palette = c("#00AFBB", "#FC4E07", "#FFA500", '#800080'),
             repel = TRUE, alpha.ind = 0.2, label = 'none') +
  xlim(-1.25, 1.25) + ylim(-1.25, 1.25)
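Beyond the visual comparison, HCPC itself reports which categories characterize each cluster. A minimal sketch of the built-in description output (same res.hcpc object; component names follow FactoMineR's documented HCPC value):

## Categories significantly over- or under-represented in each cluster
res.hcpc$desc.var$category

## Principal dimensions that best separate the clusters
res.hcpc$desc.axes$quanti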