user4797853 user4797853 - 2 months ago 12
R Question

Sum correlated variables

I have a list of 200 variables and I want to sum those that are highly correlated.

Assuming this is my data

# Example data: 16 observations of five integer-valued variables.
mydata <- data.frame(
  APPLE = c(
    1L, 2L, 5L, 4L, 366L, 65L, 43L, 456L,
    876L, 78L, 687L, 378L, 378L, 34L, 53L, 43L
  ),
  PEAR = c(
    2L, 2L, 5L, 4L, 366L, 65L, 43L, 456L,
    876L, 78L, 687L, 378L, 378L, 34L, 53L, 41L
  ),
  PLUM = c(
    10L, 20L, 10L, 20L, 10L, 20L, 1L, 0L,
    1L, 2010L, 20L, 10L, 10L, 10L, 10L, 10L
  ),
  BANANA = c(
    2L, 10L, 31L, 2L, 2L, 5L, 2L, 5L,
    1L, 52L, 1L, 2L, 52L, 6L, 2L, 1L
  ),
  LEMON = c(
    4L, 10L, 31L, 2L, 2L, 5L, 2L, 5L,
    1L, 52L, 1L, 2L, 52L, 6L, 2L, 3L
  )
)


I found the code below (from the answer linked here), but I am not sure how to adapt it for my purpose:
http://stackoverflow.com/a/39484353/4797853

library(igraph)

# Pairwise Pearson correlations between all columns of mydata.
var.corelation <- cor(as.matrix(mydata), method = "pearson")

# Zero out the diagonal and the upper triangle so that each pair of variables
# is considered only once (prevents duplicated pairs and self-correlations).
var.corelation <- var.corelation * lower.tri(var.corelation)

# (row, col) index pairs of variables whose correlation exceeds the threshold.
cor.threshold <- 0.62
check.corelation <- which(var.corelation > cor.threshold, arr.ind = TRUE)

# Treat every highly correlated pair as an edge; each connected component of
# the resulting graph is one group of linked variables. Variables that take
# part in no such pair are not vertices and therefore end up in no group.
# graph_from_data_frame() / components() replace the deprecated
# graph.data.frame() / clusters().
graph.cor <- graph_from_data_frame(check.corelation, directed = FALSE)

# Vertex names are the column indices stored as character. Reading them from
# the graph itself guarantees they line up with the membership vector.
groups.cor <- split(
  as.integer(V(graph.cor)$name),
  components(graph.cor)$membership
)

# Show the variable names that make up each group.
lapply(groups.cor, FUN = function(list.cor) rownames(var.corelation)[list.cor])


The output that I am looking for is 2 data frames, as follows:

DF1

GROUP1 GROUP2
3 16
4 40
ETC..


Each value is the row-wise sum of the variables within that group.

DF2

ORIGINAL_VAR GROUP

APPLE 1
PEAR 1
PLUM 2
BANANA 2
LEMON 2

Answer

Try this (assuming that you have only clustered into 2 groups):

# Row-wise sum of the variables in each correlated group, one column per group
# (GROUP1, GROUP2, ...). Works for any number of groups, not just two.
group.sums <- lapply(groups.cor, function(member.idx) {
  # drop = FALSE keeps a single-column selection as a data frame, so
  # rowSums() also works for a group that has only one member.
  rowSums(mydata[, member.idx, drop = FALSE])
})
names(group.sums) <- paste0("GROUP", seq_along(group.sums))
DF1 <- as.data.frame(group.sums)
DF1

   GROUP1 GROUP2
1       3     16
2       4     40
3      10     72
4       8     24
5     732     14
6     130     30
7      86      5
8     912     10
9    1752      3
10    156   2114
11   1374     22
12    756     14
13    756    114
14     68     22
15    106     14
16     84     14

# Lookup table mapping every grouped variable to the number of its group.
# Built in one vectorised step for however many groups were found, instead of
# growing the data frame with rbind() inside a loop hard-coded to two groups.
member.idx <- unlist(groups.cor, use.names = FALSE)
DF2 <- data.frame(
  ORIGINAL_VAR = rownames(var.corelation)[member.idx],
  # repeat each group number once per member of that group
  GROUP = rep(seq_along(groups.cor), times = lengths(groups.cor))
)

DF2

  ORIGINAL_VAR GROUP
1         PEAR     1
2        APPLE     1
3       BANANA     2
4        LEMON     2
5         PLUM     2