R.B R.B - 1 year ago 32
R Question

split data frame faster

I create a function and i use the split function but it take a long time to get the result :

st=c(0 ,0, 9,39,44 ,100, 0, 0, 8,26 ,100, 0, 0, 6, 9,16,20,24,29,35,37,47,54,73 ,100, 0, 0, 6,35,44 ,100, 0, 0,10,16,27,40,51,91, 100, 0, 0,3, 7,28,69,71,75, 100, 0, 0,19 ,100, 0, 0, 7,24,29,35 ,100, 0, 0, 8,11,14,15,18,31,32,33,50,53,56,62,79,80,82,87,88,89, 100, 0, 0, 2,7,31,34,40 ,100, 0, 0,10,41,51,76 ,100, 0, 0, 4,32,41,46 ,100, 0, 0,19,26,59,76,83,88,92 ,100, 0, 0,11,27,51, 100, 0, 0, 5, 7,45,56,78,3 ,100, 0, 0, 3,12,23,46,53,72 ,100)

int=c(0.00,3.52 ,11.94,1.78 ,22.00,0.00,0.00,5.85 ,14.26 ,56.65,0.00,0.00,4.52,2.76,4.89,3.17,3.36,3.67,4.49,1.97,7.47,5.55, 14.79 ,20.78,0.00,0.00,4.51 ,20.71,6.60 ,40.08,0.00,0.00 ,11.28,7.30 , 12.14 ,14.01, 12.82 ,45.65,9.97,0.00,0.00,2.33,3.72 ,19.55, 37.61,1.72,3.56 ,23.05,0.00,0.00 ,13.51 ,57.64,0.00,0.00,4.74 ,11.42,3.51,4.38 ,43.83,0.00,0.00,5.66,2.35,1.62,1.09,2.05,8.76,0.63,1.05, 11.65,2.34,1.82,4.78, 11.41,1.10,1.52,3.41,0.61,1.01,7.41,0.00,0.00,2.09,3.72,21.57,2.69,5.65 ,53.43,0.00,0.00,3.77 ,12.05,3.85,9.88,9.13,0.00,0.00,3.32 ,20.97,6.61,3.47 ,40.62,0.00,0.00,3.26,1.27,5.71,2.94,1.13,0.89,0.78,1.31,0.00,0.00,4.91,7.03 ,10.14 ,21.36,0.00,0.00,4.16,2.22 ,33.84 ,10.72, 19.17 ,13.68,6.49,0.00,0.00,1.83,5.22,6.95, 13.92,4.04, 11.66 ,17.04,0.00)

id=c(1:length(st))

Attr=c("sta","a", "cr","a", "hf", "sp", "sta","hf", "cr",
"a", "sp", "sta","a", "ac","a", "hf" ,"cr","a",
"ac","a", "sl", "cr","a", "pq","sp", "sta","a",
"sl", "cr","hf" ,"sp", "sta","a", "cr","sl", "hf",
"a", "pq","hf", "sp", "sta","cr","a", "hf", "sl",
"cr","hf" ,"a", "sp", "sta","hf" ,"cr","sp", "sta",
"hc","cr","hf", "sl", "a", "sp", "sta","hf", "a",
"cr","hf" ,"a", "cr","hf", "a", "cr","hf", "a",
"hf", "cr","hf" ,"a", "cr","hf", "a", "cr","sp",
"sta","cr","a", "hf", "a", "cr","hf" ,"sp", "sta",
"sl", "a", "hf" ,"cr","a", "sp", "sta","a", "ac",
"sl", "hf" ,"cr","sp", "sta","hc","pv","a", "hf",
"a", "pv","hc","sl", "sp", "sta","hf", "a", "cr",
"sl", "sp", "sta","hf", "a", "cr","a", "a", "sl",
"a", "sp", "sta","cr","hf" ,"a", "sl", "cr","a","hf" ,"sp")
p=replicate(length(Attr),sample(1:3,1,replace=T))
data=cbind.data.frame(id,st,int,Attr,p)


si<-function(data,...){
ff<-list()
library(MASS)
library(Hmisc)
att<-function(data,...){
d=data
f=list()
z=list()

f=split(data, data$Attr,drop=T)
z=lapply(f,function(x){if(nrow(x)> 1){fitdistr(as.integer(x$int),"Negative Binomial")}})
z=z[!sapply(z, is.null)]
return(z)
}
data$p=as.factor(data$p)

datap=list()
d=list()
s=list()
for(i in 1:3){
datap[[i]]=data[data$p==i,]
d[[i]]=subset(datap[[i]],int != 0)
}

s=lapply(d,att)

return(s)
}


and I have to use this function 4000 time :

system.time(a<-replicate(4000,si(data)))
utilisateur système écoulé
110.02 1.01 111.33


So my question is if there is any other alternative to split data faster and speed up the function execution time

Answer Source

Your function was too complicated, I simplified it a bit but it didn't gain much time. Like others have said, the largest share of time spent goes to fitdistr, then densfun, then .Call. And those are all in the call to fitdistr, so cannot be optimized. (I used the profiling code by Federico Manigrasso.) First of all I put the librarycalls at the beginning of the code, not inside the function. I also changed the way you create the data.frame.

library(MASS)
library(Hmisc)

data <- data.frame(id,st,int,Attr,p) 

si2 <- function(data,...){
    att<-function(data,...){
      z=lapply(split(data, data$Attr,drop=T), function(x){
          if(nrow(x)> 1) fitdistr(as.integer(x$int),"Negative Binomial")
      })
      z[!sapply(z, is.null)]
    }

    inx <- data$int != 0
    lapply(lapply(1:3, function(i) data[data$p==i & inx,]), att)
}

system.time(a<-replicate(4000,si(data)))
   user  system elapsed 
  89.40    0.00   89.73 
system.time(b<-replicate(4000,si2(data)))
   user  system elapsed 
  84.21    0.03   84.33 
identical(a, b)
[1] TRUE
Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download