Ajay Ohri Ajay Ohri - 5 months ago 21
R Question

How to replace missing data in R with median data based on a condition

I have data from https://drive.google.com/file/d/0B9YMMvghK2ytSXI4RFo0clNLc28/view

basically a diamonds dataset of ~600,000 rows

It has missing values for one column price. I want to replace NA values with median prices of that particular color

summary(BigDiamonds)
## X1 carat cut color
## Min. : 1 Min. :0.200 Length:598024 Length:598024
## 1st Qu.:149507 1st Qu.:0.500 Class :character Class :character
## Median :299013 Median :0.900 Mode :character Mode :character
## Mean :299013 Mean :1.071
## 3rd Qu.:448518 3rd Qu.:1.500
## Max. :598024 Max. :9.250
##
## clarity table depth cert
## Length:598024 Min. : 0.00 Min. : 0.00 Length:598024
## Class :character 1st Qu.:56.00 1st Qu.:61.00 Class :character
## Mode :character Median :58.00 Median :62.10 Mode :character
## Mean :57.63 Mean :61.06
## 3rd Qu.:59.00 3rd Qu.:62.70
## Max. :75.90 Max. :81.30
##
## measurements price x y
## Length:598024 Min. : 300 Min. : 0.150 Min. : 1.000
## Class :character 1st Qu.: 1220 1st Qu.: 4.740 1st Qu.: 4.970
## Mode :character Median : 3503 Median : 5.780 Median : 6.050
## Mean : 8753 Mean : 5.991 Mean : 6.199
## 3rd Qu.:11174 3rd Qu.: 6.970 3rd Qu.: 7.230
## Max. :99990 Max. :13.890 Max. :13.890
## NA's :713 NA's :1815 NA's :1852
## z
## Min. : 0.040
## 1st Qu.: 3.120
## Median : 3.860
## Mean : 4.033
## 3rd Qu.: 4.610
## Max. :13.180
## NA's :2544


and

table(BigDiamonds$color)
##
## D E F G H I J K L
## 73630 93483 93573 96204 86619 70282 48709 25868 9656



Diamonds2=BigDiamonds[is.na(BigDiamonds$price),]
Diamonds3=BigDiamonds[is.na(BigDiamonds$price)==F,]
library(Hmisc)
summarize(Diamonds3$price,Diamonds3$color,median)
## Diamonds3$color Diamonds3$price
## 1 D 2690
## 2 E 2342
## 3 F 2966
## 4 G 3720
## 5 H 4535
## 6 I 4717
## 7 J 4697
## 8 K 4418
## 9 L 3017


I tried this, but its not working

Diamonds21=select(Diamonds2,price,color,cut)

Diamonds21$newprice=ifelse(Diamonds21$color=="J",4697,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="D",2690,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="E",2342,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="F",2966,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="G",3720,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="H",4535,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="I",4717,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="K",4418,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="L",3017,Diamonds21$newprice)


Whats wrong with my logic?

Answer Source

I tried this and it worked

note first line is different

Diamonds21$newprice=ifelse(Diamonds21$color=="J",4697,Diamonds21$price)
Diamonds21$newprice<-ifelse(Diamonds21$color=="D",2690,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="E",2342,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="F",2966,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="G",3720,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="H",4535,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="I",4717,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="K",4418,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="L",3017,Diamonds21$newprice)