Kim Jenkins Kim Jenkins - 21 days ago 4
R Question

r Match and replace

I have a dataset with several rows and columns. Below is the snapshot of what some of the rows and columns looks like.

ID Date Gender Age Col1 Col2 Col3 Col4 Col5 Col6 Col7 Col8
10 2015-10-14 F 68 345.50 884.2 008.69 202.18 189.8 435.2 084.7 757
93 2002-07-22 F 87 242.80 710.9 345.50 884.2 008.69 202.18 189.8 435.2
14 2004-07-28 M 92 084.7 757 242.80 710.9 427.2 530.10 567.89 227.9
41 2011-02-24 M 39 714.0 084.7 757 242.80 710.9 427.2 530.10 567.89
64 2002-03-14 F 39 227.9 714.0 V58.49 906.7 800.35 V88.0 349.31 289.84
22 2015-11-21 F 68 324.0 V65.44 411.8 200.41 187.7 E869.3 041.04 170.4
36 2003-09-17 F 75 389.1 176.3 788.37 E936.3 277.82 812.12 E816.7 663.90
11 2000-10-07 M 74 716.90 396.3 482.1 E816.7 663.90 716.90 396.3 482.1
45 2001-07-14 F 31 614.2 945.44 799.4 864.05 371.31 268 626.2 780.72
60 1999-02-23 M 45 674 645.2 006.5 V68.2 V67.00 665.24 434.00 914.3


I have another dataset that is a lookup table which contains short description for the codes in
Col1, Col2, Col3, Col4, Col5, Col6, Col7
and
Col8
which is as shown below

Code Short_Description
345.50 interStellar
884.2 indispensable
008.69 hallucination
202.18 flow
189.8 categorizing
435.2 choppiness
084.7 chieftain
757 substantiating
V58.49 unbridled
V88.0 polish
324.0 stumble
V65.44 hoopster
411.8 overtrimmed
E869.3 overbrutalizing
041.04 choric
E936.3 busera
277.82 subdelegating
E816.7 baton
663.90 Space


My question is how do I, match the codes in the first dataset with codes in the second lookup dataset and replace the matching codes with their corresponding short description ?

The expected output below shows the codes
345.50
matched and replaced with
interStellar
,
V58.49
matched and replaced with
unbridled
I am hoping for an output where all the codes are matched and replaced with their corresponding descriptions. I know how to do this using if-then-else but it will be very inefficient and I am assuming there should be some easy way to do this. Any help is much appreciated. Thanks in advance.

ID Date Gender Age Col1 Col2 Col3 Col4 Col5 Col6 Col7 Col8
10 2015-10-14 F 68 interStellar 884.2 008.69 202.18 189.8 435.2 084.7 757
93 2002-07-22 F 87 242.80 710.9 interStellar 884.2 008.69 202.18 189.8 435.2
14 2004-07-28 M 92 084.7 757 242.80 710.9 427.2 530.10 567.89 227.9
41 2011-02-24 M 39 714.0 084.7 757 242.80 710.9 427.2 530.10 567.89
64 2002-03-14 F 39 227.9 714.0 unbridled 906.7 800.35 V88.0 349.31 289.84
22 2015-11-21 F 68 324.0 hoopster 411.8 200.41 187.7 E869.3 041.04 170.4
36 2003-09-17 F 75 389.1 176.3 788.37 E936.3 277.82 812.12 baton 663.90
11 2000-10-07 M 74 716.90 396.3 482.1 baton 663.90 716.90 396.3 482.1
45 2001-07-14 F 31 614.2 945.44 799.4 864.05 371.31 268 626.2 780.72
60 1999-02-23 M 45 674 645.2 006.5 V68.2 V67.00 665.24 434.00 914.3


====================
Reproducible dataset used in this example========================

df1 = structure(list(ID = c(10L, 93L, 14L, 41L, 64L, 22L, 36L, 11L,
45L, 60L), Date = c("10/14/2015", "7/22/2002", "7/28/2004", "2/24/2011",
"3/14/2002", "11/21/2015", "9/17/2003", "10/7/2000", "7/14/2001",
"2/23/1999"), Gender = c("F", "F", "M", "M", "F", "F", "F", "M",
"F", "M"), Age = c(68L, 87L, 92L, 39L, 39L, 68L, 75L, 74L, 31L,
45L), Col1 = c(345.5, 242.8, 84.7, 714, 227.9, 324, 389.1, 716.9,
614.2, 674), Col2 = c("884.2", "710.9", "757", "84.7", "714",
"V65.44", "176.3", "396.3", "945.44", "645.2"), Col3 = c("8.69",
"345.5", "242.8", "757", "V58.49", "411.8", "788.37", "482.1",
"799.4", "6.5"), Col4 = c("202.18", "884.2", "710.9", "242.8",
"906.7", "200.41", "E936.3", "E816.7", "864.05", "V68.2"), Col5 = c("189.8",
"8.69", "427.2", "710.9", "800.35", "187.7", "277.82", "663.9",
"371.31", "V67.00"), Col6 = c("435.2", "202.18", "530.1", "427.2",
"V88.0", "E869.3", "812.12", "716.9", "268", "665.24"), Col7 = c("84.7",
"189.8", "567.89", "530.1", "349.31", "41.04", "E816.7", "396.3",
"626.2", "434"), Col8 = c(757, 435.2, 227.9, 567.89, 289.84,
170.4, 663.9, 482.1, 780.72, 914.3)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -10L), .Names = c("ID", "Date",
"Gender", "Age", "Col1", "Col2", "Col3", "Col4", "Col5", "Col6",
"Col7", "Col8"), spec = structure(list(cols = structure(list(
ID = structure(list(), class = c("collector_integer", "collector"
)), Date = structure(list(), class = c("collector_character",
"collector")), Gender = structure(list(), class = c("collector_character",
"collector")), Age = structure(list(), class = c("collector_integer",
"collector")), Col1 = structure(list(), class = c("collector_double",
"collector")), Col2 = structure(list(), class = c("collector_character",
"collector")), Col3 = structure(list(), class = c("collector_character",
"collector")), Col4 = structure(list(), class = c("collector_character",
"collector")), Col5 = structure(list(), class = c("collector_character",
"collector")), Col6 = structure(list(), class = c("collector_character",
"collector")), Col7 = structure(list(), class = c("collector_character",
"collector")), Col8 = structure(list(), class = c("collector_double",
"collector"))), .Names = c("ID", "Date", "Gender", "Age",
"Col1", "Col2", "Col3", "Col4", "Col5", "Col6", "Col7", "Col8"
)), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))


lookup_table = structure(list(Code = c("345.5", "884.2", "8.69", "202.18", "189.8",
"435.2", "84.7", "757", "V58.49", "V88.0", "324", "V65.44", "411.8",
"E869.3", "41.04", "E936.3", "277.82", "E816.7", "63.9"), Short_Description = c("interStellar",
"indispensable", "hallucination", "flow", "\tcategorizing", "choppiness",
"chieftain", "\tsubstantiating", "unbridled", "polish", "stumble",
"hoopster", "overtrimmed", "overbrutalizing", "choric", "busera",
"subdelegating", "baton\t", "Space")), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -19L), .Names = c("Code", "Short_Description"
), spec = structure(list(cols = structure(list(Code = structure(list(), class = c("collector_character",
"collector")), Short_Description = structure(list(), class = c("collector_character",
"collector"))), .Names = c("Code", "Short_Description")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))

Answer

We can do this with gather/spread by reshaping the 'wide' dataset to 'long' format initially with gather, do a left_join with the 'lookup_table', mutate the 'Code' by replacing the elements in 'Code' with 'Short_Description' (where it is not a missing value) and spread to 'wide' format after selecting the required columns (removing 'Short_Description').

library(dplyr)
library(tidyr)
gather(df1, Var, Code, matches("Col")) %>% 
      left_join(., lookup_table) %>% 
      mutate(Code = if_else(!is.na(Short_Description), Short_Description, Code)) %>% 
      select(-Short_Description) %>%
      spread(Var, Code)

For large datasets, another option is set from data.table. Create a numeric index of column names that have 'Col' as substring ('nm1'). Convert the 'data.frame' to 'data.table' (setDT(df1)), loop through the 'nm1' columns after specifying the .SDcols and convert it to character (as the expected output will be having character strings from the 'Short_Description' column. Use a for loop and use set to change the 'values' (using match) of the columns and rows specified in the 'i'.

library(data.table)
nm1 <- grep("Col", names(df1))
setDT(df1)[, (nm1) := lapply(.SD, as.character), .SDcols= nm1]
for(j in nm1){
  set(df1, i = which(df1[[j]] %chin% lookup_table$Code), j = j,
    value = lookup_table$Short_Description[match(df1[[j]], lookup_table$Code, nomatch=0)])
 }

df1