user3067851 user3067851 - 2 months ago 10
R Question

Collapse rows in data.table with lapply, Reduce, and union

I have a data.table sample(JACcar) that should collapse into one row based on ID using my code below. However, I cannot figure out why it won't collapse fewer than 2 rows. I am also attempting to verify my output by limiting the columns to only those that contain values other than NA since there are 123 columns in the original data. Can someone help explain why the data will not collapse into one row being that the IDs for all 4 rows in the original data are the same?

dput(JACcar)
structure(list(ID = c("64909", "64909", "64909", "64909"), totni = c(-341.31,
-341.31, -341.31, -341.31), I1 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), I10 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), I11 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), I12 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), I14 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), I15 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), I2 = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_), I20 = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_), I21 = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_), I22 = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_), I3 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I30 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I31 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I32 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I33 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I34 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I35 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I36 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I37 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I38 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I4 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_),
I40 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I41 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I42 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I43 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I44 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I45 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I46 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I47 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I48 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I5 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I50 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I51 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I52 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I54 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I55 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I56 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I57 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I58 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I6 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I60 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I61 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I62 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I63 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I64 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I66 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I7 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I70 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I71 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I72 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I73 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I74 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I75 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I76 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I77 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I8 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), I9 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L1 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L10 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L12 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L19 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L2 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L21 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L22 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L23 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L24 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L25 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L26 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L28 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), L29 = c(NA, NA, NA, 4L), L3 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L32 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L35 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L36 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L37 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L4 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L40 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L41 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L42 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L5 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L50 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L51 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L54 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L55 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L56 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L57 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L58 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L6 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L61 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L63 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L64 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L66 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L7 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L70 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L71 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L77 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L8 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L80 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L81 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L82 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L83 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L84 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L85 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L86 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L87 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), L9 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), S14 = c(NA, NA, 3L, NA), S15 = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_), S2 = c(1L, NA, NA,
NA), S22 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), S23 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), S3 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), S35 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), S4 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), S5 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), S66 = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
), S7 = c(NA, 2L, NA, NA), S70 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), S97 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), S98 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_), S99 = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_)), .Names = c("ID", "totni", "I1",
"I10", "I11", "I12", "I14", "I15", "I2", "I20", "I21", "I22",
"I3", "I30", "I31", "I32", "I33", "I34", "I35", "I36", "I37",
"I38", "I4", "I40", "I41", "I42", "I43", "I44", "I45", "I46",
"I47", "I48", "I5", "I50", "I51", "I52", "I54", "I55", "I56",
"I57", "I58", "I6", "I60", "I61", "I62", "I63", "I64", "I66",
"I7", "I70", "I71", "I72", "I73", "I74", "I75", "I76", "I77",
"I8", "I9", "L1", "L10", "L12", "L19", "L2", "L21", "L22", "L23",
"L24", "L25", "L26", "L28", "L29", "L3", "L32", "L35", "L36",
"L37", "L4", "L40", "L41", "L42", "L5", "L50", "L51", "L54",
"L55", "L56", "L57", "L58", "L6", "L61", "L63", "L64", "L66",
"L7", "L70", "L71", "L77", "L8", "L80", "L81", "L82", "L83",
"L84", "L85", "L86", "L87", "L9", "S14", "S15", "S2", "S22",
"S23", "S3", "S35", "S4", "S5", "S66", "S7", "S70", "S97", "S98",
"S99"), class = c("tbl_dt", "tbl", "data.table", "data.frame"
), row.names = c(NA, -4L), .internal.selfref = <pointer: 0x00000000003e0788>)

jacCARtest <- JACcar[,lapply(.SD, function(x)
Reduce(union, x)),
by = ID]
jacCARfull <- jacCARtest[, Filter(function(x) !all(is.na(x)), jacCARtest)]

Answer

The reason is that some columns have NA and non-NA elements. Using the OP's code, the NA and non-NA element remain resulting in >1 row. If we need only a single row grouped by 'ID', create a if/else condition to return the unique non-NA element or if all the elements are NA, return 'NA'

JACcar[,lapply(.SD, function(x) if(all(is.na(x))) NA
                                else unique(x[!is.na(x)])) , by = ID]

Or we can use which, get the first index to subset the element in the column (as there is only a single unique non-NA element in each column)

JACcar[,lapply(.SD, function(x) x[which(!is.na(x))[1]]) , ID]
Comments