Hack-R Hack-R - 4 months ago 6
R Question

Put placeholder rows in correct positions in a data.frame, filling in non-unique label columns

I am writing an ETL script and at one point there are several extracted tables which should all have the same specific dimensions (16 rows, 9 columns).

Sometimes a row will be missing due to upstream data problems, i.e. it will be 15x9. I need to add the missing row in the correct place, filling in the first 2 columns with their static, non-unique label values.

I tried creating a template data.frame and using

sqldf
to join the non-missing values onto the template, leaving the missing row with the appropriate labels and
NA
values where data is missing. However the fact that the
on
condition of the join was not distinct caused the data shape to change.

# ill-fated attempt, doubles the number of rows
require(sqldf)
sqldf("select a.label1, a.label2, b.data1,
b.data2, b.data3, b.data4, b.data5,
b.data6, b.data7 from t a join d1 b
on a.label1 = b.label1 and a.label2 = b.label2")


Reproducible data is below.
d1
in the example above corresponds to the 2nd data.frame below and
template
corresponds to the 3rd.

Example of non-problematic data (no data is missing here):

structure(list(label1 = c("BS", "Certificates", "MS", "PHD",
"BS", "Certificates", "MS", "PHD", "BS", "Certificates", "MS",
"PHD", "BS", "Certificates", "MS", "PHD"), label2 = c("INTL",
"INTL", "INTL", "INTL", "US", "US", "US", "US", "INTL", "INTL",
"INTL", "INTL", "US", "US", "US", "US"), data1 = c("fake data",
"fake data", "fake data", "fake data", "fake data", "fake data",
"fake data", "fake data", "fake data", "fake data", "fake data",
"fake data", "fake data", "fake data", "fake data", "fake data"
), data2 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16), data3 = c("09B", "09B", "09B", "09B", "09B", "09B", "09B",
"09B", "09B", "09B", "09B", "09B", "09B", "09B", "09B", "09B"
), data4 = c(887L, 220L, 3633L, 1491L, 3331L, 379L, 9521L, 3785L,
624L, 188L, 1842L, 863L, 3119L, 428L, 7868L, 3138L), data5 = c(531L,
62L, 1114L, 625L, 2454L, 286L, 6298L, 2512L, 521L, 161L, 1344L,
649L, 2795L, 397L, 6984L, 2651L), data6 = c(35L, 12L, 389L, 253L,
468L, 140L, 3143L, 1244L, 36L, 7L, 337L, 167L, 621L, 135L, 3116L,
1222L), data7 = c(27L, 10L, 313L, 214L, 404L, 128L, 2853L, 1111L,
22L, 5L, 224L, 123L, 459L, 103L, 2650L, 984L)), .Names = c("label1",
"label2", "data1", "data2", "data3", "data4", "data5", "data6",
"data7"), row.names = c(NA, -16L), class = "data.frame")


Example of problematic data (2 rows missing):

structure(list(label1 = c("BS", "Certificates", "MS", "PHD",
"BS", "MS", "PHD", "BS", "Certificates", "MS", "PHD", "Certificates",
"MS", "PHD"), label2 = c("INTL", "INTL", "INTL", "INTL", "US",
"US", "US", "INTL", "INTL", "INTL", "INTL", "US", "US", "US"),
data1 = c("fake data", "fake data", "fake data", "fake data",
"fake data", "fake data", "fake data", "fake data", "fake data",
"fake data", "fake data", "fake data", "fake data", "fake data"
), data2 = c(1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15,
16), data3 = c("09B", "09B", "09B", "09B", "09B", "09B",
"09B", "09B", "09B", "09B", "09B", "09B", "09B", "09B"),
data4 = c(887L, 220L, 3633L, 1491L, 3331L, 9521L, 3785L,
624L, 188L, 1842L, 863L, 428L, 7868L, 3138L), data5 = c(531L,
62L, 1114L, 625L, 2454L, 6298L, 2512L, 521L, 161L, 1344L,
649L, 397L, 6984L, 2651L), data6 = c(35L, 12L, 389L, 253L,
468L, 3143L, 1244L, 36L, 7L, 337L, 167L, 135L, 3116L, 1222L
), data7 = c(27L, 10L, 313L, 214L, 404L, 2853L, 1111L, 22L,
5L, 224L, 123L, 103L, 2650L, 984L)), .Names = c("label1",
"label2", "data1", "data2", "data3", "data4", "data5", "data6",
"data7"), row.names = c(NA, -14L), class = "data.frame")


Example of template:

structure(list(label1 = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), .Label = c("BS", "Certificates",
"MS", "PHD"), class = "factor"), label2 = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("INTL",
"US"), class = "factor"), data1 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), data2 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), data3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
data4 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), data5 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), data6 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), data7 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), .Names = c("label1", "label2", "data1", "data2", "data3",
"data4", "data5", "data6", "data7"), row.names = c(NA, -16L), class = "data.frame")

Answer

Suppose t and d2 are the template and the 2nd example with the missing rows. Then add a count column and perform a left join.

tc <- transform(t, count = ave(1:nrow(t), label1, label2, FUN = seq_along))
d2c <- transform(d2, count = ave(1:nrow(d2), label1, label2, FUN = seq_along))

library(sqldf)
sqldf("select label1, label2, 
              b.data1, b.data2, b.data3, b.data4, b.data5, b.data6, b.data7
       from tc left join d2c b using (label1, label2, count)")

giving:

         label1 label2     data1 data2 data3 data4 data5 data6 data7
1            BS   INTL fake data     1   09B   887   531    35    27
2  Certificates   INTL fake data     2   09B   220    62    12    10
3            MS   INTL fake data     3   09B  3633  1114   389   313
4           PHD   INTL fake data     4   09B  1491   625   253   214
5            BS     US fake data     5   09B  3331  2454   468   404
6  Certificates     US fake data    14   09B   428   397   135   103
7            MS     US fake data     7   09B  9521  6298  3143  2853
8           PHD     US fake data     8   09B  3785  2512  1244  1111
9            BS   INTL fake data     9   09B   624   521    36    22
10 Certificates   INTL fake data    10   09B   188   161     7     5
11           MS   INTL fake data    11   09B  1842  1344   337   224
12          PHD   INTL fake data    12   09B   863   649   167   123
13           BS     US      <NA>    NA  <NA>    NA    NA    NA    NA
14 Certificates     US      <NA>    NA  <NA>    NA    NA    NA    NA
15           MS     US fake data    15   09B  7868  6984  3116  2650
16          PHD     US fake data    16   09B  3138  2651  1222   984

The sqldf statement could be replaced by these statements if there were too many data fields to conveniently list:

dataNames <- toString(paste("b", tail(names(d2c), -2), sep = "."))
fn$sqldf("select label1, label2, $dataNames
       from tc left join d2c b using (label1, label2, count)")