vino88 vino88 - 10 months ago 31
R Question

Dummy variable for if x occurs within y time from time z

I am trying to create a dummy variable in row A based on if rows within 1 year of row A, x occurs.
I believe this is probably a common issue, and there are similar questions already posted (this is the most similar I found). Unfortunately the zoo package doesn't fit well since it doesn't deal well with irregular spaced dates (I don't want to aggregate rows and my data is too large to deal with this well) and I have been trying unsuccessfully to figure out a datatable way to do this, though I would prefer tidyverse given my experience.

dates <- rep(as.Date(c('2015-01-01', '2015-02-02', '2015-03-03', '2016-02-02'), '%Y-%m-%d'), 3)

names <- c(rep('John', 4), rep('Phil', 4), rep('Ty', 4))

df <- data.frame(Name = names, Date = dates,
did_y = c(0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0),
did_x = c(1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1))

Name Date did_y did_x
John 2015-01-01 0 1
John 2015-02-02 1 0
John 2015-03-03 1 0
John 2016-02-02 0 0
Phil 2015-01-01 1 0
Phil 2015-02-02 1 1
Phil 2015-03-03 0 1
Phil 2016-02-02 0 0
Ty 2015-01-01 0 0
Ty 2015-02-02 0 0
Ty 2015-03-03 0 0
Ty 2016-02-02 0 1


What I'd like is

dffinal <- data.frame(Name = names, Date = dates,
did_y = c(0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0),
did_x = c(1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1),
did_x_within_year = c(1, 1, 1, NA, 1, 1, 1, 1, 0, 1, 1, 1),
did_x_next_year = c(0, 0, 0, NA, 1, 1, 0, NA, 0, 1, 1, NA))

Name Date did_y did_x did_x_within_year did_x_next_year
John 2015-01-01 0 1 1 0
John 2015-02-02 1 0 1 0
John 2015-03-03 1 0 1 0
John 2016-02-02 0 0 NA NA
Phil 2015-01-01 1 0 1 1
Phil 2015-02-02 1 1 1 1
Phil 2015-03-03 0 1 1 0
Phil 2016-02-02 0 0 1 NA
Ty 2015-01-01 0 0 0 0
Ty 2015-02-02 0 0 1 1
Ty 2015-03-03 0 0 1 1
Ty 2016-02-02 0 1 1 NA


So I'd like two columns, one for if x occurred within 1 year of row A (regardless of before or after) and another if it occurred within 1 year in the future.

I experimented with RcppRoll, but it seems to only look backward in dates, i.e. if something happened a year before it will dummy, but not if it will occur 1 year in the future.

df$did_x_next_year <- roll_max(df$did_x, 365, fill = NA)


EDIT: Attempted solution based on other question

I have tried to implement this solution (1b), unfortunately nothing in my dataframe/datatable actually changes. Even if I leave the function as is from the example when applied to my data, it does not update.

library(zoo)
library(data.table)
df$Year <- lubridate::year(df$Date)
df$Month <- lubridate::month(df$Date)
df$did_x_next_year <- df$did_x

DT <- as.data.table(df)

k <- 12 # prior 12 months

# inputs zoo object x, subsets it to specified window and sums
Max2 <- function(x) {
w <- window(x, start = end(x) - k/12, end = end(x) - 1/12)
if (length(w) == 0 || all(is.na(w))) NA_real_ else max(w, na.rm = TRUE)
}

nms <- names(DT)[7]

setkey(DT, Name, Year, Month) # sort

# create zoo object from arguments and run rollapplyr using Sum2
roll2 <- function(x, year, month) {
z <- zoo(x, as.yearmon(year + (month - 1)/12))
coredata(rollapplyr(z, k+1, Max2, coredata = FALSE, partial = TRUE))
}

DT <- DT[, nms := lapply(.SD, roll2, Year, Month), .SDcols = nms, by = "Name"]

Answer Source

After a suggestion from a friend, I came up with the following:

# Filtering to the obs I care about
dfadd <- df %>% filter(did_x == 1) %>% select(Name, Date) %>% rename(x_date = Date)

# Converting to character since in dcast it screws up the dates
dfadd$x_date <- as.character(dfadd$x_date)

# Merging data
df <- plyr::join(df, dfadd, by = 'Name')

# Creating new column used for dcasting
df <- df %>% group_by(Name, Date) %>% mutate(x_date_index = seq(from = 1, to = n()))
df$x_date_index <- paste0('x_date_',df$x_date_index)

#casting the data wide
df <- reshape2::dcast(df,
                  Name + Date + did_y + did_x ~ x_date_index,
                  value.var = "x_date",
                  fill = NA)

# Converting to back to date
df$x_date_1 <- as.Date(df$x_date_1)
df$x_date_2 <- as.Date(df$x_date_2)

# Creating dummy variables
df$did_x_within_year <- 0
df$did_x_within_year <- ifelse((df$x_date_1 - df$Date) <= 366, 1, 
df$did_x_within_year)

df$did_x_next_year <- 0
df$did_x_next_year <- ifelse(((df$x_date_1 > df$Date) & (df$x_date_1 - df$Date<= 365)), 
                         1, df$did_x_next_year)

# Can extend to account for x_date_2, x_date_3, etc

# Changing the last entry to NA as desired
df <- df %>% group_by(Name) %>% mutate(did_x_next_year = c(did_x_next_year[-n()], NA))
Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download