Adamaki Adamaki - 21 days ago 4
R Question

How do I select values from a lookup table based on their time stamp?

I have a data frame called

dayfile
containing the following variables:

Period SubCode EchoTime PosX PosY PosZ
1 8753 13 2016-06-19 00:06:27 16.81 39.21 12.66
2 8753 13 2016-06-19 00:06:35 16.67 39.08 12.66
3 8753 13 2016-06-19 00:06:36 16.33 39.60 13.03
4 8753 13 2016-06-19 00:06:45 17.14 38.14 12.23
5 8753 13 2016-06-19 00:06:53 16.95 38.21 12.38
6 8753 13 2016-06-19 00:06:53 17.44 37.67 11.95


And I have another data frame called
probe.DOT1
that looks like this:

DO.time.1m DO.1m Temp.1m
1 2016-06-18 10:24:50 7.69 18.04
2 2016-06-18 11:24:50 7.54 19.12
3 2016-06-18 11:54:50 7.57 18.98
4 2016-06-18 12:24:50 9.51 19.88
5 2016-06-18 12:54:50 9.30 11.62
6 2016-06-18 13:24:50 8.81 11.54


I want to create a new column in
dayfile
called
O2
from
probe.DOT1$DO.1m
where
dayfile$EchoTime
is within the time range between two consecutive values of
probe.DOT1$Do.time.1m
, i.e. it should look something like this:

Period SubCode EchoTime PosX PosY PosZ O2
1 8753 13 2016-06-19 00:06:27 16.81 39.21 12.66 7.54
2 8753 13 2016-06-19 00:06:35 16.67 39.08 12.66 7.54
3 8753 13 2016-06-19 00:06:36 16.33 39.60 13.03 7.59
4 8753 13 2016-06-19 00:06:45 17.14 38.14 12.23 7.59
5 8753 13 2016-06-19 00:06:53 16.95 38.21 12.38 7.59
6 8753 13 2016-06-19 00:06:53 17.44 37.67 11.95 7.59


I have tried this:

dayfile$O2 <- probe.DOT1[dayfile$EchoTime < probe.DOT1$DO.time.1m &
dayfile$EchoTime > diff(probe.DOT1$DO.time.1m, lag = 1) , 'DO.1m']


But it gives me errors. Does anyone have any suggestions?

Thanks.

EDIT

Here is some dput so you can recreate small portions of my files:

dayfile
:

structure(list(Period = c(7017, 7017, 7017, 7017, 7017, 7017,
7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017,
7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017,
7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017,
7017, 7017), SubCode = c(20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
), EchoTime = structure(c(1466249003, 1466249010, 1466249017,
1466249025, 1466249032, 1466249039, 1466249046, 1466249053, 1466249060,
1466249067, 1466249074, 1466249081, 1466249088, 1466249095, 1466249102,
1466249109, 1466249116, 1466249123, 1466249130, 1466249137, 1466249144,
1466249151, 1466249158, 1466249165, 1466249172, 1466249179, 1466249186,
1466249193, 1466249200, 1466249207, 1466249214, 1466249221, 1466249228,
1466249235, 1466249242, 1466249249, 1466249256, 1466249263, 1466249270,
1466249277, 1466249284), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
PosX = c(14.69, 14.69, 14.69, 16.31, 14.94, 14.94, 14.85,
14.73, 16.64, 16.65, 16.7, 16.36, 16.38, 16.37, 16.41, 16.39,
16.12, 15.8, 16.33, 16.17, 16.32, 15.61, 15.5, 15.82, 16.68,
16.68, 16.68, 16.61, 16.68, 16.68, 11.39, 11.39, 16.68, 16.6,
16.6, 15.08, 15.08, 14.9, 14.81, 15.08, 15.15), PosY = c(36.98,
36.98, 36.98, 37.68, 36.9, 36.9, 36.96, 37.09, 37.19, 37.19,
37.01, 37.49, 37.47, 37.47, 37.45, 37.46, 37.75, 38.08, 38.06,
38.49, 38.06, 38.54, 38.39, 38.06, 37.16, 37.16, 37.16, 37.36,
37.16, 37.16, 40.09, 40.09, 37.16, 37.37, 37.37, 36.79, 36.79,
36.98, 36.94, 36.79, 36.73), PosZ = c(14.68, 14.68, 14.68,
15.67, 15.03, 15.03, 14.66, 14.88, 15.12, 15.22, 14.84, 15.28,
15.58, 15.48, 15.88, 15.68, 16.05, 15.91, 15.37, 15.64, 15.27,
16.1, 16.3, 16.33, 15.61, 15.61, 15.61, 15.8, 15.61, 15.61,
18.06, 18.06, 15.61, 15.7, 15.7, 15.13, 15.13, 15.3, 15.38,
15.13, 15.19)), .Names = c("Period", "SubCode", "EchoTime",
"PosX", "PosY", "PosZ"), row.names = c(213387L, 213389L, 213391L,
213393L, 213395L, 213397L, 213399L, 213401L, 213403L, 213405L,
213407L, 213409L, 213411L, 213413L, 213415L, 213417L, 213419L,
213421L, 213423L, 213425L, 213427L, 213429L, 213431L, 213433L,
213435L, 213437L, 213439L, 213441L, 213443L, 213445L, 213447L,
213449L, 213450L, 213452L, 213454L, 213456L, 213458L, 213460L,
213462L, 213464L, 213466L), class = "data.frame")


probe.DOT1
:

structure(list(DO.time.1m = structure(c(1466245490, 1466249090,
1466250890, 1466252690, 1466254490, 1466256290, 1466258090, 1466259890,
1466261690, 1466263490, 1466265290, 1466267090, 1466268890, 1466270690,
1466272490, 1466274290, 1466276090, 1466277890, 1466279690, 1466281490
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), DO.1m = c(7.69,
7.54, 7.57, 9.51, 9.3, 8.81, 8.71, 8.66, 8.19, 8.52, 8.79, 9.01,
9.19, 9.39, 9.79, 9.82, 9.48, 9.5, 9.29, 9.38), Temp.1m = c(18.04,
19.12, 18.98, 19.88, 11.62, 11.54, 12.24, 12.22, 12.32, 12.2,
12.18, 12.16, 12.14, 11.8, 11.62, 11.68, 11.52, 11.32, 11.34,
11.14)), .Names = c("DO.time.1m", "DO.1m", "Temp.1m"), row.names = c(NA,
20L), class = "data.frame")

Answer

Ah man... fuzzyjoin almost can solve this (which would be infinitely "eleganter"), but o well.

Here's my pipeline to do this w/out loops... but the dput you gave us is a bit crappy in that there was only 1 value properly between ranges... but let's see how it does on your real data.

## First create a df that contains all the timestamp info:
all <- data.frame(ts = unique(unlist(c(dayfile$EchoTime,probe.DOT1$DO.time.1m))))

## Now join both tables to this master record, fill in the blanks in 
## Period,SubCode,PosX,Y,Z, remove Temp.1m, remove useless rows, rename O2
library(dplyr)
library(tidyr)
all %>%
  left_join(dayfile,   by=c("ts"="EchoTime")) %>%
  left_join(probe.DOT1,by=c("ts" = "DO.time.1m")) %>%
  arrange(ts) %>%
  fill(Period,SubCode,PosX,PosY,PosZ) %>%
  select(-Temp.1m) %>%
  filter(!is.na(DO.1m)) %>%
  rename(O2=DO.1m)

# ts Period SubCode  PosX  PosY  PosZ   O2
# 1  2016-06-18 10:24:50     NA      NA    NA    NA    NA 7.69
# 2  2016-06-18 11:24:50   7017      20 16.38 37.47 15.58 7.54
# 3  2016-06-18 11:54:50   7017      20 15.15 36.73 15.19 7.57
# 4  2016-06-18 12:24:50   7017      20 15.15 36.73 15.19 9.51
# 5  2016-06-18 12:54:50   7017      20 15.15 36.73 15.19 9.30
# 6  2016-06-18 13:24:50   7017      20 15.15 36.73 15.19 8.81
# 7  2016-06-18 13:54:50   7017      20 15.15 36.73 15.19 8.71
# 8  2016-06-18 14:24:50   7017      20 15.15 36.73 15.19 8.66
# 9  2016-06-18 14:54:50   7017      20 15.15 36.73 15.19 8.19
# 10 2016-06-18 15:24:50   7017      20 15.15 36.73 15.19 8.52
# 11 2016-06-18 15:54:50   7017      20 15.15 36.73 15.19 8.79
# 12 2016-06-18 16:24:50   7017      20 15.15 36.73 15.19 9.01
# 13 2016-06-18 16:54:50   7017      20 15.15 36.73 15.19 9.19
# 14 2016-06-18 17:24:50   7017      20 15.15 36.73 15.19 9.39
# 15 2016-06-18 17:54:50   7017      20 15.15 36.73 15.19 9.79
# 16 2016-06-18 18:24:50   7017      20 15.15 36.73 15.19 9.82
# 17 2016-06-18 18:54:50   7017      20 15.15 36.73 15.19 9.48
# 18 2016-06-18 19:24:50   7017      20 15.15 36.73 15.19 9.50
# 19 2016-06-18 19:54:50   7017      20 15.15 36.73 15.19 9.29
# 20 2016-06-18 20:24:50   7017      20 15.15 36.73 15.19 9.38

EDIT:

To see the whole dataset with all the data together, use:

all %>%
  left_join(dayfile,   by=c("ts"="EchoTime")) %>%
  left_join(probe.DOT1,by=c("ts" = "DO.time.1m")) 

In R, you can (and should) run every row one at a time...just to see what each row does. Do this by selecting "blocks of text"... from all to whatever. See how running what I have above shows you all the steps so far? You can do that with every other line in the pipeline too... so just select the block everything including the fill, then run:

all %>%
  left_join(dayfile,   by=c("ts"="EchoTime")) %>%
  left_join(probe.DOT1,by=c("ts" = "DO.time.1m")) %>%
  arrange(ts) %>%
  fill(Period,SubCode,PosX,PosY,PosZ)