pacomet pacomet - 1 month ago 9
R Question

Reshaping data by aggregating columns in a row

I'm facing a problem with reshaping my data but I'm not sure if reshape2 package is the solution. The original data I need to reshape are stored in a peculiar way. They are daily temperature data in csv files, this is how file header look like:

ID,YEAR,MONTH,NAME,ALTITUDE,REGION,LON,LAT,DATUM,TMAX1,TMAX2,......,TMAX31,TMIN1,TMIN2,..........,TMIN31


where
TMAX1
stands for the maximum temperature of day 1 in
MONTH
. Then, following values are the maximum temperature for all month days.
TMIN1
column gives minimum temperature for day 1 and so on until the last column with minimum temperature for last day in the month. If a month has less than 31 days the field is empty.

Short example data file can be found at link

Reformatting is needed to save data in two new files with just four columns (
ID
,
DATE
,
TEMP
,
VALIDTEMP
where
TEMP
is
TMAX
or
TMIN
) with station ID, date, temperature (
TMAX
or
TMIN
) value and validation flag as seen in the figure:

enter image description here

Thinking over my problem I should create a vector with all possible dates (in the original data only year and month are stated, day comes from the data column number/name) and then make some sort of transposing to fit every daily
TMAX
/
TMIN
data with its corresponding date in this new data frame.. Not sure if this can be accomplished by reshape2.

I made a simple first attempt with reshape2 but this gives
TMAX1
and
TMAXn
as different variables while they are all temperature data. I want to melt all
TMAXn
/
TMIN
in a single variable called
TMAX
/
TMIN
.

I will continue trying to sort out the problem but any help is greatly appreciated

Output of 20 first rows of original data file

> dput(kk)
structure(list(INDICATIVO = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("8008A",
"8036B", "8251E", "8325C", "8433I", "8472B", "8496E", "8520B"
), class = "factor"), ANYO = c(2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2016L, 2016L, 2016L, 2016L,
2016L, 2016L, 2016L, 2016L, 2016L, 2016L), MES = c(3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L), NOMBRE = structure(c(8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L), .Label = c("ALGEMESSI AUMAR",
"ARENOS (C.H. JUCAR)", "BENIDORM (AQUAGEST)", "CAUDIEL-MAS DE NOGUERA",
"EMBALSE DE ALCORA", "LA POBLA DE BENIFASS\xc0-FREDES I.", "VILLARGORDO DEL CABRIEL-CONTRERAS",
"VILLENA"), class = "factor"), ALTITUD = c(486L, 486L, 486L,
486L, 486L, 486L, 486L, 486L, 486L, 486L, 486L, 486L, 486L, 486L,
486L, 486L, 486L, 486L, 486L, 486L), NOM_PROV = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("ALICANTE", "CASTELLON", "VALENCIA"), class = "factor"),
LONGITUD = c(51562L, 51562L, 51562L, 51562L, 51562L, 51562L,
51562L, 51562L, 51562L, 51562L, 51562L, 51562L, 51562L, 51562L,
51562L, 51562L, 51562L, 51562L, 51562L, 51562L), LATITUD = c(383437L,
383437L, 383437L, 383437L, 383437L, 383437L, 383437L, 383437L,
383437L, 383437L, 383437L, 383437L, 383437L, 383437L, 383437L,
383437L, 383437L, 383437L, 383437L, 383437L), DATUM = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "ETRS89", class = "factor"), TMAX1 = c(230L,
220L, 310L, 280L, 370L, 310L, 330L, 270L, 200L, 180L, 180L,
190L, 170L, 160L, 210L, 290L, 340L, 320L, 300L, 310L), TMAX2 = c(270L,
200L, 310L, 295L, 330L, 330L, 310L, 270L, 160L, 195L, 150L,
200L, 220L, 180L, 225L, 290L, 360L, 330L, 330L, 300L), TMAX3 = c(240L,
220L, 370L, 300L, 370L, 250L, 330L, 230L, 190L, 200L, 170L,
170L, 180L, 190L, 240L, 290L, 340L, 360L, 350L, 290L), TMAX4 = c(230L,
200L, 330L, 300L, 380L, 360L, 290L, 290L, 230L, 200L, 210L,
130L, 220L, 190L, 230L, 300L, 320L, 400L, 390L, 300L), TMAX5 = c(180L,
240L, 290L, 310L, 400L, 360L, 240L, 300L, 220L, 180L, 170L,
140L, 140L, 120L, 170L, 290L, 360L, 330L, 425L, 270L), TMAX6 = c(170L,
150L, 290L, 320L, 390L, 360L, 190L, 280L, 235L, 160L, 110L,
180L, 140L, 180L, 210L, 310L, 340L, 310L, 360L, 330L), TMAX7 = c(250L,
150L, 260L, 310L, 425L, 400L, 220L, 250L, 230L, 160L, 190L,
130L, 90L, 195L, 190L, 310L, 310L, 310L, 340L, 310L), TMAX8 = c(180L,
170L, 290L, 320L, 400L, 325L, 250L, 220L, 200L, 200L, 220L,
160L, 140L, 150L, 180L, 340L, 360L, 340L, 340L, 290L), TMAX9 = c(200L,
180L, 290L, 330L, 340L, 330L, 240L, 230L, 220L, 160L, 200L,
180L, 140L, 200L, 190L, 360L, 350L, 330L, 320L, 270L), TMAX10 = c(195L,
200L, 270L, 310L, 360L, 390L, 320L, 250L, 240L, 135L, 170L,
210L, 120L, 190L, 190L, 350L, 360L, 250L, 320L, 260L), TMAX11 = c(260L,
180L, 310L, 240L, 340L, 350L, 300L, 270L, 230L, 140L, 100L,
210L, 150L, 200L, 210L, 320L, 370L, 290L, 300L, 300L), TMAX12 = c(260L,
190L, 340L, 300L, 355L, 380L, 290L, 270L, 190L, 140L, 140L,
220L, 130L, 230L, 210L, 310L, 370L, 290L, 320L, 230L), TMAX13 = c(185L,
200L, 380L, 290L, 360L, 350L, 320L, 240L, 210L, 170L, 150L,
220L, 150L, 200L, 210L, 375L, 300L, 300L, 300L, 190L), TMAX14 = c(160L,
260L, 415L, 170L, 360L, 320L, 300L, 230L, 200L, 180L, 160L,
170L, 150L, 230L, 240L, 350L, 280L, 310L, 250L, 230L), TMAX15 = c(130L,
230L, 390L, 260L, 360L, 300L, 300L, 250L, 200L, 210L, 120L,
140L, 150L, 240L, 230L, 320L, 270L, 320L, 260L, 240L), TMAX16 = c(150L,
240L, 370L, 270L, 360L, 300L, 330L, 230L, 230L, 215L, 160L,
120L, 180L, 250L, 230L, 270L, 280L, 260L, 290L, 250L), TMAX17 = c(140L,
260L, 380L, 300L, 370L, 350L, 270L, 230L, 230L, 210L, 130L,
130L, 150L, 240L, 220L, 280L, 310L, 375L, 300L, 290L), TMAX18 = c(120L,
250L, 280L, 280L, 350L, 320L, 250L, 240L, 220L, 215L, 170L,
130L, 160L, 220L, 250L, 300L, 320L, 370L, 280L, 230L), TMAX19 = c(140L,
240L, 280L, 320L, 380L, 320L, 265L, 220L, 240L, 220L, 140L,
120L, 150L, 180L, 260L, 280L, 340L, 370L, 300L, 220L), TMAX20 = c(120L,
215L, 230L, 310L, 370L, 330L, 290L, 190L, 260L, 180L, 150L,
160L, 170L, 200L, 250L, 280L, 380L, 390L, 270L, 220L), TMAX21 = c(170L,
200L, 220L, 310L, 370L, 330L, 330L, 200L, 200L, 185L, 140L,
170L, 140L, 230L, 250L, 295L, 360L, 300L, 310L, 200L), TMAX22 = c(140L,
240L, 220L, 400L, 370L, 330L, 340L, 210L, 130L, 155L, 160L,
220L, 130L, 210L, 300L, 310L, 340L, 320L, 310L, 210L), TMAX23 = c(160L,
270L, 240L, 350L, 340L, 340L, 300L, 230L, 100L, 160L, 190L,
180L, 170L, 230L, 240L, 320L, 310L, 330L, 290L, 240L), TMAX24 = c(150L,
250L, 230L, 310L, 390L, 330L, 260L, 190L, 130L, 150L, 190L,
200L, 200L, 240L, 270L, 350L, 310L, 330L, 280L, NA), TMAX25 = c(110L,
260L, 250L, 310L, 350L, 320L, 280L, 190L, 180L, 160L, 190L,
160L, 230L, 200L, 300L, 330L, 310L, 330L, 290L, NA), TMAX26 = c(150L,
260L, 290L, 335L, 340L, 320L, 280L, 190L, 190L, 160L, 140L,
160L, 240L, 250L, 280L, 300L, 325L, 310L, 280L, NA), TMAX27 = c(230L,
220L, 260L, 360L, 420L, 330L, 260L, 230L, 160L, 160L, 130L,
85L, 210L, 250L, 300L, 320L, 340L, 340L, 270L, NA), TMAX28 = c(260L,
250L, 260L, 390L, 350L, 350L, 230L, 200L, 190L, 160L, 170L,
150L, 170L, 220L, 300L, 320L, 330L, 360L, 240L, NA), TMAX29 = c(260L,
250L, 300L, 370L, 390L, 330L, 210L, 230L, 180L, 180L, 130L,
150L, 250L, 130L, 260L, 330L, 350L, 350L, 260L, NA), TMAX30 = c(280L,
260L, 300L, 370L, 340L, 315L, 230L, 220L, 230L, 180L, 170L,
NA, 240L, 240L, 290L, 350L, 380L, 310L, 270L, NA), TMAX31 = c(310L,
NA, 290L, NA, 330L, 340L, NA, 200L, NA, 210L, 210L, NA, 175L,
NA, 270L, NA, 400L, 310L, NA, NA), TMIN1 = c(70L, 60L, 70L,
130L, 160L, 210L, 210L, 130L, 90L, -50L, 50L, 20L, 20L, 40L,
90L, 90L, 170L, 240L, 155L, 110L), TMIN2 = c(130L, 20L, 140L,
130L, 200L, 210L, 190L, 90L, 60L, -15L, 30L, -10L, 40L, -10L,
10L, 100L, 140L, 150L, 150L, 130L), TMIN3 = c(70L, 30L, 90L,
100L, 210L, 190L, 165L, 100L, 60L, -10L, 35L, 10L, 80L, 30L,
30L, 130L, 210L, 140L, 130L, 110L), TMIN4 = c(70L, 80L, 150L,
90L, 190L, 180L, 200L, 120L, 50L, 0L, 120L, -10L, -10L, 70L,
30L, 130L, 200L, 160L, 130L, 110L), TMIN5 = c(-20L, 100L,
150L, 100L, 150L, 220L, 170L, 150L, 90L, -10L, 70L, 20L,
60L, 80L, 55L, 120L, 180L, 210L, 160L, 150L), TMIN6 = c(-30L,
55L, 135L, 80L, 150L, 230L, 170L, 130L, 90L, 70L, 50L, 10L,
35L, 100L, 70L, 110L, 190L, 170L, 190L, 120L), TMIN7 = c(-30L,
80L, 70L, 70L, 150L, 240L, 170L, 120L, 100L, 80L, 40L, 65L,
-20L, 75L, 90L, 130L, 160L, 130L, 140L, 130L), TMIN8 = c(30L,
50L, 70L, 100L, 180L, 200L, 150L, 120L, 90L, 40L, 10L, 95L,
20L, 50L, 90L, 125L, 170L, 150L, 150L, 120L), TMIN9 = c(50L,
80L, 110L, 110L, 210L, 210L, 125L, 100L, 80L, 30L, 85L, 60L,
-20L, 0L, 110L, 140L, 210L, 130L, 160L, 140L), TMIN10 = c(-10L,
70L, 70L, 80L, 180L, 230L, 180L, 100L, 30L, 30L, 50L, 130L,
40L, 20L, 80L, 160L, 160L, 180L, 150L, 120L), TMIN11 = c(-10L,
80L, 50L, 110L, 160L, 220L, 170L, 130L, 25L, 30L, 50L, 120L,
-20L, 50L, 100L, 180L, 170L, 140L, 210L, 150L), TMIN12 = c(20L,
90L, 60L, 150L, 180L, 210L, 140L, 120L, 25L, 30L, 20L, 90L,
-10L, 50L, 75L, 160L, 180L, 140L, 140L, 100L), TMIN13 = c(20L,
110L, 80L, 150L, 160L, 200L, 150L, 140L, 100L, 20L, -15L,
160L, -30L, 60L, 100L, 150L, 200L, 105L, 140L, 120L), TMIN14 = c(20L,
70L, 120L, 130L, 160L, 190L, 180L, 130L, 100L, 0L, -15L,
70L, -10L, 40L, 110L, 220L, 200L, 120L, 140L, 90L), TMIN15 = c(-5L,
115L, 110L, 140L, 170L, 180L, 125L, 40L, 60L, 0L, 40L, 60L,
40L, 50L, 110L, 170L, 180L, 120L, 80L, 60L), TMIN16 = c(50L,
100L, 60L, 100L, 155L, 160L, 120L, 115L, 50L, 20L, -30L,
55L, 10L, 50L, 80L, 170L, 150L, 160L, 100L, 50L), TMIN17 = c(-10L,
80L, 60L, 110L, 170L, 210L, 140L, 90L, 40L, 5L, -60L, -80L,
5L, 100L, 50L, 120L, 100L, 155L, 90L, 110L), TMIN18 = c(70L,
50L, 50L, 120L, 170L, 205L, 120L, 100L, 0L, 0L, 70L, 0L,
90L, 80L, 60L, 100L, 130L, 185L, 110L, 160L), TMIN19 = c(100L,
90L, 160L, 120L, 180L, 220L, 140L, 100L, 30L, -10L, 50L,
70L, 100L, 110L, 80L, 100L, 130L, 180L, 100L, 160L), TMIN20 = c(110L,
60L, 130L, 120L, 190L, 200L, 100L, 140L, 50L, -30L, -15L,
-15L, 50L, 110L, 80L, 110L, 180L, 200L, 160L, 130L), TMIN21 = c(90L,
40L, 80L, 130L, 175L, 175L, 120L, 90L, 50L, -10L, 0L, 40L,
50L, 80L, 80L, 100L, 170L, 220L, 140L, 130L), TMIN22 = c(100L,
100L, 60L, 130L, 195L, 175L, 120L, 70L, -10L, 0L, 25L, 60L,
80L, 80L, 80L, 100L, 150L, 170L, 130L, 110L), TMIN23 = c(70L,
70L, 50L, 150L, 190L, 170L, 130L, 50L, -10L, 20L, 15L, 20L,
85L, 50L, 120L, 110L, 180L, 150L, 130L, 150L), TMIN24 = c(80L,
60L, 60L, 140L, 200L, 210L, 100L, 90L, -45L, 0L, 20L, 20L,
25L, 80L, 100L, 110L, 140L, 135L, 125L, 130L), TMIN25 = c(30L,
110L, 65L, 150L, 230L, 150L, 90L, 130L, -20L, -20L, 70L,
90L, 20L, 40L, 110L, 125L, 140L, 130L, 135L, NA), TMIN26 = c(10L,
110L, 80L, 150L, 230L, 150L, 90L, 130L, 100L, 20L, 40L, 20L,
115L, 40L, 130L, 170L, 140L, 145L, 185L, NA), TMIN27 = c(70L,
100L, 100L, 150L, 200L, 140L, 130L, 120L, 10L, 35L, 40L,
40L, 70L, 70L, 120L, 170L, 150L, 140L, 160L, NA), TMIN28 = c(70L,
90L, 90L, 150L, 200L, 140L, 110L, 70L, 10L, -20L, 25L, 40L,
90L, 120L, 150L, 170L, 160L, 160L, 170L, NA), TMIN29 = c(60L,
60L, 90L, 150L, 200L, 160L, 100L, 80L, -30L, 50L, 60L, 60L,
75L, 110L, 150L, 200L, 140L, 200L, 120L, NA), TMIN30 = c(90L,
60L, 70L, 150L, 210L, 170L, 130L, 65L, -10L, 0L, 60L, NA,
10L, 110L, 140L, 180L, 170L, 220L, 130L, NA), TMIN31 = c(110L,
NA, 130L, NA, 210L, 220L, NA, 100L, NA, 70L, 80L, NA, 65L,
NA, 120L, NA, 210L, 170L, NA, NA)), .Names = c("INDICATIVO",
"ANYO", "MES", "NOMBRE", "ALTITUD", "NOM_PROV", "LONGITUD", "LATITUD",
"DATUM", "TMAX1", "TMAX2", "TMAX3", "TMAX4", "TMAX5", "TMAX6",
"TMAX7", "TMAX8", "TMAX9", "TMAX10", "TMAX11", "TMAX12", "TMAX13",
"TMAX14", "TMAX15", "TMAX16", "TMAX17", "TMAX18", "TMAX19", "TMAX20",
"TMAX21", "TMAX22", "TMAX23", "TMAX24", "TMAX25", "TMAX26", "TMAX27",
"TMAX28", "TMAX29", "TMAX30", "TMAX31", "TMIN1", "TMIN2", "TMIN3",
"TMIN4", "TMIN5", "TMIN6", "TMIN7", "TMIN8", "TMIN9", "TMIN10",
"TMIN11", "TMIN12", "TMIN13", "TMIN14", "TMIN15", "TMIN16", "TMIN17",
"TMIN18", "TMIN19", "TMIN20", "TMIN21", "TMIN22", "TMIN23", "TMIN24",
"TMIN25", "TMIN26", "TMIN27", "TMIN28", "TMIN29", "TMIN30", "TMIN31"
), row.names = 786:805, class = "data.frame")

Answer

This answer uses the tidyr and dplyr packages.

library(tidyr)
library(dplyr)
library(lubridate)
temp_orig <- read.csv("data-raw/data_temp_orig.csv", 
                     stringsAsFactors = FALSE) 
# I prefer lowercase column names
names(temp_orig) <- tolower(names(temp_orig))

temp2 <- temp_orig %>% 
    # select only interesting columns
    select(id, year, month, tmax1:tmin31) %>% 
    # reshape in long format
    gather(key, temp, -id, -year, -month) %>% 
    # separate at the fourth character
    separate(key, c("key", "day"), sep = 4) %>%  
    # Combine year, month, day in a single date
    mutate(date = ymd(paste(year,month,day))) 

Filter minimum temperatures

tmindata <- temp2 %>%         
    # filter for existing dates
    filter(key == "tmin" & !is.na(date)) %>% 
    # Remove year month day
    select(-year, -month, -day) 

head(tmindata)
#      id  key temp       date
# 1 8008A tmin   20 1942-12-01
# 2 8008A tmin    5 1943-01-01
# 3 8008A tmin   55 1943-02-01
# 4 8008A tmin   20 1943-03-01
# 5 8008A tmin   40 1943-04-01
# 6 8008A tmin  109 1943-05-01

You can do the same to filter tmax data

You might have noticed a warning from lubridate above Warning message: 40 failed to parse. That's because some dates such as February 30 don't exist and there is no data for them:

tmissingdate <- temp2 %>% 
    filter(is.na(date))
head(tmissingdate)
#      id year month  key day temp date
# 1 8008A 1943     2 tmax  29   NA <NA>
# 2 8008A 1945     2 tmax  29   NA <NA>
# 3 8008A 1943     2 tmax  30   NA <NA>
# 4 8008A 1944     2 tmax  30   NA <NA>
# 5 8008A 1945     2 tmax  30   NA <NA>
# 6 8008A 1943     2 tmax  31   NA <NA>

Plot of the data

library(ggplot2)
ggplot(temp2, aes(x =  date, y = temp, color = key)) +
    geom_line()

enter image description here

Comments