advance advance - 1 month ago 39
R Question

Dplyr: group_by and convert multiple columns to a vector

I have a question about how to convert multiple columns to a vector. I have the following dataset, which I would like to group by condition, collecting all the position counts of each group into one vector. I know I can use as.vector() to convert the columns individually, but I wonder if there is a dplyr way. Thank you!

# Example data: one gene ("gene0"), 8 conditions x 3 codons = 24 rows, with
# the read count at each of the three positions of every codon.
# Note: the assignment must be `<-`; `test -> structure(...)` would try to
# assign *into* the call and never create `test`.
test <- structure(list(gene_id = c("gene0", "gene0", "gene0", "gene0",
"gene0", "gene0", "gene0", "gene0", "gene0", "gene0", "gene0",
"gene0", "gene0", "gene0", "gene0", "gene0", "gene0", "gene0",
"gene0", "gene0", "gene0", "gene0", "gene0", "gene0"), codon_index = c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L), position_1_count = c(2L, 7L, 8L,
0L, 2L, 22L, 19L, 15L, 134L, 1L, 127L, 30L, 0L, 0L, 1L, 4L, 65L,
234L, 1L, 3L, 57L, 0L, 4L, 16L), position_2_count = c(0L, 5L,
5L, 0L, 3L, 2L, 3L, 13L, 134L, 0L, 36L, 5L, 0L, 0L, 0L, 1L, 150L,
7L, 0L, 7L, 7L, 0L, 6L, 1L), position_3_count = c(0L, 2L, 1L,
0L, 4L, 0L, 3L, 32L, 43L, 3L, 9L, 1L, 0L, 0L, 0L, 4L, 105L, 1L,
0L, 14L, 5L, 0L, 6L, 1L), condition = structure(c(1L, 1L, 1L,
7L, 7L, 7L, 3L, 3L, 3L, 5L, 5L, 5L, 8L, 8L, 8L, 2L, 2L, 2L, 4L,
4L, 4L, 6L, 6L, 6L), .Label = c("c", "cup", "n", "nup", "p",
"pup", "min", "rich"), class = "factor")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -24L), .Names = c("gene_id",
"codon_index", "position_1_count", "position_2_count", "position_3_count",
"condition"))

> head(test)
# A tibble: 6 × 6
gene_id codon_index position_1_count position_2_count position_3_count condition
<chr> <int> <int> <int> <int> <fctr>
1 gene0 1 2 0 0 c
2 gene0 2 7 5 2 c
3 gene0 3 8 5 1 c
4 gene0 1 0 0 0 min
5 gene0 2 2 3 4 min
6 gene0 3 22 2 0 min


How can we convert this dataset to the following (column names omitted)?

2 0 0 7 5 2 8 5 1 c
0 0 0 2 3 4 22 2 0 min

Answer

Another alternative:

library(dplyr)

# `slice_rows()` / `by_slice()` are no longer part of purrr (they were moved
# to the purrrlyr package), so the same list-column is built with dplyr.
# For every condition, `vec` holds one integer vector: all position_1 counts,
# then all position_2 counts, then all position_3 counts (column-wise order,
# matching `unlist()` on the three count columns, but without element names).
test %>%
  group_by(condition) %>%
  summarise(
    vec = list(c(position_1_count, position_2_count, position_3_count))
  )

Which gives:

## A tibble: 8 × 2
#  condition       vec
#     <fctr>    <list>
#1         c <int [9]>
#2       cup <int [9]>
#3         n <int [9]>
#4       nup <int [9]>
#5         p <int [9]>
#6       pup <int [9]>
#7       min <int [9]>
#8      rich <int [9]>

As mentioned by @DavidArenburg, if you want the result row-wise:

library(dplyr)
library(tidyr)

# Row-wise flattening: for each condition, walk the codons in index order and
# emit the position 1, 2, 3 counts of each codon, joined into a single
# comma-separated string (`vec`).
test %>%
  select(-gene_id) %>%
  # Fix the codon order up front so the result does not depend on the
  # incoming row order; pivot_longer() keeps rows in input order and emits
  # the pivoted columns of each row consecutively.
  arrange(condition, codon_index) %>%
  # pivot_longer() replaces the superseded gather().
  pivot_longer(
    cols = -c(condition, codon_index),
    names_to = "key",
    values_to = "value"
  ) %>%
  group_by(condition) %>%
  # A single grouping variable means summarise() returns an ungrouped tibble.
  summarise(vec = toString(value))

Which gives:

## A tibble: 8 × 2
#  condition                                vec
#     <fctr>                              <chr>
#1         c          2, 0, 0, 7, 5, 2, 8, 5, 1
#2       cup   4, 1, 4, 65, 150, 105, 234, 7, 1
#3         n 19, 3, 3, 15, 13, 32, 134, 134, 43
#4       nup        1, 0, 0, 3, 7, 14, 57, 7, 5
#5         p      1, 0, 3, 127, 36, 9, 30, 5, 1
#6       pup         0, 0, 0, 4, 6, 6, 16, 1, 1
#7       min         0, 0, 0, 2, 3, 4, 22, 2, 0
#8      rich          0, 0, 0, 0, 0, 0, 1, 0, 0