CodeShaman - 1 year ago 74
R Question

# Split string based on alternating character in R

I'm trying to figure out an efficient way to go about splitting a string like

``````"111110000011110000111000"
``````

into a vector

``````[1] "11111" "00000" "1111" "0000" "111" "000"
``````

where "0" and "1" can be any alternating characters.

Try

``````strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"
``````

### Update

A modification of @rawr's solution with `stri_extract_all_regex`

``````library(stringi)
stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"

stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"
#[10] "000"

stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
#[1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"
#[8] "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"
#[15] "D"       "aa"      "BB"
``````

### Benchmarks

``````library(stringi)
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)

akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
#modified @thelatemail's function to make it bit more general
thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3,
perl=TRUE))[[1]]
rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
ananda <- function() unlist(read.fwf(textConnection(x3),
rle(strsplit(x3, "")[[1]])\$lengths,
colClasses = "character"))
Colonel <- function() with(rle(strsplit(x3,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))

Cryo <- function(){
res_vector=rep(NA_character_,nchar(x3))
res_vector[1]=substr(x3,1,1)
counter=1
old_tmp=''

for (i in 2:nchar(x3)) {
tmp=substr(x3,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}

res_vector[!is.na(res_vector)]
}

richard <- function(){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])\$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}

nicola<-function(x) {
indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
substring(x,indices[-length(indices)]+1,indices[-1])
}

richard2 <- function() {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}

system.time(akrun())
# user  system elapsed
# 0.003   0.000   0.003

system.time(thelate())
#   user  system elapsed
#  0.272   0.001   0.274

system.time(rawr())
# user  system elapsed
#  0.397   0.001   0.398

system.time(ananda())
#  user  system elapsed
# 3.744   0.204   3.949

system.time(Colonel())
#   user  system elapsed
#  0.154   0.001   0.154

system.time(Cryo())
#  user  system elapsed
# 0.220   0.005   0.226

system.time(richard())
#  user  system elapsed
# 0.007   0.000   0.006

system.time(nicola(x3))
# user  system elapsed
# 0.190   0.001   0.191
``````

On a slightly bigger string,

``````set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

system.time(akrun())
#user  system elapsed
#0.166   0.000   0.155
system.time(richard())
#  user  system elapsed
# 0.606   0.000   0.569
system.time(richard2())
#  user  system elapsed
# 0.518   0.000   0.487

system.time(Colonel())
#  user  system elapsed
# 9.631   0.000   9.358

library(microbenchmark)
microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
#Unit: relative
#     expr      min       lq     mean   median       uq      max neval cld
# richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581    20   b
#richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861    20   b
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20  a
``````

NOTE: Tried to run the other methods, but it takes a long time

### data

``````str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"