CodeShaman CodeShaman - 4 months ago 15
R Question

Split string based on alternating character in R

I'm trying to figure out an efficient way to go about splitting a string like

"111110000011110000111000"


into a vector

[1] "11111" "00000" "1111" "0000" "111" "000"


where "0" and "1" can be any alternating characters.

Answer

Try

strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  

Update

A modification of @rawr's solution with stri_extract_all_regex

library(stringi)
stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"  


stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"  
#[10] "000"  

stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
#[1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"  
#[8] "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"     
#[15] "D"       "aa"      "BB"     

Benchmarks

library(stringi) 
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)

akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
#modified @thelatemail's function to make it bit more general
thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3, 
            perl=TRUE))[[1]]
rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
ananda <- function() unlist(read.fwf(textConnection(x3), 
                rle(strsplit(x3, "")[[1]])$lengths, 
                colClasses = "character"))
Colonel <- function() with(rle(strsplit(x3,'')[[1]]), 
   mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))

Cryo <- function(){
   res_vector=rep(NA_character_,nchar(x3))
  res_vector[1]=substr(x3,1,1)
  counter=1
  old_tmp=''

   for (i in 2:nchar(x3)) {
    tmp=substr(x3,i,i)
    if (tmp==old_tmp) {
    res_vector[counter]=paste0(res_vector[counter],tmp)
    } else {
    res_vector[counter+1]=tmp
    counter=counter+1
    }
  old_tmp=tmp
   }

 res_vector[!is.na(res_vector)]
  }


 richard <- function(){
     cs <- cumsum(
     rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
   )
   stri_sub(x3, c(1, head(cs + 1, -1)), cs)
  }

 nicola<-function(x) {
   indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
   substring(x,indices[-length(indices)]+1,indices[-1])
 }

 richard2 <- function() {
  cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
  stri_sub(x3, c(1, head(cs + 1, -1)), cs)
 }

system.time(akrun())
# user  system elapsed 
# 0.003   0.000   0.003 

system.time(thelate())
#   user  system elapsed 
#  0.272   0.001   0.274 

system.time(rawr())
# user  system elapsed 
#  0.397   0.001   0.398 

system.time(ananda())
#  user  system elapsed 
# 3.744   0.204   3.949 

system.time(Colonel())
#   user  system elapsed 
#  0.154   0.001   0.154 

system.time(Cryo())
#  user  system elapsed 
# 0.220   0.005   0.226 

system.time(richard())
#  user  system elapsed 
# 0.007   0.000   0.006 

system.time(nicola(x3))
# user  system elapsed 
# 0.190   0.001   0.191 

On a slightly bigger string,

set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

system.time(akrun())
#user  system elapsed 
#0.166   0.000   0.155 
system.time(richard())
#  user  system elapsed 
# 0.606   0.000   0.569 
system.time(richard2())
#  user  system elapsed 
# 0.518   0.000   0.487 

system.time(Colonel())
#  user  system elapsed 
# 9.631   0.000   9.358 


library(microbenchmark)
 microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
 #Unit: relative
 #     expr      min       lq     mean   median       uq      max neval cld
 # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581    20   b
 #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861    20   b
 # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20  a 

NOTE: Tried to run the other methods, but it takes a long time

data

str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"