14

I have the following vector called input:

input <- c(1,2,1,NA,3,2,NA,1,5,6,NA,2,2)

[1]  1  2  1 NA  3  2 NA  1  5  6 NA  2  2

I would like to split this vector into multiple vectors by each NA. So the desired output should look like this:

> output
[[1]]
[1] 1 2 1

[[2]]
[1] 3 2

[[3]]
[1] 1 5 6

[[4]]
[1] 2 2

As you can see, every time an NA appears, the input is split into a new vector. Does anyone know how to split a vector at each NA into multiple vectors?

6 Answers 6

10

Using a similar logic to @tpetzoldt, but removing the NAs before the split:

split(na.omit(input), cumsum(is.na(input))[!is.na(input)])

$`0`
[1] 1 2 1

$`1`
[1] 3 2

$`2`
[1] 1 5 6

$`3`
[1] 2 2
8

One way could go like follows:

  1. identify the NAs
  2. do cumsum
  3. split according to the cumulative sums
  4. remove the NAs
input <- c(1,2,1,NA,3,2,NA,1,5,6,NA,2,2)
tmp <- cumsum(is.na(input))
lapply(split(input, tmp), na.omit)
2
  • 1
    It's a bit more verbose, but you can get a cleaner solution if you remove attributes after na.omit: lapply(split(input, tmp), \(x) na.omit(x) |> magrittr::set_attributes(NULL) ). Or adjust the anonymous function to base R.
    – Santiago
    Commented Dec 4, 2022 at 11:19
  • 2
    Thanks @Santiago for the suggestion. Without additional packages one can also use lapply(split(input, tmp), na.omit) |> lapply(\(x) {attributes(x) <- NULL; x}) .
    – tpetzoldt
    Commented Dec 4, 2022 at 18:21
4

This one is too verbose and overcomplicated, but for me it is easier to think of such problems, just wanted to share:

library(tidyverse)

tibble(input) %>% 
  group_by(id = cumsum(is.na(input))) %>% 
  na.omit %>% 
  group_split() %>% 
  map(.,~(.x %>%select(-id))) %>% 
  map(.,~(.x %>%pull))

[[1]]
[1] 1 2 1

[[2]]
[1] 3 2

[[3]]
[1] 1 5 6

[[4]]
[1] 2 2
4

Here's a solution that is not verbose:

strsplit(paste(input, collapse = " "), " NA ")
[[1]]
[1] "1 2 1" "3 2"   "1 5 6" "2 2" 
1
  • 3
    This is very elegant, but the output is a list containing a single character vector of strings rather than a list where each item is a numeric vector
    – stevec
    Commented Dec 5, 2022 at 15:22
3

Another way, quite similar to those of @tpetzoldt and @tmfmnk, that also removes the NAs.

input <- c(1,2,1,NA,NA,3,2,NA,1,5,6,NA,2,2) #Add consecutive NA

. <- is.na(input)
split(input[!.], cumsum(.)[!.])
#$`0`
#[1] 1 2 1
#
#$`2`
#[1] 3 2
#
#$`3`
#[1] 1 5 6
#
#$`4`
#[1] 2 2

Or the other way round

i <- !is.na(input)
split(input[i], cumsum(!i)[i])

or even

i <- is.na(input)
j <- which(!i)
split(input[j], cumsum(i)[j])

In case the empty groups between consecutive NAs should be kept rather than dropped, just convert the grouping vector to a factor.

split(input[!.], factor(cumsum(.))[!.])
#$`0`
#[1] 1 2 1
#
#$`1`
#numeric(0)
#
#$`2`
#[1] 3 2
#
#$`3`
#[1] 1 5 6
#
#$`4`
#[1] 2 2

Or another way, not using split.

i <- is.na(input)
j <- which(i)
Map(\(...) input[seq(...)], from = c(1, 1+j), by = 1,
    length.out = c(j-1, length(i)) - c(0, j))
#[[1]]
#[1] 1 2 1
#
#[[2]]
#numeric(0)
#
#[[3]]
#[1] 3 2
#
#[[4]]
#[1] 1 5 6
#
#[[5]]
#[1] 2 2

Or using Rcpp:

# Compile a C++ helper, splitByNa(x), that walks the numeric vector x once:
# it starts with one empty group, opens a new empty group at every NA, and
# appends every non-NA value to the most recent group. Consecutive NAs
# therefore produce empty numeric(0) elements in the returned list.
Rcpp::cppFunction("
Rcpp::List splitByNa(const Rcpp::NumericVector& x) {
  std::vector< std::vector<double> > res;
  res.push_back(std::vector<double>());
  for(auto const& y : x) {
    if(NumericVector::is_na(y)) res.push_back(std::vector<double>());
    else res.back().push_back(y);
  }
  return wrap( res );
}")

splitByNa(input)
#[[1]]
#[1] 1 2 1
#
#[[2]]
#numeric(0)
#
#[[3]]
#[1] 3 2
#
#[[4]]
#[1] 1 5 6
#
#[[5]]
#[1] 2 2

Benchmark

set.seed(42)
n <- 1e5
input <- sample(c(1:9, NA), n, TRUE)

library(tidyverse) #for TarJae

# Benchmark every proposed solution on the same `input`.
# check = FALSE skips the comparison of results, since the approaches do not
# all return identical objects (e.g. ChrisR returns strings, GKi4/GKi5 keep
# empty groups for consecutive NAs).
bench::mark(check = FALSE,
tmfmnk = split(na.omit(input), cumsum(is.na(input))[!is.na(input)]),
tpetzoldt = {tmp <- cumsum(is.na(input))
    lapply(split(input, tmp), na.omit)},
TarJae = {tibble(input) %>% 
  group_by(id = cumsum(is.na(input))) %>% 
  na.omit %>% 
  group_split() %>% 
  map(.,~(.x %>%select(-id))) %>% 
      map(.,~(.x %>%pull))},
ChrisR = strsplit(paste(input, collapse = " "), " NA "), #Returns String
Thomas = split(na.omit(input), findInterval(seq_along(input)[!is.na(input)], which(is.na(input)))),
GKi1 = {. <- is.na(input); split(input[!.], cumsum(.)[!.])},
GKi2 = {i <- !is.na(input); split(input[i], cumsum(!i)[i])},
# Use the locally computed `i` here; relying on a leftover global `.` would
# make this entry depend on an earlier snippet or on GKi1 having run first.
GKi3 = {i <- is.na(input); j <- which(!i); split(input[j], cumsum(i)[j])},
GKi4 = {i <- is.na(input)
j <- which(i)
Map(\(...) input[seq(...)], from = c(1, 1+j), by = 1,
  length.out = c(j-1, length(i)) - c(0, j))},
GKi5 = splitByNa(input)
)
   express…¹     min  median itr/s…² mem_al…³ gc/se…⁴ n_itr  n_gc total…⁵ result
   <bch:exp> <bch:t> <bch:t>   <dbl> <bch:by>   <dbl> <int> <dbl> <bch:t> <list>
 1 tmfmnk     4.86ms  5.29ms 8.77e+1   7.95MB   24.9     67    19 764.3ms <NULL>
 2 tpetzoldt 37.32ms 38.24ms 2.44e+1    4.4MB    5.63    13     3 532.5ms <NULL>
 3 TarJae     10.88s  10.88s 9.19e-2 109.69MB    3.68     1    40   10.9s <NULL>
 4 ChrisR    13.72ms 13.94ms 6.75e+1    1.8MB    1.99    34     1 503.6ms <NULL>
 5 Thomas     5.46ms  5.74ms 1.55e+2   8.71MB   25.8     78    13 503.5ms <NULL>
 6 GKi1       4.67ms  4.92ms 1.77e+2   6.63MB   27.8     89    14 502.7ms <NULL>
 7 GKi2       4.68ms  4.92ms 1.79e+2   6.63MB   29.8     90    15 504.2ms <NULL>
 8 GKi3       4.33ms  4.54ms 1.20e+2   5.52MB   15.9     60     8 501.9ms <NULL>
 9 GKi4      56.37ms 61.99ms 1.64e+1   1.88MB    5.47     9     3 548.5ms <NULL>
10 GKi5       2.41ms  2.72ms 3.20e+2   1.26MB    9.99   160     5 500.3ms <NULL>

The Rcpp version is the fastest, has lowest memory consumption and is able to handle consecutive NA.

1

We can use split + findInterval as well

> split(na.omit(input), findInterval(seq_along(input)[!is.na(input)], which(is.na(input))))
$`0`
[1] 1 2 1

$`1`
[1] 3 2

$`2`
[1] 1 5 6

$`3`
[1] 2 2

Not the answer you're looking for? Browse other questions tagged or ask your own question.