There are many, many ways to subset data frames and tibbles.
This vignette is an attempt to provide a comprehensive overview of the behavior of the subsetting operators `$`, `[[` and `[`, highlighting where the tibble implementation differs from the data frame implementation.
library(tibble)

# Construct the reference data frame used by the examples: an integer
# column `a`, a character column `b`, and a list column `cd` whose elements
# differ in type and length.
new_df <- function() {
  out <- data.frame(a = seq_len(4))
  out[["b"]] <- letters[5:8]
  out[["cd"]] <- list(9, 10:11, 12:14, "text")
  out
}

# Construct the tibble counterpart by converting the reference data frame.
new_tbl <- function() {
  base_df <- new_df()
  as_tibble(base_df)
}
Results of the same code for data frames and tibbles are presented side by side:
new_df() #> a b cd #> 1 1 e 9 #> 2 2 f 10, 11 #> 3 3 g 12, 13, 14 #> 4 4 h text |
new_tbl() #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]> |
In the following, if the results are identical (after converting to a data frame if necessary), only the tibble result is shown, as in the example below. This makes it easier to spot differences.
new_tbl() #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]> |
Subsetting operations are read-only. The same objects are reused in all examples:
# Shared fixtures: one data frame and one tibble, reused by all examples.
tbl <- new_tbl()
df <- new_df()
With `$` subsetting, tibbles give a warning when accessing a missing column. Inexact (partial) matching is not supported:
tbl$a #> [1] 1 2 3 4 |
|
tbl$"a" #> [1] 1 2 3 4 |
|
tbl$a[2:3] #> [1] 2 3 |
|
tbl$cd #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text" |
|
df$c #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text" |
tbl$c
|
df$d #> NULL |
tbl$d
|
The `exact` argument is not supported by tibbles.
tbl[["a"]] #> [1] 1 2 3 4 |
|
tbl[["cd", exact = TRUE]] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text" |
|
df[["cd", exact = FALSE]] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text" |
tbl[["cd", exact = FALSE]]
|
tbl[["c", exact = TRUE]] #> NULL |
|
df[["c", exact = FALSE]] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text" |
tbl[["c", exact = FALSE]]
|
With two indexes, a single element is returned. List columns are not unpacked by tibbles; the `[[` operator only unpacks columns.
tbl[[2, "a"]] #> [1] 2 |
|
df[[2, "cd"]] #> [1] 10 11 |
tbl[[2, "cd"]] #> [[1]] #> [1] 10 11 |
df[[1:2, "cd"]]
|
tbl[[1:2, "cd"]]
|
tbl[[2, "c"]] #> NULL |
|
df[[1:2, "c"]] #> NULL |
tbl[[1:2, "c"]]
|
Exotic variants like recursive indexing are deprecated for tibbles.
tbl[[c(1, 2)]] #> [1] 2 |
With `[` subsetting, tibbles always return a tibble. The `drop` argument is supported but has different defaults:
tbl["a"] #> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4 |
|
df["a", drop = FALSE]
|
tbl["a", drop = FALSE]
|
df["a", drop = TRUE]
|
tbl["a", drop = TRUE]
|
tbl[1] #> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4 |
|
tbl[0] #> # A tibble: 4 x 0 |
|
df[4]
|
tbl[4]
|
df[NA]
|
tbl[NA]
|
df[NA_character_]
|
tbl[NA_character_]
|
df[NA_integer_]
|
tbl[NA_integer_]
|
The same examples are repeated for two-dimensional indexing when omitting the row index:
df[, "a"] #> [1] 1 2 3 4 |
tbl[, "a"] #> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4 |
tbl[, "a", drop = FALSE] #> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4 |
|
tbl[, "a", drop = TRUE] #> [1] 1 2 3 4 |
|
df[, 1] #> [1] 1 2 3 4 |
tbl[, 1] #> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4 |
tbl[, 0] #> # A tibble: 4 x 0 |
|
df[, 4]
|
tbl[, 4]
|
df[, NA]
|
tbl[, NA]
|
df[, NA_character_]
|
tbl[, NA_character_]
|
df[, NA_integer_]
|
tbl[, NA_integer_]
|
Multiple columns can be queried by passing a vector of column indexes (names, positions, or even a logical vector). With the last option (a logical vector), tibbles are a tad stricter:
tbl[c("a", "b")] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h |
|
tbl[character()] #> # A tibble: 4 x 0 |
|
tbl[1:2] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h |
|
tbl[1:3] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]> |
|
df[1:4]
|
tbl[1:4]
|
tbl[0:2] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h |
|
df[-1:2]
|
tbl[-1:2]
|
tbl[-1] #> # A tibble: 4 x 2 #> b cd #> <chr> <list> #> 1 e <dbl [1]> #> 2 f <int [2]> #> 3 g <int [3]> #> 4 h <chr [1]> |
|
tbl[-(1:2)] #> # A tibble: 4 x 1 #> cd #> <list> #> 1 <dbl [1]> #> 2 <int [2]> #> 3 <int [3]> #> 4 <chr [1]> |
|
tbl[integer()] #> # A tibble: 4 x 0 |
|
tbl[TRUE] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]> |
|
tbl[FALSE] #> # A tibble: 4 x 0 |
|
tbl[c(TRUE, TRUE, FALSE)] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h |
|
tbl[c(FALSE, TRUE, FALSE)] #> # A tibble: 4 x 1 #> b #> <chr> #> 1 e #> 2 f #> 3 g #> 4 h |
|
df[c(FALSE, TRUE)] #> b #> 1 e #> 2 f #> 3 g #> 4 h |
tbl[c(FALSE, TRUE)]
|
df[c(FALSE, TRUE, FALSE, TRUE)]
|
tbl[c(FALSE, TRUE, FALSE, TRUE)]
|
The same examples are repeated for two-dimensional indexing when omitting the row index:
tbl[, c("a", "b")] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h |
|
tbl[, character()] #> # A tibble: 4 x 0 |
|
tbl[, 1:2] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h |
|
tbl[, 1:3] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]> |
|
df[, 1:4]
|
tbl[, 1:4]
|
tbl[, 0:2] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h |
|
df[, -1:2]
|
tbl[, -1:2]
|
tbl[, -1] #> # A tibble: 4 x 2 #> b cd #> <chr> <list> #> 1 e <dbl [1]> #> 2 f <int [2]> #> 3 g <int [3]> #> 4 h <chr [1]> |
|
df[, -(1:2)] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text" |
tbl[, -(1:2)] #> # A tibble: 4 x 1 #> cd #> <list> #> 1 <dbl [1]> #> 2 <int [2]> #> 3 <int [3]> #> 4 <chr [1]> |
tbl[, integer()] #> # A tibble: 4 x 0 |
|
tbl[, TRUE] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]> |
|
tbl[, FALSE] #> # A tibble: 4 x 0 |
|
tbl[, c(TRUE, TRUE, FALSE)] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h |
|
df[, c(FALSE, TRUE, FALSE)] #> [1] "e" "f" "g" "h" |
tbl[, c(FALSE, TRUE, FALSE)] #> # A tibble: 4 x 1 #> b #> <chr> #> 1 e #> 2 f #> 3 g #> 4 h |
df[, c(FALSE, TRUE)] #> [1] "e" "f" "g" "h" |
tbl[, c(FALSE, TRUE)]
|
df[, c(FALSE, TRUE, FALSE, TRUE)]
|
tbl[, c(FALSE, TRUE, FALSE, TRUE)]
|
Row subsetting with integer indexes works almost identically. Out-of-bounds subsetting is not recommended and may lead to an error in future versions. Another special case is subsetting with `[1, , drop = TRUE]`, where the data frame implementation returns a list.
tbl[1, ] #> # A tibble: 1 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> |
|
df[1, , drop = TRUE] #> $a #> [1] 1 #> #> $b #> [1] "e" #> #> $cd #> $cd[[1]] #> [1] 9 |
tbl[1, , drop = TRUE] #> # A tibble: 1 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> |
tbl[1:2, ] #> # A tibble: 2 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> |
|
tbl[0, ] #> # A tibble: 0 x 3 #> # … with 3 variables: a <int>, b <chr>, #> # cd <list> |
|
tbl[integer(), ] #> # A tibble: 0 x 3 #> # … with 3 variables: a <int>, b <chr>, #> # cd <list> |
|
tbl[5, ] #> # A tibble: 1 x 3 #> a b cd #> <int> <chr> <list> #> 1 NA <NA> <NULL> |
|
tbl[4:5, ] #> # A tibble: 2 x 3 #> a b cd #> <int> <chr> <list> #> 1 4 h <chr [1]> #> 2 NA <NA> <NULL> |
|
tbl[-1, ] #> # A tibble: 3 x 3 #> a b cd #> <int> <chr> <list> #> 1 2 f <int [2]> #> 2 3 g <int [3]> #> 3 4 h <chr [1]> |
|
df[-1:2, ]
|
tbl[-1:2, ]
|
tbl[NA, ] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 NA <NA> <NULL> #> 2 NA <NA> <NULL> #> 3 NA <NA> <NULL> #> 4 NA <NA> <NULL> |
|
tbl[NA_integer_, ] #> # A tibble: 1 x 3 #> a b cd #> <int> <chr> <list> #> 1 NA <NA> <NULL> |
|
tbl[c(NA, 1), ] #> # A tibble: 2 x 3 #> a b cd #> <int> <chr> <list> #> 1 NA <NA> <NULL> #> 2 1 e <dbl [1]> |
Row subsetting with logical indexes also works almost identically; with tibbles, the index vector must have length one or the number of rows.
tbl[TRUE, ] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]> |
|
tbl[FALSE, ] #> # A tibble: 0 x 3 #> # … with 3 variables: a <int>, b <chr>, #> # cd <list> |
|
df[c(TRUE, FALSE), ] #> a b cd #> 1 1 e 9 #> 3 3 g 12, 13, 14 |
tbl[c(TRUE, FALSE), ]
|
df[c(TRUE, FALSE, TRUE), ] #> a b cd #> 1 1 e 9 #> 3 3 g 12, 13, 14 #> 4 4 h text |
tbl[c(TRUE, FALSE, TRUE), ]
|
tbl[c(TRUE, FALSE, TRUE, FALSE), ] #> # A tibble: 2 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 3 g <int [3]> |
|
df[c(TRUE, FALSE, TRUE, FALSE, TRUE), ] #> a b cd #> 1 1 e 9 #> 3 3 g 12, 13, 14 #> NA NA <NA> NULL |
tbl[c(TRUE, FALSE, TRUE, FALSE, TRUE), ]
|
Indexing both row and column works more or less the same, except for `drop`:
df[1, "a"] #> [1] 1 |
tbl[1, "a"] #> # A tibble: 1 x 1 #> a #> <int> #> 1 1 |
tbl[1, "a", drop = FALSE] #> # A tibble: 1 x 1 #> a #> <int> #> 1 1 |
|
tbl[1, "a", drop = TRUE] #> [1] 1 |
|
df[1:2, "a"] #> [1] 1 2 |
tbl[1:2, "a"] #> # A tibble: 2 x 1 #> a #> <int> #> 1 1 #> 2 2 |
tbl[1:2, "a", drop = FALSE] #> # A tibble: 2 x 1 #> a #> <int> #> 1 1 #> 2 2 |
|
tbl[1:2, "a", drop = TRUE] #> [1] 1 2 |
|
tbl[1, c("a", "b")] #> # A tibble: 1 x 2 #> a b #> <int> <chr> #> 1 1 e |
|
tbl[1, c("a", "b"), drop = FALSE] #> # A tibble: 1 x 2 #> a b #> <int> <chr> #> 1 1 e |
|
df[1, c("a", "b"), drop = TRUE] #> $a #> [1] 1 #> #> $b #> [1] "e" |
tbl[1, c("a", "b"), drop = TRUE] #> # A tibble: 1 x 2 #> a b #> <int> <chr> #> 1 1 e |
tbl[1:2, c("a", "b")] #> # A tibble: 2 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f |
Indexes can be omitted altogether; there are no differences here:
tbl[] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]> |
|
tbl[,] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]> |