Subsetting • tibble

There are many, many ways to subset data frames and tibbles.

This vignette is an attempt to provide a comprehensive overview of the behavior of the subsetting operators $, [[ and [, highlighting where the tibble implementation differs from the data frame implementation.

library(tibble)
# Build the example data frame: integer column `a`, character column `b`,
# and list column `cd` whose elements differ in type and length.
new_df <- function() {
  out <- data.frame(a = seq_len(4))
  out[["b"]] <- c("e", "f", "g", "h")
  out[["cd"]] <- list(9, 10:11, 12:14, "text")
  out
}
# Build the tibble counterpart of the example data frame by converting
# the result of new_df() with as_tibble().
new_tbl <- function() {
  base_frame <- new_df()
  as_tibble(base_frame)
}

Results of the same code for data frames and tibbles are presented side by side:

new_df()
#>   a b         cd
#> 1 1 e          9
#> 2 2 f     10, 11
#> 3 3 g 12, 13, 14
#> 4 4 h       text

new_tbl()
#> # A tibble: 4 x 3
#>       a b     cd       
#>   <int> <chr> <list>   
#> 1     1 e     <dbl [1]>
#> 2     2 f     <int [2]>
#> 3     3 g     <int [3]>
#> 4     4 h     <chr [1]>

In the following, if the results are identical (after converting to a data frame if necessary), only the tibble result is shown, as in the example below. This makes differences easier to spot.

new_tbl()
#> # A tibble: 4 x 3
#>       a b     cd       
#>   <int> <chr> <list>   
#> 1     1 e     <dbl [1]>
#> 2     2 f     <int [2]>
#> 3     3 g     <int [3]>
#> 4     4 h     <chr [1]>

Subsetting operations are read-only. The same objects are reused in all examples:

# Shared fixtures: one data frame and one tibble, reused by the examples below.
df <- new_df()
tbl <- new_tbl()

$

With $ subsetting, accessing a missing column gives a warning. Inexact matching is not supported:

	tbl$a #> [1] 1 2 3 4
	tbl$"a" #> [1] 1 2 3 4
	tbl$a[2:3] #> [1] 2 3
	tbl$cd #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"
df$c #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"	tbl$c #> Warning: Unknown or uninitialised #> column: `c`. `#> NULL`
df$d #> NULL	tbl$d #> Warning: Unknown or uninitialised #> column: `d`. `#> NULL`

[[

The exact argument is not supported by tibbles.

	tbl[["a"]] #> [1] 1 2 3 4
	tbl[["cd", exact = TRUE]] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"
df[["cd", exact = FALSE]] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"	tbl[["cd", exact = FALSE]] #> Warning: `exact` ignored. `#> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"`
	tbl[["c", exact = TRUE]] #> NULL
df[["c", exact = FALSE]] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"	tbl[["c", exact = FALSE]] #> Warning: `exact` ignored. `#> NULL`

With two indexes, a single element is returned. List columns are not unpacked by tibbles; the [[ only unpacks columns.

	tbl[[2, "a"]] #> [1] 2
df[[2, "cd"]] #> [1] 10 11	tbl[[2, "cd"]] #> [[1]] #> [1] 10 11
df[[1:2, "cd"]] `#> Error in col[[i, exact = exact]]: #> subscript out of bounds`	tbl[[1:2, "cd"]] #> Error: Must extract row with a single #> valid subscript. #> x The subscript `1:2` has size 2 but #> must be size 1.
	tbl[[2, "c"]] #> NULL
df[[1:2, "c"]] #> NULL	tbl[[1:2, "c"]] #> Error: Must extract row with a single #> valid subscript. #> x The subscript `1:2` has size 2 but #> must be size 1.

Exotic variants like recursive indexing are deprecated for tibbles.

tbl[[c(1, 2)]]
#> [1] 2

[

With [ subsetting, tibbles always return a tibble. The drop argument is supported but has different defaults:

	tbl["a"] #> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4
df["a", drop = FALSE] #> Warning in `[.data.frame`(df, "a", #> drop = FALSE): 'drop' argument will be #> ignored `#> a #> 1 1 #> 2 2 #> 3 3 #> 4 4`	tbl["a", drop = FALSE] #> Warning: `drop` argument ignored for #> subsetting a tibble with `x[j]`, it has #> an effect only for `x[i, j]`. `#> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4`
df["a", drop = TRUE] #> Warning in `[.data.frame`(df, "a", drop #> = TRUE): 'drop' argument will be ignored `#> a #> 1 1 #> 2 2 #> 3 3 #> 4 4`	tbl["a", drop = TRUE] #> Warning: `drop` argument ignored for #> subsetting a tibble with `x[j]`, it has #> an effect only for `x[i, j]`. `#> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4`
	tbl[1] #> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4
	tbl[0] #> # A tibble: 4 x 0
df[4] #> Error in `[.data.frame`(df, 4): #> undefined columns selected	tbl[4] `#> Error: Can't subset columns that don't #> exist. #> x The location 4 doesn't exist. #> ℹ There are only 3 columns.`
df[NA] #> Error in `[.data.frame`(df, NA): #> undefined columns selected	tbl[NA] #> Error: Can't use NA as column index with #> `[` at position 1.
df[NA_character_] #> Error in `[.data.frame`(df, #> NA_character_): undefined columns #> selected	tbl[NA_character_] #> Error: Can't use NA as column index with #> `[` at position 1.
df[NA_integer_] #> Error in `[.data.frame`(df, #> NA_integer_): undefined columns selected	tbl[NA_integer_] #> Error: Can't use NA as column index with #> `[` at position 1.

The same examples are repeated for two-dimensional indexing when omitting the row index:

df[, "a"] #> [1] 1 2 3 4	tbl[, "a"] #> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4
	tbl[, "a", drop = FALSE] #> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4
	tbl[, "a", drop = TRUE] #> [1] 1 2 3 4
df[, 1] #> [1] 1 2 3 4	tbl[, 1] #> # A tibble: 4 x 1 #> a #> <int> #> 1 1 #> 2 2 #> 3 3 #> 4 4
	tbl[, 0] #> # A tibble: 4 x 0
df[, 4] #> Error in `[.data.frame`(df, , 4): #> undefined columns selected	tbl[, 4] `#> Error: Can't subset columns that don't #> exist. #> x The location 4 doesn't exist. #> ℹ There are only 3 columns.`
df[, NA] #> Error in `[.data.frame`(df, , NA): #> undefined columns selected	tbl[, NA] #> Error: Can't use NA as column index with #> `[` at position 1.
df[, NA_character_] #> Error in `[.data.frame`(df, , #> NA_character_): undefined columns #> selected	tbl[, NA_character_] #> Error: Can't use NA as column index with #> `[` at position 1.
df[, NA_integer_] #> Error in `[.data.frame`(df, , #> NA_integer_): undefined columns selected	tbl[, NA_integer_] #> Error: Can't use NA as column index with #> `[` at position 1.

Multiple columns can be queried by passing a vector of column indexes (names, positions, or even a logical vector). With the latter option, tibbles are a tad stricter:

	tbl[c("a", "b")] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h
	tbl[character()] #> # A tibble: 4 x 0
	tbl[1:2] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h
	tbl[1:3] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]>
df[1:4] #> Error in `[.data.frame`(df, 1:4): #> undefined columns selected	tbl[1:4] `#> Error: Can't subset columns that don't #> exist. #> x The location 4 doesn't exist. #> ℹ There are only 3 columns.`
	tbl[0:2] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h
df[-1:2] #> Error in `[.default`(df, -1:2): only 0's #> may be mixed with negative subscripts	tbl[-1:2] #> Error: Must subset columns with a valid #> subscript vector. #> x Negative locations can't be mixed with #> positive locations. #> ℹ The subscript `-1:2` has 2 positive #> values at locations 3 and 4.
	tbl[-1] #> # A tibble: 4 x 2 #> b cd #> <chr> <list> #> 1 e <dbl [1]> #> 2 f <int [2]> #> 3 g <int [3]> #> 4 h <chr [1]>
	tbl[-(1:2)] #> # A tibble: 4 x 1 #> cd #> <list> #> 1 <dbl [1]> #> 2 <int [2]> #> 3 <int [3]> #> 4 <chr [1]>
	tbl[integer()] #> # A tibble: 4 x 0
	tbl[TRUE] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]>
	tbl[FALSE] #> # A tibble: 4 x 0
	tbl[c(TRUE, TRUE, FALSE)] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h
	tbl[c(FALSE, TRUE, FALSE)] #> # A tibble: 4 x 1 #> b #> <chr> #> 1 e #> 2 f #> 3 g #> 4 h
df[c(FALSE, TRUE)] #> b #> 1 e #> 2 f #> 3 g #> 4 h	tbl[c(FALSE, TRUE)] #> Error: Must subset columns with a valid #> subscript vector. #> ℹ Logical subscripts must match the size #> of the indexed input. #> x The input has size 3 but the subscript #> `c(FALSE, TRUE)` has size 2.
df[c(FALSE, TRUE, FALSE, TRUE)] #> Error in `[.data.frame`(df, c(FALSE, #> TRUE, FALSE, TRUE)): undefined columns #> selected	tbl[c(FALSE, TRUE, FALSE, TRUE)] #> Error: Must subset columns with a valid #> subscript vector. #> ℹ Logical subscripts must match the size #> of the indexed input. #> x The input has size 3 but the subscript #> `c(FALSE, TRUE, FALSE, TRUE)` has size #> 4.

The same examples are repeated for two-dimensional indexing when omitting the row index:

	tbl[, c("a", "b")] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h
	tbl[, character()] #> # A tibble: 4 x 0
	tbl[, 1:2] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h
	tbl[, 1:3] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]>
df[, 1:4] #> Error in `[.data.frame`(df, , 1:4): #> undefined columns selected	tbl[, 1:4] `#> Error: Can't subset columns that don't #> exist. #> x The location 4 doesn't exist. #> ℹ There are only 3 columns.`
	tbl[, 0:2] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h
df[, -1:2] `#> Error in .subset(x, j): only 0's may be #> mixed with negative subscripts`	tbl[, -1:2] #> Error: Must subset columns with a valid #> subscript vector. #> x Negative locations can't be mixed with #> positive locations. #> ℹ The subscript `-1:2` has 2 positive #> values at locations 3 and 4.
	tbl[, -1] #> # A tibble: 4 x 2 #> b cd #> <chr> <list> #> 1 e <dbl [1]> #> 2 f <int [2]> #> 3 g <int [3]> #> 4 h <chr [1]>
df[, -(1:2)] #> [[1]] #> [1] 9 #> #> [[2]] #> [1] 10 11 #> #> [[3]] #> [1] 12 13 14 #> #> [[4]] #> [1] "text"	tbl[, -(1:2)] #> # A tibble: 4 x 1 #> cd #> <list> #> 1 <dbl [1]> #> 2 <int [2]> #> 3 <int [3]> #> 4 <chr [1]>
	tbl[, integer()] #> # A tibble: 4 x 0
	tbl[, TRUE] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]>
	tbl[, FALSE] #> # A tibble: 4 x 0
	tbl[, c(TRUE, TRUE, FALSE)] #> # A tibble: 4 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f #> 3 3 g #> 4 4 h
df[, c(FALSE, TRUE, FALSE)] #> [1] "e" "f" "g" "h"	tbl[, c(FALSE, TRUE, FALSE)] #> # A tibble: 4 x 1 #> b #> <chr> #> 1 e #> 2 f #> 3 g #> 4 h
df[, c(FALSE, TRUE)] #> [1] "e" "f" "g" "h"	tbl[, c(FALSE, TRUE)] #> Error: Must subset columns with a valid #> subscript vector. #> ℹ Logical subscripts must match the size #> of the indexed input. #> x The input has size 3 but the subscript #> `c(FALSE, TRUE)` has size 2.
df[, c(FALSE, TRUE, FALSE, TRUE)] #> Error in `[.data.frame`(df, , c(FALSE, #> TRUE, FALSE, TRUE)): undefined columns #> selected	tbl[, c(FALSE, TRUE, FALSE, TRUE)] #> Error: Must subset columns with a valid #> subscript vector. #> ℹ Logical subscripts must match the size #> of the indexed input. #> x The input has size 3 but the subscript #> `c(FALSE, TRUE, FALSE, TRUE)` has size #> 4.

Row subsetting with integer indexes works almost identically. Out-of-bounds subsetting is not recommended and may lead to an error in future versions. Another special case is subsetting with [1, , drop = TRUE] where the data frame implementation returns a list.

	tbl[1, ] #> # A tibble: 1 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]>
df[1, , drop = TRUE] #> $a #> [1] 1 #> #> $b #> [1] "e" #> #> $cd #> $cd[[1]] #> [1] 9	tbl[1, , drop = TRUE] #> # A tibble: 1 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]>
	tbl[1:2, ] #> # A tibble: 2 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]>
	tbl[0, ] #> # A tibble: 0 x 3 #> # … with 3 variables: a <int>, b <chr>, #> # cd <list>
	tbl[integer(), ] #> # A tibble: 0 x 3 #> # … with 3 variables: a <int>, b <chr>, #> # cd <list>
	tbl[5, ] #> # A tibble: 1 x 3 #> a b cd #> <int> <chr> <list> #> 1 NA <NA> <NULL>
	tbl[4:5, ] #> # A tibble: 2 x 3 #> a b cd #> <int> <chr> <list> #> 1 4 h <chr [1]> #> 2 NA <NA> <NULL>
	tbl[-1, ] #> # A tibble: 3 x 3 #> a b cd #> <int> <chr> <list> #> 1 2 f <int [2]> #> 2 3 g <int [3]> #> 3 4 h <chr [1]>
df[-1:2, ] `#> Error in xj[i]: only 0's may be mixed #> with negative subscripts`	tbl[-1:2, ] #> Error: Must subset rows with a valid #> subscript vector. #> x Negative locations can't be mixed with #> positive locations. #> ℹ The subscript `-1:2` has 2 positive #> values at locations 3 and 4.
	tbl[NA, ] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 NA <NA> <NULL> #> 2 NA <NA> <NULL> #> 3 NA <NA> <NULL> #> 4 NA <NA> <NULL>
	tbl[NA_integer_, ] #> # A tibble: 1 x 3 #> a b cd #> <int> <chr> <list> #> 1 NA <NA> <NULL>
	tbl[c(NA, 1), ] #> # A tibble: 2 x 3 #> a b cd #> <int> <chr> <list> #> 1 NA <NA> <NULL> #> 2 1 e <dbl [1]>

Row subsetting with logical indexes also works almost identically; with tibbles, the index vector must have length one or the number of rows.

	tbl[TRUE, ] #> # A tibble: 4 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 2 f <int [2]> #> 3 3 g <int [3]> #> 4 4 h <chr [1]>
	tbl[FALSE, ] #> # A tibble: 0 x 3 #> # … with 3 variables: a <int>, b <chr>, #> # cd <list>
df[c(TRUE, FALSE), ] #> a b cd #> 1 1 e 9 #> 3 3 g 12, 13, 14	tbl[c(TRUE, FALSE), ] #> Error: Must subset rows with a valid #> subscript vector. #> ℹ Logical subscripts must match the size #> of the indexed input. #> x The input has size 4 but the subscript #> `c(TRUE, FALSE)` has size 2.
df[c(TRUE, FALSE, TRUE), ] #> a b cd #> 1 1 e 9 #> 3 3 g 12, 13, 14 #> 4 4 h text	tbl[c(TRUE, FALSE, TRUE), ] #> Error: Must subset rows with a valid #> subscript vector. #> ℹ Logical subscripts must match the size #> of the indexed input. #> x The input has size 4 but the subscript #> `c(TRUE, FALSE, TRUE)` has size 3.
	tbl[c(TRUE, FALSE, TRUE, FALSE), ] #> # A tibble: 2 x 3 #> a b cd #> <int> <chr> <list> #> 1 1 e <dbl [1]> #> 2 3 g <int [3]>
df[c(TRUE, FALSE, TRUE, FALSE, TRUE), ] #> a b cd #> 1 1 e 9 #> 3 3 g 12, 13, 14 #> NA NA <NA> NULL	tbl[c(TRUE, FALSE, TRUE, FALSE, TRUE), ] #> Error: Must subset rows with a valid #> subscript vector. #> ℹ Logical subscripts must match the size #> of the indexed input. #> x The input has size 4 but the subscript #> `c(TRUE, FALSE, TRUE, FALSE, TRUE)` has #> size 5.

Indexing both row and column works more or less the same, except for drop:

df[1, "a"] #> [1] 1	tbl[1, "a"] #> # A tibble: 1 x 1 #> a #> <int> #> 1 1
	tbl[1, "a", drop = FALSE] #> # A tibble: 1 x 1 #> a #> <int> #> 1 1
	tbl[1, "a", drop = TRUE] #> [1] 1
df[1:2, "a"] #> [1] 1 2	tbl[1:2, "a"] #> # A tibble: 2 x 1 #> a #> <int> #> 1 1 #> 2 2
	tbl[1:2, "a", drop = FALSE] #> # A tibble: 2 x 1 #> a #> <int> #> 1 1 #> 2 2
	tbl[1:2, "a", drop = TRUE] #> [1] 1 2
	tbl[1, c("a", "b")] #> # A tibble: 1 x 2 #> a b #> <int> <chr> #> 1 1 e
	tbl[1, c("a", "b"), drop = FALSE] #> # A tibble: 1 x 2 #> a b #> <int> <chr> #> 1 1 e
df[1, c("a", "b"), drop = TRUE] #> $a #> [1] 1 #> #> $b #> [1] "e"	tbl[1, c("a", "b"), drop = TRUE] #> # A tibble: 1 x 2 #> a b #> <int> <chr> #> 1 1 e
	tbl[1:2, c("a", "b")] #> # A tibble: 2 x 2 #> a b #> <int> <chr> #> 1 1 e #> 2 2 f

Indexes can be omitted altogether; there are no differences here:

tbl[]
#> # A tibble: 4 x 3
#>       a b     cd       
#>   <int> <chr> <list>   
#> 1     1 e     <dbl [1]>
#> 2     2 f     <int [2]>
#> 3     3 g     <int [3]>
#> 4     4 h     <chr [1]>

tbl[,]
#> # A tibble: 4 x 3
#>       a b     cd       
#>   <int> <chr> <list>   
#> 1     1 e     <dbl [1]>
#> 2     2 f     <int [2]>
#> 3     3 g     <int [3]>
#> 4     4 h     <chr [1]>