Title: | Fast Tidying of Data |
---|---|
Description: | Tidying functions built on 'data.table' to provide quick and efficient data manipulation with minimal overhead. |
Authors: | Tyson Barrett [aut, cre] , Mark Fairbanks [ctb], Ivan Leung [ctb], Indrajeet Patil [ctb] (<https://orcid.org/0000-0003-1995-6531>, @patilindrajeets) |
Maintainer: | Tyson Barrett <[email protected]> |
License: | GPL-3 |
Version: | 0.4.0 |
Built: | 2024-11-04 04:34:20 UTC |
Source: | https://github.com/tysonstanley/tidyfast |
Tidying functions built on 'data.table' to provide quick and efficient data manipulation with minimal overhead.
Maintainer: Tyson Barrett [email protected] (ORCID)
Other contributors:
Mark Fairbanks [contributor]
Ivan Leung [contributor]
Indrajeet Patil [email protected] (ORCID) (@patilindrajeets) [contributor]
Does what dplyr::case_when()
does, with the same syntax, but with
data.table::fcase()
under the hood.
dt_case_when(...)
dt_case_when(...)
... |
statements of the form: `condition ~ label`, where the label is applied if the condition is met |
Vector of the same size as the input vector
x <- rnorm(100) dt_case_when( x < median(x) ~ "low", x >= median(x) ~ "high", is.na(x) ~ "other" ) library(data.table) temp <- data.table( pseudo_id = c(1, 2, 3, 4, 5), x = sample(1:5, 5, replace = TRUE) ) temp[, y := dt_case_when( pseudo_id == 1 ~ x * 1, pseudo_id == 2 ~ x * 2, pseudo_id == 3 ~ x * 3, pseudo_id == 4 ~ x * 4, pseudo_id == 5 ~ x * 5 )]
x <- rnorm(100) dt_case_when( x < median(x) ~ "low", x >= median(x) ~ "high", is.na(x) ~ "other" ) library(data.table) temp <- data.table( pseudo_id = c(1, 2, 3, 4, 5), x = sample(1:5, 5, replace = TRUE) ) temp[, y := dt_case_when( pseudo_id == 1 ~ x * 1, pseudo_id == 2 ~ x * 2, pseudo_id == 3 ~ x * 3, pseudo_id == 4 ~ x * 4, pseudo_id == 5 ~ x * 5 )]
Count the numbers of observations within groups
dt_count(dt_, ..., na.rm = FALSE, wt = NULL)
dt_count(dt_, ..., na.rm = FALSE, wt = NULL)
dt_ |
the data table to count |
... |
groups |
na.rm |
should any rows with missingness be removed before the count? Default is `FALSE`. |
wt |
the wt assigned to the counts (same number of rows as the data) |
A data.table with counts for each group (or combination of groups)
library(data.table) dt <- data.table( x = rnorm(1e5), y = runif(1e5), grp = sample(1L:3L, 1e5, replace = TRUE), wt = runif(1e5, 1, 100) ) dt_count(dt, grp) dt_count(dt, grp, na.rm = TRUE) dt_count(dt, grp, na.rm = TRUE, wt = wt)
library(data.table) dt <- data.table( x = rnorm(1e5), y = runif(1e5), grp = sample(1L:3L, 1e5, replace = TRUE), wt = runif(1e5, 1, 100) ) dt_count(dt, grp) dt_count(dt, grp, na.rm = TRUE) dt_count(dt, grp, na.rm = TRUE, wt = wt)
Fills in values, similar to tidyr::fill()
, but within data.table
. This function relies on the
Rcpp
functions that drive tidyr::fill()
but applies them within data.table
.
dt_fill( dt_, ..., id = NULL, .direction = c("down", "up", "downup", "updown"), immutable = TRUE )
dt_fill( dt_, ..., id = NULL, .direction = c("down", "up", "downup", "updown"), immutable = TRUE )
dt_ |
the data table (or if not a data.table then it is coerced with as.data.table) |
... |
the columns to fill |
id |
the grouping variable(s) to fill within |
.direction |
either "down" or "up" (down fills values down, up fills values up), or "downup" (down first then up) or "updown" (up first then down) |
immutable |
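If `TRUE` (the default), the input data table is treated as immutable: it is not modified in place and the result must be assigned. Set `immutable = FALSE` to modify the input object by reference. |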
A data.table with listed columns having values filled in
set.seed(84322) library(data.table) x <- 1:10 dt <- data.table( v1 = x, v2 = shift(x), v3 = shift(x, -1L), v4 = sample(c(rep(NA, 10), x), 10), grp = sample(1:3, 10, replace = TRUE) ) dt_fill(dt, v2, v3, v4, id = grp, .direction = "downup") dt_fill(dt, v2, v3, v4, id = grp) dt_fill(dt, .direction = "up")
set.seed(84322) library(data.table) x <- 1:10 dt <- data.table( v1 = x, v2 = shift(x), v3 = shift(x, -1L), v4 = sample(c(rep(NA, 10), x), 10), grp = sample(1:3, 10, replace = TRUE) ) dt_fill(dt, v2, v3, v4, id = grp, .direction = "downup") dt_fill(dt, v2, v3, v4, id = grp) dt_fill(dt, .direction = "up")
Quickly unnest vectors nested in list columns. Still experimental (has some potentially unexpected behavior in some situations)!
dt_hoist(dt_, ...)
dt_hoist(dt_, ...)
dt_ |
the data table to unnest |
... |
the columns to unnest (must all be the same length when unnested); use bare names of the variables |
library(data.table) dt <- data.table( x = rnorm(1e5), y = runif(1e5), nested1 = lapply(1:10, sample, 10, replace = TRUE), nested2 = lapply(c("thing1", "thing2"), sample, 10, replace = TRUE), id = 1:1e5 ) dt_hoist(dt, nested1, nested2)
library(data.table) dt <- data.table( x = rnorm(1e5), y = runif(1e5), nested1 = lapply(1:10, sample, 10, replace = TRUE), nested2 = lapply(c("thing1", "thing2"), sample, 10, replace = TRUE), id = 1:1e5 ) dt_hoist(dt, nested1, nested2)
Quickly nest data tables (similar to dplyr::group_nest()
).
dt_nest(dt_, ..., .key = "data")
dt_nest(dt_, ..., .key = "data")
dt_ |
the data table to nest |
... |
the variables to group by |
.key |
the name of the list column; default is "data" |
A data.table with a list column containing data.tables
library(data.table) dt <- data.table( x = rnorm(1e5), y = runif(1e5), grp = sample(1L:3L, 1e5, replace = TRUE) ) dt_nest(dt, grp)
library(data.table) dt <- data.table( x = rnorm(1e5), y = runif(1e5), grp = sample(1L:3L, 1e5, replace = TRUE) ) dt_nest(dt, grp)
dt_pivot_longer()
"lengthens" data, increasing the number of rows and
decreasing the number of columns. The inverse transformation is
dt_pivot_wider()
. Syntax based on the tidyr
equivalents.
dt_pivot_longer( dt_, cols = NULL, names_to = "name", values_to = "value", values_drop_na = FALSE, ... )
dt_pivot_longer( dt_, cols = NULL, names_to = "name", values_to = "value", values_drop_na = FALSE, ... )
dt_ |
The data table to pivot longer |
cols |
Column selection. If empty, uses all columns. Can use -colname to unselect column(s) |
names_to |
Name of the new "names" column. Must be a string. |
values_to |
Name of the new "values" column. Must be a string. |
values_drop_na |
If TRUE, rows will be dropped that contain NAs. |
... |
Additional arguments to pass to 'melt.data.table()' |
A reshaped data.table into longer format
library(data.table) example_dt <- data.table(x = c(1, 2, 3), y = c(4, 5, 6), z = c("a", "b", "c")) dt_pivot_longer(example_dt, cols = c(x, y), names_to = "stuff", values_to = "things" ) dt_pivot_longer(example_dt, cols = -z, names_to = "stuff", values_to = "things" )
library(data.table) example_dt <- data.table(x = c(1, 2, 3), y = c(4, 5, 6), z = c("a", "b", "c")) dt_pivot_longer(example_dt, cols = c(x, y), names_to = "stuff", values_to = "things" ) dt_pivot_longer(example_dt, cols = -z, names_to = "stuff", values_to = "things" )
dt_pivot_wider()
"widens" data, increasing the number of columns and
decreasing the number of rows. The inverse transformation is
dt_pivot_longer()
. Syntax based on the tidyr
equivalents.
dt_pivot_wider(dt_, id_cols = NULL, names_from, names_sep = "_", values_from)
dt_pivot_wider(dt_, id_cols = NULL, names_from, names_sep = "_", values_from)
dt_ |
the data table to widen |
id_cols |
A set of columns that uniquely identifies each observation. Defaults to all columns in the data table except for the columns specified in `names_from` and `values_from`. |
names_from |
A pair of arguments describing which column (or columns) to get the name of the output column (`names_from`), and which column (or columns) to get the cell values from (`values_from`). |
names_sep |
the separator between the names of the columns |
values_from |
A pair of arguments describing which column (or columns) to get the name of the output column (`names_from`), and which column (or columns) to get the cell values from (`values_from`). |
A reshaped data.table into wider format
library(data.table) example_dt <- data.table( z = rep(c("a", "b", "c"), 2), stuff = c(rep("x", 3), rep("y", 3)), things = 1:6 ) dt_pivot_wider(example_dt, names_from = stuff, values_from = things) dt_pivot_wider(example_dt, names_from = stuff, values_from = things, id_cols = z)
library(data.table) example_dt <- data.table( z = rep(c("a", "b", "c"), 2), stuff = c(rep("x", 3), rep("y", 3)), things = 1:6 ) dt_pivot_wider(example_dt, names_from = stuff, values_from = things) dt_pivot_wider(example_dt, names_from = stuff, values_from = things, id_cols = z)
The function allows the user to define options relating to the print method for data.table
.
dt_print_options( class = TRUE, topn = 5, rownames = TRUE, nrows = 100, trunc.cols = TRUE )
dt_print_options( class = TRUE, topn = 5, rownames = TRUE, nrows = 100, trunc.cols = TRUE )
class |
should the variable class be printed? (`options("datatable.print.class")`) |
topn |
the number of rows to print (both head and tail) if `nrow(DT) > nrows` (`options("datatable.print.topn")`) |
rownames |
should rownames be printed? (`options("datatable.print.rownames")`) |
nrows |
total number of rows to print (`options("datatable.print.nrows")`) |
trunc.cols |
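if `TRUE`, only the columns that fit in the console are printed, with a message listing the columns not shown (`options("datatable.print.trunc.cols")`) |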
None. This function is used for its side effect of changing options.
dt_print_options( class = TRUE, topn = 5, rownames = TRUE, nrows = 100, trunc.cols = TRUE )
dt_print_options( class = TRUE, topn = 5, rownames = TRUE, nrows = 100, trunc.cols = TRUE )
Separates a column of data into others, by splitting based on a separator or regular expression
dt_separate( dt_, col, into, sep = ".", remove = TRUE, fill = NA, fixed = TRUE, immutable = TRUE, dev = FALSE, ... )
dt_separate( dt_, col, into, sep = ".", remove = TRUE, fill = NA, fixed = TRUE, immutable = TRUE, dev = FALSE, ... )
dt_ |
the data table (or if not a data.table then it is coerced with as.data.table) |
col |
the column to separate |
into |
the names of the new columns created from splitting |
sep |
the regular expression stating how `col` should be separated. Default is `"."`. |
remove |
should `col` be removed from the returned data table? Default is `TRUE`. |
fill |
if empty, fill is inserted. Default is `NA`. |
fixed |
logical. If TRUE match split exactly, otherwise use regular expressions. Has priority over perl. |
immutable |
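If `TRUE` (the default), the input data table is treated as immutable: it is not modified in place and the result must be assigned. Set `immutable = FALSE` to modify the input object by reference. |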
dev |
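If `TRUE`, the function can be used within other functions; it bypasses the usual non-standard evaluation. Default is `FALSE`. |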
... |
arguments passed to `data.table::tstrsplit()` |
A data.table with a column split into multiple columns.
library(data.table) d <- data.table( x = c("A.B", "A", "B", "B.A"), y = 1:4 ) # defaults dt_separate(d, x, c("c1", "c2")) # can keep the original column with `remove = FALSE` dt_separate(d, x, c("c1", "c2"), remove = FALSE) # need to assign when `immutable = TRUE` separated <- dt_separate(d, x, c("c1", "c2"), immutable = TRUE) separated # don't need to assign when `immutable = FALSE` (default) dt_separate(d, x, c("c1", "c2"), immutable = FALSE) d
library(data.table) d <- data.table( x = c("A.B", "A", "B", "B.A"), y = 1:4 ) # defaults dt_separate(d, x, c("c1", "c2")) # can keep the original column with `remove = FALSE` dt_separate(d, x, c("c1", "c2"), remove = FALSE) # need to assign when `immutable = TRUE` separated <- dt_separate(d, x, c("c1", "c2"), immutable = TRUE) separated # don't need to assign when `immutable = FALSE` (default) dt_separate(d, x, c("c1", "c2"), immutable = FALSE) d
These functions allow you to select variables based on their names.
dt_starts_with()
: Starts with a prefix
dt_ends_with()
: Ends with a suffix
dt_contains()
: Contains a literal string
dt_everything()
: Matches all variables
dt_starts_with(match) dt_contains(match) dt_ends_with(match) dt_everything()
dt_starts_with(match) dt_contains(match) dt_ends_with(match) dt_everything()
match |
a character string to match to variable names |
None. To be used within the dt_pivot_*
functions.
library(data.table) # example of using it with `dt_pivot_longer()` df <- data.table(row = 1, var = c("x", "y"), a = 1:2, b = 3:4) pv <- dt_pivot_wider(df, names_from = var, values_from = c(dt_starts_with("a"), dt_ends_with("b")) )
library(data.table) # example of using it with `dt_pivot_longer()` df <- data.table(row = 1, var = c("x", "y"), a = 1:2, b = 3:4) pv <- dt_pivot_wider(df, names_from = var, values_from = c(dt_starts_with("a"), dt_ends_with("b")) )
Uncount a counted data table
dt_uncount(dt_, weights, .remove = TRUE, .id = NULL)
dt_uncount(dt_, weights, .remove = TRUE, .id = NULL)
dt_ |
the data table to uncount |
weights |
the counts for each row (the number of times each row should be repeated) |
.remove |
should the weights variable be removed? |
.id |
an optional new id variable, providing a unique id for each row |
A data.table with a row for each uncounted column.
library(data.table) dt_count <- data.table( x = LETTERS[1:3], w = c(2, 1, 4) ) uncount <- dt_uncount(dt_count, w, .id = "id") uncount[] # note that `[]` forces the printing
library(data.table) dt_count <- data.table( x = LETTERS[1:3], w = c(2, 1, 4) ) uncount <- dt_uncount(dt_count, w, .id = "id") uncount[] # note that `[]` forces the printing
Quickly unnest data tables, particularly those nested by dt_nest()
.
dt_unnest(dt_, col, keep = TRUE)
dt_unnest(dt_, col, keep = TRUE)
dt_ |
the data table to unnest |
col |
the column to unnest |
keep |
whether to keep the nested column, default is `TRUE` |
library(data.table) dt <- data.table( x = rnorm(1e5), y = runif(1e5), grp = sample(1L:3L, 1e5, replace = TRUE) ) nested <- dt_nest(dt, grp) dt_unnest(nested, col = data)
library(data.table) dt <- data.table( x = rnorm(1e5), y = runif(1e5), grp = sample(1L:3L, 1e5, replace = TRUE) ) nested <- dt_nest(dt, grp) dt_unnest(nested, col = data)