Pivot a frame from long to wide format
Description
Only available in eager mode. See "Examples" section below for how to do a "lazy pivot" if you know the unique column values in advance.
Usage
<DataFrame>$pivot(
on,
...,
index = NULL,
values = NULL,
aggregate_function = NULL,
maintain_order = TRUE,
sort_columns = FALSE,
separator = "_"
)
Arguments
on
|
The column(s) whose values will be used as the new columns of the output DataFrame. |
…
|
These dots are for future extensions and must be empty. |
index
|
The column(s) that remain from the input to the output. The output
DataFrame will have one row for each unique combination of the
index ’s values. If NULL , all remaining columns
not specified in on and values will be used.
At least one of index and values must be
specified.
|
values
|
The existing column(s) of values which will be moved under the new
columns from index . If an aggregation is specified, these
are the values on which the aggregation will be computed. If
NULL , all remaining columns not specified in
on and index will be used. At least one of
index and values must be specified.
|
aggregate_function
|
Choose from:
|
maintain_order
|
Ensure the values of index are sorted by discovery order.
|
sort_columns
|
Sort the transposed columns by name. Default is by order of discovery. |
separator
|
Used as separator/delimiter in generated column names in case of multiple values columns. |
Value
A polars DataFrame
Examples
library("polars")
# Suppose we have a dataframe of test scores achieved by some students,
# where each row represents a distinct test.
df <- pl$DataFrame(
name = c("Cady", "Cady", "Karen", "Karen"),
subject = c("maths", "physics", "maths", "physics"),
test_1 = c(98, 99, 61, 58),
test_2 = c(100, 100, 60, 60)
)
df
#> shape: (4, 4)
#> ┌───────┬─────────┬────────┬────────┐
#> │ name ┆ subject ┆ test_1 ┆ test_2 │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ str ┆ f64 ┆ f64 │
#> ╞═══════╪═════════╪════════╪════════╡
#> │ Cady ┆ maths ┆ 98.0 ┆ 100.0 │
#> │ Cady ┆ physics ┆ 99.0 ┆ 100.0 │
#> │ Karen ┆ maths ┆ 61.0 ┆ 60.0 │
#> │ Karen ┆ physics ┆ 58.0 ┆ 60.0 │
#> └───────┴─────────┴────────┴────────┘
# Using pivot(), we can reshape so we have one row per student, with
# different subjects as columns, and their `test_1` scores as values:
df$pivot("subject", index = "name", values = "test_1")
#> shape: (2, 3)
#> ┌───────┬───────┬─────────┐
#> │ name ┆ maths ┆ physics │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═══════╪═══════╪═════════╡
#> │ Cady ┆ 98.0 ┆ 99.0 │
#> │ Karen ┆ 61.0 ┆ 58.0 │
#> └───────┴───────┴─────────┘
# If you end up with multiple values per cell, you can specify how to
# aggregate them with `aggregate_function`:
df <- pl$DataFrame(
ix = c(1, 1, 2, 2, 1, 2),
col = c("a", "a", "a", "a", "b", "b"),
foo = c(0, 1, 2, 2, 7, 1),
bar = c(0, 2, 0, 0, 9, 4)
)
df
#> shape: (6, 4)
#> ┌─────┬─────┬─────┬─────┐
#> │ ix ┆ col ┆ foo ┆ bar │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ f64 ┆ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╪═════╡
#> │ 1.0 ┆ a ┆ 0.0 ┆ 0.0 │
#> │ 1.0 ┆ a ┆ 1.0 ┆ 2.0 │
#> │ 2.0 ┆ a ┆ 2.0 ┆ 0.0 │
#> │ 2.0 ┆ a ┆ 2.0 ┆ 0.0 │
#> │ 1.0 ┆ b ┆ 7.0 ┆ 9.0 │
#> │ 2.0 ┆ b ┆ 1.0 ┆ 4.0 │
#> └─────┴─────┴─────┴─────┘
#> shape: (2, 5)
#> ┌─────┬───────┬───────┬───────┬───────┐
#> │ ix ┆ foo_a ┆ foo_b ┆ bar_a ┆ bar_b │
#> │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
#> │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
#> ╞═════╪═══════╪═══════╪═══════╪═══════╡
#> │ 1.0 ┆ 1.0 ┆ 7.0 ┆ 2.0 ┆ 9.0 │
#> │ 2.0 ┆ 4.0 ┆ 1.0 ┆ 0.0 ┆ 4.0 │
#> └─────┴───────┴───────┴───────┴───────┘
# You can also pass a custom aggregation function using `pl$element()`:
df <- pl$DataFrame(
col1 = c("a", "a", "a", "b", "b", "b"),
col2 = c("x", "x", "x", "x", "y", "y"),
col3 = c(6, 7, 3, 2, 5, 7),
)
df$pivot(
"col2",
index = "col1",
values = "col3",
aggregate_function = pl$element()$tanh()$mean(),
)
#> shape: (2, 3)
#> ┌──────┬──────────┬──────────┐
#> │ col1 ┆ x ┆ y │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞══════╪══════════╪══════════╡
#> │ a ┆ 0.998347 ┆ null │
#> │ b ┆ 0.964028 ┆ 0.999954 │
#> └──────┴──────────┴──────────┘
# Note that pivot is only available in eager mode. If you know the unique
# column values in advance, you can use `$group_by()` on a LazyFrame to get
# the same result as above in lazy mode:
index <- pl$col("col1")
on <- pl$col("col2")
values <- pl$col("col3")
unique_column_values <- c("x", "y")
aggregate_function <- \(col) col$tanh()$mean()
funs <- lapply(unique_column_values, \(value) {
aggregate_function(values$filter(on == value))$alias(value)
})
df$lazy()$group_by(index)$agg(!!!funs)$collect()
#> shape: (2, 3)
#> ┌──────┬──────────┬──────────┐
#> │ col1 ┆ x ┆ y │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞══════╪══════════╪══════════╡
#> │ b ┆ 0.964028 ┆ 0.999954 │
#> │ a ┆ 0.998347 ┆ null │
#> └──────┴──────────┴──────────┘