Pivot a frame from long to wide format

Description

Only available in eager mode. See "Examples" section below for how to do a "lazy pivot" if you know the unique column values in advance.

Usage

<DataFrame>$pivot(
  on,
  ...,
  index = NULL,
  values = NULL,
  aggregate_function = NULL,
  maintain_order = TRUE,
  sort_columns = FALSE,
  separator = "_"
)

Arguments

on The column(s) whose values will be used as the new columns of the output DataFrame.

… These dots are for future extensions and must be empty.

index The column(s) that remain from the input to the output. The output DataFrame will have one row for each unique combination of the index’s values. If NULL, all remaining columns not specified in on and values will be used. At least one of index and values must be specified.

values The existing column(s) of values which will be moved under the new columns from index. If an aggregation is specified, these are the values on which the aggregation will be computed. If NULL, all remaining columns not specified in on and index will be used. At least one of index and values must be specified.

aggregate_function

Choose from:

NULL: no aggregation takes place, will raise error if multiple values are in group;
A predefined aggregate function string, one of “min”, “max”, “first”, “last”, “sum”, “mean”, “median”, “len”;
An expression to do the aggregation.

maintain_order Ensure the values of index are sorted by discovery order.

sort_columns Sort the transposed columns by name. Default is by order of discovery.

separator Used as separator/delimiter in generated column names in case of multiple values columns.

Value

A polars DataFrame

Examples

library("polars")

# Suppose we have a dataframe of test scores achieved by some students,
# where each row represents a distinct test.
df <- pl$DataFrame(
  name = c("Cady", "Cady", "Karen", "Karen"),
  subject = c("maths", "physics", "maths", "physics"),
  test_1 = c(98, 99, 61, 58),
  test_2 = c(100, 100, 60, 60)
)
df

#> shape: (4, 4)
#> ┌───────┬─────────┬────────┬────────┐
#> │ name  ┆ subject ┆ test_1 ┆ test_2 │
#> │ ---   ┆ ---     ┆ ---    ┆ ---    │
#> │ str   ┆ str     ┆ f64    ┆ f64    │
#> ╞═══════╪═════════╪════════╪════════╡
#> │ Cady  ┆ maths   ┆ 98.0   ┆ 100.0  │
#> │ Cady  ┆ physics ┆ 99.0   ┆ 100.0  │
#> │ Karen ┆ maths   ┆ 61.0   ┆ 60.0   │
#> │ Karen ┆ physics ┆ 58.0   ┆ 60.0   │
#> └───────┴─────────┴────────┴────────┘

# Using pivot(), we can reshape so we have one row per student, with
# different subjects as columns, and their `test_1` scores as values:
df$pivot("subject", index = "name", values = "test_1")

#> shape: (2, 3)
#> ┌───────┬───────┬─────────┐
#> │ name  ┆ maths ┆ physics │
#> │ ---   ┆ ---   ┆ ---     │
#> │ str   ┆ f64   ┆ f64     │
#> ╞═══════╪═══════╪═════════╡
#> │ Cady  ┆ 98.0  ┆ 99.0    │
#> │ Karen ┆ 61.0  ┆ 58.0    │
#> └───────┴───────┴─────────┘

# If you end up with multiple values per cell, you can specify how to
# aggregate them with `aggregate_function`:
df <- pl$DataFrame(
  ix = c(1, 1, 2, 2, 1, 2),
  col = c("a", "a", "a", "a", "b", "b"),
  foo = c(0, 1, 2, 2, 7, 1),
  bar = c(0, 2, 0, 0, 9, 4)
)
df

#> shape: (6, 4)
#> ┌─────┬─────┬─────┬─────┐
#> │ ix  ┆ col ┆ foo ┆ bar │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ f64 ┆ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╪═════╡
#> │ 1.0 ┆ a   ┆ 0.0 ┆ 0.0 │
#> │ 1.0 ┆ a   ┆ 1.0 ┆ 2.0 │
#> │ 2.0 ┆ a   ┆ 2.0 ┆ 0.0 │
#> │ 2.0 ┆ a   ┆ 2.0 ┆ 0.0 │
#> │ 1.0 ┆ b   ┆ 7.0 ┆ 9.0 │
#> │ 2.0 ┆ b   ┆ 1.0 ┆ 4.0 │
#> └─────┴─────┴─────┴─────┘

df$pivot("col", index = "ix", aggregate_function = "sum")

#> shape: (2, 5)
#> ┌─────┬───────┬───────┬───────┬───────┐
#> │ ix  ┆ foo_a ┆ foo_b ┆ bar_a ┆ bar_b │
#> │ --- ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
#> │ f64 ┆ f64   ┆ f64   ┆ f64   ┆ f64   │
#> ╞═════╪═══════╪═══════╪═══════╪═══════╡
#> │ 1.0 ┆ 1.0   ┆ 7.0   ┆ 2.0   ┆ 9.0   │
#> │ 2.0 ┆ 4.0   ┆ 1.0   ┆ 0.0   ┆ 4.0   │
#> └─────┴───────┴───────┴───────┴───────┘

# You can also pass a custom aggregation function using `pl$element()`:
df <- pl$DataFrame(
  col1 = c("a", "a", "a", "b", "b", "b"),
  col2 = c("x", "x", "x", "x", "y", "y"),
  col3 = c(6, 7, 3, 2, 5, 7),
)
df$pivot(
  "col2",
  index = "col1",
  values = "col3",
  aggregate_function = pl$element()$tanh()$mean(),
)

#> shape: (2, 3)
#> ┌──────┬──────────┬──────────┐
#> │ col1 ┆ x        ┆ y        │
#> │ ---  ┆ ---      ┆ ---      │
#> │ str  ┆ f64      ┆ f64      │
#> ╞══════╪══════════╪══════════╡
#> │ a    ┆ 0.998347 ┆ null     │
#> │ b    ┆ 0.964028 ┆ 0.999954 │
#> └──────┴──────────┴──────────┘

# Note that pivot is only available in eager mode. If you know the unique
# column values in advance, you can use `$group_by()` on a LazyFrame to get
# the same result as above in lazy mode:
index <- pl$col("col1")
on <- pl$col("col2")
values <- pl$col("col3")
unique_column_values <- c("x", "y")
aggregate_function <- \(col) col$tanh()$mean()
funs <- lapply(unique_column_values, \(value) {
  aggregate_function(values$filter(on == value))$alias(value)
})
df$lazy()$group_by(index)$agg(!!!funs)$collect()

#> shape: (2, 3)
#> ┌──────┬──────────┬──────────┐
#> │ col1 ┆ x        ┆ y        │
#> │ ---  ┆ ---      ┆ ---      │
#> │ str  ┆ f64      ┆ f64      │
#> ╞══════╪══════════╪══════════╡
#> │ b    ┆ 0.964028 ┆ 0.999954 │
#> │ a    ┆ 0.998347 ┆ null     │
#> └──────┴──────────┴──────────┘