Skip to content

top_k_by

Return the elements corresponding to the k largest elements of the by column(s)

Description

Non-null elements are always preferred over null elements. The output is not guaranteed to be in any particular order; call $sort() after this function if you wish the output to be sorted. This has time complexity O(n).

Usage

<Expr>$top_k_by(by, k = 5, ..., reverse = FALSE)

Arguments

by Column(s) used to determine the largest elements. Accepts expression input. Strings are parsed as column names.
k Number of elements to return.
... These dots are for future extensions and must be empty.
reverse Consider the k smallest elements of the by column(s) (instead of the k largest). This can be specified per column by passing a logical vector with one value per by column.

Value

A polars expression

Examples

library("polars")

df <- pl$DataFrame(
  a = 1:6,
  b = 6:1,
  c = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana")
)

# Get the top 2 rows by column a or b:
df$select(
  pl$all()$top_k_by("a", 2)$name$suffix("_btm_by_a"),
  pl$all()$top_k_by("b", 2)$name$suffix("_btm_by_b")
)
#> shape: (2, 6)
#> ┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
#> │ a_btm_by_a ┆ b_btm_by_a ┆ c_btm_by_a ┆ a_btm_by_b ┆ b_btm_by_b ┆ c_btm_by_b │
#> │ ---        ┆ ---        ┆ ---        ┆ ---        ┆ ---        ┆ ---        │
#> │ i32        ┆ i32        ┆ str        ┆ i32        ┆ i32        ┆ str        │
#> ╞════════════╪════════════╪════════════╪════════════╪════════════╪════════════╡
#> │ 6          ┆ 1          ┆ Banana     ┆ 1          ┆ 6          ┆ Apple      │
#> │ 5          ┆ 2          ┆ Banana     ┆ 2          ┆ 5          ┆ Orange     │
#> └────────────┴────────────┴────────────┴────────────┴────────────┴────────────┘
# Get the top 2 rows by multiple columns with given order.
df$select(
  pl$all()$
    top_k_by(c("c", "a"), 2, reverse = c(FALSE, TRUE))$
    name$suffix("_btm_by_ca"),
  pl$all()$
    top_k_by(c("c", "b"), 2, reverse = c(FALSE, TRUE))$
    name$suffix("_btm_by_cb"),
)
#> shape: (2, 6)
#> ┌─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
#> │ a_btm_by_ca ┆ b_btm_by_ca ┆ c_btm_by_ca ┆ a_btm_by_cb ┆ b_btm_by_cb ┆ c_btm_by_cb │
#> │ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ ---         │
#> │ i32         ┆ i32         ┆ str         ┆ i32         ┆ i32         ┆ str         │
#> ╞═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
#> │ 2           ┆ 5           ┆ Orange      ┆ 2           ┆ 5           ┆ Orange      │
#> │ 5           ┆ 2           ┆ Banana      ┆ 6           ┆ 1           ┆ Banana      │
#> └─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘
# Get the top 2 rows by column a in each group
df$group_by("c", .maintain_order = TRUE)$agg(
  pl$all()$top_k_by("a", 2)
)$explode(pl$all()$exclude("c"))
#> shape: (5, 3)
#> ┌────────┬─────┬─────┐
#> │ c      ┆ a   ┆ b   │
#> │ ---    ┆ --- ┆ --- │
#> │ str    ┆ i32 ┆ i32 │
#> ╞════════╪═════╪═════╡
#> │ Apple  ┆ 4   ┆ 3   │
#> │ Apple  ┆ 3   ┆ 4   │
#> │ Orange ┆ 2   ┆ 5   │
#> │ Banana ┆ 6   ┆ 1   │
#> │ Banana ┆ 5   ┆ 2   │
#> └────────┴─────┴─────┘