Group based on a date/time or integer column
Description
Time windows are calculated and rows are assigned to windows. Different from a normal group by is that a row can be member of multiple groups. By default, the windows look like:
- \[start, start + period)
- \[start + every, start + every + period)
- \[start + 2 \* every, start + 2 \* every + period)
- …
where start
is determined by start_by
,
offset
, every
, and the earliest datapoint. See
the start_by
argument description for details.
Usage
<LazyFrame>$group_by_dynamic(
index_column,
...,
every,
period = NULL,
offset = NULL,
include_boundaries = FALSE,
closed = c("left", "right", "both", "none"),
label = c("left", "right", "datapoint"),
group_by = NULL,
start_by = "window"
)
Arguments
index_column
|
Column used to group based on the time window. Often of type
Date/Datetime. This column must be sorted in ascending order (or, if
group_by is specified, then it must be sorted in ascending
order within each group). In case of a dynamic group by on indices, the
data type needs to be either Int32 or In64. Note that Int32 gets
temporarily cast to Int64, so if performance matters, use an Int64
column.
|
…
|
These dots are for future extensions and must be empty. |
every
|
Interval of the window. |
period
|
Length of the window. If NULL (default), it will equal
every .
|
offset
|
Offset of the window, does not take effect if start_by =
“datapoint” . Defaults to zero.
|
include_boundaries
|
Add two columns “\_lower_boundary” and
“\_upper_boundary” columns that show the boundaries of the
window. This will impact performance because it’s harder to parallelize.
|
closed
|
Define which sides of the interval are closed (inclusive). Default is
“left” .
|
label
|
Define which label to use for the window:
|
group_by
|
Also group by this column/these columns. Can be expressions or objects coercible to expressions. |
start_by
|
The strategy to determine the start of the first window by:
|
Details
The every
, period
, and offset
arguments are created with the following string language:
- 1ns \# 1 nanosecond
- 1us \# 1 microsecond
- 1ms \# 1 millisecond
- 1s \# 1 second
- 1m \# 1 minute
- 1h \# 1 hour
- 1d \# 1 day
- 1w \# 1 calendar week
- 1mo \# 1 calendar month
-
1y \# 1 calendar year These strings can be combined:
- 3d12h4m25s \# 3 days, 12 hours, 4 minutes, and 25 seconds
In case of a group_by_dynamic
on an integer column, the
windows are defined by:
- 1i \# length 1
- 10i \# length 10
Value
An object of class polars_lazy_group_by
See Also
-
\
$rolling()
Examples
library("polars")
lf <- pl$select(
time = pl$datetime_range(
start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
interval = "30m"
),
n = 0:6
)$lazy()
lf$collect()
#> shape: (7, 2)
#> ┌─────────────────────────┬─────┐
#> │ time ┆ n │
#> │ --- ┆ --- │
#> │ datetime[ns, UTC] ┆ i32 │
#> ╞═════════════════════════╪═════╡
#> │ 2021-12-16 00:00:00 UTC ┆ 0 │
#> │ 2021-12-16 00:30:00 UTC ┆ 1 │
#> │ 2021-12-16 01:00:00 UTC ┆ 2 │
#> │ 2021-12-16 01:30:00 UTC ┆ 3 │
#> │ 2021-12-16 02:00:00 UTC ┆ 4 │
#> │ 2021-12-16 02:30:00 UTC ┆ 5 │
#> │ 2021-12-16 03:00:00 UTC ┆ 6 │
#> └─────────────────────────┴─────┘
# Group by windows of 1 hour.
lf$group_by_dynamic("time", every = "1h", closed = "right")$agg(
vals = pl$col("n")
)$collect()
#> shape: (4, 2)
#> ┌─────────────────────────┬───────────┐
#> │ time ┆ vals │
#> │ --- ┆ --- │
#> │ datetime[ns, UTC] ┆ list[i32] │
#> ╞═════════════════════════╪═══════════╡
#> │ 2021-12-15 23:00:00 UTC ┆ [0] │
#> │ 2021-12-16 00:00:00 UTC ┆ [1, 2] │
#> │ 2021-12-16 01:00:00 UTC ┆ [3, 4] │
#> │ 2021-12-16 02:00:00 UTC ┆ [5, 6] │
#> └─────────────────────────┴───────────┘
# The window boundaries can also be added to the aggregation result
lf$group_by_dynamic(
"time",
every = "1h", include_boundaries = TRUE, closed = "right"
)$agg(
pl$col("n")$mean()
)$collect()
#> shape: (4, 4)
#> ┌─────────────────────────┬─────────────────────────┬─────────────────────────┬─────┐
#> │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ datetime[ns, UTC] ┆ datetime[ns, UTC] ┆ datetime[ns, UTC] ┆ f64 │
#> ╞═════════════════════════╪═════════════════════════╪═════════════════════════╪═════╡
#> │ 2021-12-15 23:00:00 UTC ┆ 2021-12-16 00:00:00 UTC ┆ 2021-12-15 23:00:00 UTC ┆ 0.0 │
#> │ 2021-12-16 00:00:00 UTC ┆ 2021-12-16 01:00:00 UTC ┆ 2021-12-16 00:00:00 UTC ┆ 1.5 │
#> │ 2021-12-16 01:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 01:00:00 UTC ┆ 3.5 │
#> │ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 03:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ 5.5 │
#> └─────────────────────────┴─────────────────────────┴─────────────────────────┴─────┘
# When closed = "left", the window excludes the right end of interval:
# [lower_bound, upper_bound)
lf$group_by_dynamic("time", every = "1h", closed = "left")$agg(
pl$col("n")
)$collect()
#> shape: (4, 2)
#> ┌─────────────────────────┬───────────┐
#> │ time ┆ n │
#> │ --- ┆ --- │
#> │ datetime[ns, UTC] ┆ list[i32] │
#> ╞═════════════════════════╪═══════════╡
#> │ 2021-12-16 00:00:00 UTC ┆ [0, 1] │
#> │ 2021-12-16 01:00:00 UTC ┆ [2, 3] │
#> │ 2021-12-16 02:00:00 UTC ┆ [4, 5] │
#> │ 2021-12-16 03:00:00 UTC ┆ [6] │
#> └─────────────────────────┴───────────┘
# When closed = "both" the time values at the window boundaries belong to 2
# groups.
lf$group_by_dynamic("time", every = "1h", closed = "both")$agg(
pl$col("n")
)$collect()
#> shape: (4, 2)
#> ┌─────────────────────────┬───────────┐
#> │ time ┆ n │
#> │ --- ┆ --- │
#> │ datetime[ns, UTC] ┆ list[i32] │
#> ╞═════════════════════════╪═══════════╡
#> │ 2021-12-16 00:00:00 UTC ┆ [0, 1, 2] │
#> │ 2021-12-16 01:00:00 UTC ┆ [2, 3, 4] │
#> │ 2021-12-16 02:00:00 UTC ┆ [4, 5, 6] │
#> │ 2021-12-16 03:00:00 UTC ┆ [6] │
#> └─────────────────────────┴───────────┘
# Dynamic group bys can also be combined with grouping on normal keys
lf <- lf$with_columns(
groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a"))
)
lf$collect()
#> shape: (7, 3)
#> ┌─────────────────────────┬─────┬────────┐
#> │ time ┆ n ┆ groups │
#> │ --- ┆ --- ┆ --- │
#> │ datetime[ns, UTC] ┆ i32 ┆ str │
#> ╞═════════════════════════╪═════╪════════╡
#> │ 2021-12-16 00:00:00 UTC ┆ 0 ┆ a │
#> │ 2021-12-16 00:30:00 UTC ┆ 1 ┆ a │
#> │ 2021-12-16 01:00:00 UTC ┆ 2 ┆ a │
#> │ 2021-12-16 01:30:00 UTC ┆ 3 ┆ b │
#> │ 2021-12-16 02:00:00 UTC ┆ 4 ┆ b │
#> │ 2021-12-16 02:30:00 UTC ┆ 5 ┆ a │
#> │ 2021-12-16 03:00:00 UTC ┆ 6 ┆ a │
#> └─────────────────────────┴─────┴────────┘
lf$group_by_dynamic(
"time",
every = "1h",
closed = "both",
group_by = "groups",
include_boundaries = TRUE
)$agg(pl$col("n"))$collect()
#> shape: (6, 5)
#> ┌────────┬─────────────────────────┬─────────────────────────┬─────────────────────────┬───────────┐
#> │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │
#> │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ datetime[ns, UTC] ┆ datetime[ns, UTC] ┆ datetime[ns, UTC] ┆ list[i32] │
#> ╞════════╪═════════════════════════╪═════════════════════════╪═════════════════════════╪═══════════╡
#> │ a ┆ 2021-12-16 00:00:00 UTC ┆ 2021-12-16 01:00:00 UTC ┆ 2021-12-16 00:00:00 UTC ┆ [0, 1, 2] │
#> │ a ┆ 2021-12-16 01:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 01:00:00 UTC ┆ [2] │
#> │ a ┆ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 03:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ [5, 6] │
#> │ a ┆ 2021-12-16 03:00:00 UTC ┆ 2021-12-16 04:00:00 UTC ┆ 2021-12-16 03:00:00 UTC ┆ [6] │
#> │ b ┆ 2021-12-16 01:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 01:00:00 UTC ┆ [3, 4] │
#> │ b ┆ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 03:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ [4] │
#> └────────┴─────────────────────────┴─────────────────────────┴─────────────────────────┴───────────┘
# We can also create a dynamic group by based on an index column
lf <- pl$LazyFrame(
idx = 0:5,
A = c("A", "A", "B", "B", "B", "C")
)$with_columns(pl$col("idx")$set_sorted())
lf$collect()
#> shape: (6, 2)
#> ┌─────┬─────┐
#> │ idx ┆ A │
#> │ --- ┆ --- │
#> │ i32 ┆ str │
#> ╞═════╪═════╡
#> │ 0 ┆ A │
#> │ 1 ┆ A │
#> │ 2 ┆ B │
#> │ 3 ┆ B │
#> │ 4 ┆ B │
#> │ 5 ┆ C │
#> └─────┴─────┘
lf$group_by_dynamic(
"idx",
every = "2i",
period = "3i",
include_boundaries = TRUE,
closed = "right"
)$agg(A_agg_list = pl$col("A"))$collect()
#> shape: (4, 4)
#> ┌─────────────────┬─────────────────┬─────┬─────────────────┐
#> │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ i32 ┆ i32 ┆ i32 ┆ list[str] │
#> ╞═════════════════╪═════════════════╪═════╪═════════════════╡
#> │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
#> │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
#> │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
#> │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
#> └─────────────────┴─────────────────┴─────┴─────────────────┘