This page is based on the chapter “Timeseries” of the Modern Polars book.
Preparing Data
Since the PRQL and SQL results shown on this page are after being converted to R DataFrame via knitr, they have been converted from DuckDB types to R types. So NULL
in DuckDB is shown as NA
.
Download
Download the data from Binance REST API and write it to a Parquet file.
This document uses R to download the data from the source here, but we can also download and use the Parquet file included in the kevinheavey/modern-polars GitHub repository.
Code
# Download daily BTC/USDT OHLCV data for 2021 from the Binance REST API
# and cache it locally as a Parquet file. Skipped entirely if the file
# already exists.
data_path <- "data/ohlcv.parquet"
if (!fs::file_exists(data_path)) {
fs::dir_create(fs::path_dir(data_path))
# Convert a date-time to Unix epoch milliseconds (the Binance API expects ms).
.epoch_ms <- function(dt) {
dt |>
lubridate::as_datetime() |>
(\(x) (as.integer(x) * 1000))()
}
.start <- lubridate::make_datetime(2021, 1, 1) |> .epoch_ms()
.end <- lubridate::make_datetime(2022, 1, 1) |> .epoch_ms()
# Binance klines endpoint: one row per day for the requested range.
.url <- glue::glue(
"https://api.binance.com/api/v3/klines?symbol=BTCUSDT&",
"interval=1d&startTime={.start}&endTime={.end}"
)
.res <- jsonlite::read_json(.url)
time_col <- "time"
ohlcv_cols <- c(
"open",
"high",
"low",
"close",
"volume"
)
cols_to_use <- c(time_col, ohlcv_cols)
# Each kline record has 12 fields; only the first 6 are used here,
# the remaining 6 get throwaway names.
cols <- c(cols_to_use, glue::glue("ignore_{i}", i = 1:6))
df <- .res |>
tibble::enframe(name = NULL) |>
tidyr::unnest_wider(value, names_sep = "_") |>
rlang::set_names({{ cols }}) |>
dplyr::mutate(
# Epoch ms -> Date; prices/volumes arrive as strings, so coerce to numeric.
dplyr::across({{ time_col }}, \(x) lubridate::as_datetime(x / 1000) |> lubridate::as_date()),
dplyr::across({{ ohlcv_cols }}, as.numeric),
.keep = "none"
)
# Write the data frame to Parquet through an in-memory DuckDB connection.
con_tmp <- DBI::dbConnect(duckdb::duckdb(), ":memory:")
duckdb::duckdb_register(con_tmp, "df", df)
duckdb:::sql(glue::glue("COPY df TO '{data_path}' (FORMAT PARQUET)"), con_tmp)
DBI::dbDisconnect(con_tmp)
}
This is a sample command to download the Parquet file from the kevinheavey/modern-polars
GitHub repository.
Terminal
mkdir data
curl -sL https://github.com/kevinheavey/modern-polars/raw/d67d6f95ce0de8aad5492c4497ac4c3e33d696e8/data/ohlcv.pq -o data/ohlcv.parquet
Load the Data
After the Parquet file is ready, load it into a DuckDB (in-memory) database table, an R DataFrame, and a Python polars.LazyFrame.
- DuckDB
- R DataFrame
- Python polars.LazyFrame
CREATE TABLE tab AS FROM 'data/ohlcv.parquet'
time | open | high | low | close | volume |
---|
2021-01-01 | 28923.63 | 29600.00 | 28624.57 | 29331.69 | 54182.93 |
2021-01-02 | 29331.70 | 33300.00 | 28946.53 | 32178.33 | 129993.87 |
2021-01-03 | 32176.45 | 34778.11 | 31962.99 | 33000.05 | 120957.57 |
2021-01-04 | 33000.05 | 33600.00 | 28130.00 | 31988.71 | 140899.89 |
2021-01-05 | 31989.75 | 34360.00 | 29900.00 | 33949.53 | 116050.00 |
library(dplyr, warn.conflicts = FALSE)
# Read the Parquet file into an R data frame via DuckDB's SQL shorthand.
df <- duckdb:::sql("FROM 'data/ohlcv.parquet'")
time | open | high | low | close | volume |
---|
2021-01-01 | 28923.63 | 29600.00 | 28624.57 | 29331.69 | 54182.93 |
2021-01-02 | 29331.70 | 33300.00 | 28946.53 | 32178.33 | 129993.87 |
2021-01-03 | 32176.45 | 34778.11 | 31962.99 | 33000.05 | 120957.57 |
2021-01-04 | 33000.05 | 33600.00 | 28130.00 | 31988.71 | 140899.89 |
2021-01-05 | 31989.75 | 34360.00 | 29900.00 | 33949.53 | 116050.00 |
import polars as pl

# Lazily scan the Parquet file; nothing is read until .collect() is called.
lf = pl.scan_parquet("data/ohlcv.parquet")
shape: (5, 6)
time | open | high | low | close | volume |
---|
date | f64 | f64 | f64 | f64 | f64 |
2021-01-01 | 28923.63 | 29600.0 | 28624.57 | 29331.69 | 54182.925011 |
2021-01-02 | 29331.7 | 33300.0 | 28946.53 | 32178.33 | 129993.873362 |
2021-01-03 | 32176.45 | 34778.11 | 31962.99 | 33000.05 | 120957.56675 |
2021-01-04 | 33000.05 | 33600.0 | 28130.0 | 31988.71 | 140899.88569 |
2021-01-05 | 31989.75 | 34360.0 | 29900.0 | 33949.53 | 116049.997038 |
Filtering
- PRQL DuckDB
- SQL DuckDB
- dplyr R
- Python Polars
# Keep only rows from February 2021 by comparing the extracted
# year/month struct (via a DuckDB s-string), then take five rows.
from tab
filter s"date_part(['year', 'month'], time) = {{year: 2021, month: 2}}"
take 5
time | open | high | low | close | volume |
---|
2021-02-01 | 33092.97 | 34717.27 | 32296.16 | 33526.37 | 82718.28 |
2021-02-02 | 33517.09 | 35984.33 | 33418.00 | 35466.24 | 78056.66 |
2021-02-03 | 35472.71 | 37662.63 | 35362.38 | 37618.87 | 80784.33 |
2021-02-04 | 37620.26 | 38708.27 | 36161.95 | 36936.66 | 92080.74 |
2021-02-05 | 36936.65 | 38310.12 | 36570.00 | 38290.24 | 66681.33 |
-- Filter to February 2021 by comparing the extracted year/month struct.
FROM tab
WHERE date_part(['year', 'month'], time) = {year: 2021, month: 2}
LIMIT 5
time | open | high | low | close | volume |
---|
2021-02-01 | 33092.97 | 34717.27 | 32296.16 | 33526.37 | 82718.28 |
2021-02-02 | 33517.09 | 35984.33 | 33418.00 | 35466.24 | 78056.66 |
2021-02-03 | 35472.71 | 37662.63 | 35362.38 | 37618.87 | 80784.33 |
2021-02-04 | 37620.26 | 38708.27 | 36161.95 | 36936.66 | 92080.74 |
2021-02-05 | 36936.65 | 38310.12 | 36570.00 | 38290.24 | 66681.33 |
# Keep only rows from February 2021 by flooring each date to its month,
# then show the first five rows.
df |>
filter(
lubridate::floor_date(time, "month") == lubridate::make_datetime(2021, 2)
) |>
head(5)
time | open | high | low | close | volume |
---|
2021-02-01 | 33092.97 | 34717.27 | 32296.16 | 33526.37 | 82718.28 |
2021-02-02 | 33517.09 | 35984.33 | 33418.00 | 35466.24 | 78056.66 |
2021-02-03 | 35472.71 | 37662.63 | 35362.38 | 37618.87 | 80784.33 |
2021-02-04 | 37620.26 | 38708.27 | 36161.95 | 36936.66 | 92080.74 |
2021-02-05 | 36936.65 | 38310.12 | 36570.00 | 38290.24 | 66681.33 |
(
# Filter to February 2021 via the year/month components of `time`.
lf.filter((pl.col("time").dt.year() == 2021) & (pl.col("time").dt.month() == 2))
.head(5)
.collect()
)
shape: (5, 6)
time | open | high | low | close | volume |
---|
date | f64 | f64 | f64 | f64 | f64 |
2021-02-01 | 33092.97 | 34717.27 | 32296.16 | 33526.37 | 82718.276882 |
2021-02-02 | 33517.09 | 35984.33 | 33418.0 | 35466.24 | 78056.65988 |
2021-02-03 | 35472.71 | 37662.63 | 35362.38 | 37618.87 | 80784.333663 |
2021-02-04 | 37620.26 | 38708.27 | 36161.95 | 36936.66 | 92080.735898 |
2021-02-05 | 36936.65 | 38310.12 | 36570.0 | 38290.24 | 66681.334275 |
Downsampling
It is important to note carefully how units such as 5 days
or 1 week
actually work. In other words, where to start counting 5 days
or 1 week
could be completely different in each system.
Here, we should note that time_bucket
in DuckDB, lubridate::floor_date
in R, and groupby_dynamic
in Polars have completely different initial starting points by default.
-
The DuckDB function time_bucket
’s origin defaults to 2000-01-03 00:00:00+00
for days or weeks interval.1
-
In the R lubridate::floor_date
function, timestamp is floored using the number of days elapsed since the beginning of every month when specifying "5 days"
to the unit
argument.
# With unit "5 days", flooring restarts at the beginning of every month,
# so consecutive dates in different months land in different buckets.
lubridate::as_date(c("2023-01-31", "2023-02-01")) |>
lubridate::floor_date("5 days")
[1] "2023-01-31" "2023-02-01"
And when "1 week"
to the unit
argument, it is floored to the nearest week, Sunday through Saturday.
# With unit "1 week", dates are floored to the start of the week
# (Sunday by default), so both dates map to 2023-01-29.
lubridate::as_date(c("2023-01-31", "2023-02-01")) |>
lubridate::floor_date("1 week")
[1] "2023-01-29" "2023-01-29"
To start from an arbitrary origin, all breaks must be specified as a vector in the unit argument.2
# Passing explicit break points (here a single date) sets a custom origin.
lubridate::as_date(c("2023-01-31", "2023-02-01")) |>
lubridate::floor_date(lubridate::make_date(2023, 1, 31))
[1] "2023-01-31" "2023-01-31"
-
groupby_dynamic
of Polars, the offset
parameter to specify the origin point, is negative every
by default.3
- PRQL DuckDB
- SQL DuckDB
- dplyr R
- Python Polars
# Bucket rows into 5-day intervals anchored at the earliest timestamp
# (time_bucket origin = min(time)), then average each OHLCV column per bucket.
from tab
derive {
time_new = s"""
time_bucket(INTERVAL '5 days', time, (FROM tab SELECT min(time)))
"""
}
group {time_new} (
aggregate {
open = average open,
high = average high,
low = average low,
close = average close,
volume = average volume
}
)
sort time_new
take 5
time_new | open | high | low | close | volume |
---|
2021-01-01 | 31084.32 | 33127.62 | 29512.82 | 32089.66 | 112416.85 |
2021-01-06 | 38165.31 | 40396.84 | 35983.82 | 39004.54 | 118750.08 |
2021-01-11 | 36825.23 | 38518.10 | 33288.05 | 36542.76 | 146166.70 |
2021-01-16 | 36216.36 | 37307.53 | 34650.37 | 35962.92 | 81236.80 |
2021-01-21 | 32721.53 | 34165.71 | 30624.23 | 32077.48 | 97809.66 |
-- Replace `time` with its 5-day bucket (origin anchored at min(time)),
-- then average every non-time column within each bucket.
WITH _tab1 AS (
FROM tab
SELECT
* REPLACE (time_bucket(INTERVAL '5 days', time, (FROM tab SELECT min(time)))) AS time
)
FROM _tab1
SELECT
time,
avg(COLUMNS(x -> x NOT IN ('time')))
GROUP BY time
ORDER BY time
LIMIT 5
time | avg(_tab1.open) | avg(_tab1.high) | avg(_tab1.low) | avg(_tab1."close") | avg(_tab1.volume) |
---|
2021-01-01 | 31084.32 | 33127.62 | 29512.82 | 32089.66 | 112416.85 |
2021-01-06 | 38165.31 | 40396.84 | 35983.82 | 39004.54 | 118750.08 |
2021-01-11 | 36825.23 | 38518.10 | 33288.05 | 36542.76 | 146166.70 |
2021-01-16 | 36216.36 | 37307.53 | 34650.37 | 35962.92 | 81236.80 |
2021-01-21 | 32721.53 | 34165.71 | 30624.23 | 32077.48 | 97809.66 |
# Floor each date onto a 5-day grid of explicit breaks starting at the
# earliest date, then average every column within each bucket.
df |>
mutate(
time = time |>
(\(x) lubridate::floor_date(x, seq(min(x), max(x), by = 5)))()
) |>
summarise(across(everything(), mean), .by = time) |>
head(5)
time | open | high | low | close | volume |
---|
2021-01-01 | 31084.32 | 33127.62 | 29512.82 | 32089.66 | 112416.85 |
2021-01-06 | 38165.31 | 40396.84 | 35983.82 | 39004.54 | 118750.08 |
2021-01-11 | 36825.23 | 38518.10 | 33288.05 | 36542.76 | 146166.70 |
2021-01-16 | 36216.36 | 37307.53 | 34650.37 | 35962.92 | 81236.80 |
2021-01-21 | 32721.53 | 34165.71 | 30624.23 | 32077.48 | 97809.66 |
(
# Group into 5-day dynamic windows. Note the default negative `offset`
# makes the first window start before min(time), so the bucket labels
# differ from the DuckDB/R results above.
lf.sort("time")
.groupby_dynamic("time", every="5d")
.agg(pl.col(pl.Float64).mean())
.head(5)
.collect()
)
shape: (5, 6)
time | open | high | low | close | volume |
---|
date | f64 | f64 | f64 | f64 | f64 |
2020-12-29 | 29127.665 | 31450.0 | 28785.55 | 30755.01 | 92088.399186 |
2021-01-03 | 33577.028 | 36008.464 | 31916.198 | 35027.986 | 127574.470245 |
2021-01-08 | 38733.61 | 39914.548 | 34656.422 | 37655.352 | 143777.954392 |
2021-01-13 | 36659.63 | 38500.052 | 34539.406 | 37016.338 | 102447.076833 |
2021-01-18 | 34933.922 | 36232.334 | 32593.012 | 34357.258 | 103530.722029 |
- PRQL DuckDB
- SQL DuckDB
- dplyr R
- Python Polars
# Weekly (7-day) buckets anchored at the earliest timestamp, computing
# both the mean and the sum of each OHLCV column per bucket.
from tab
derive {
time_new = s"""
time_bucket(INTERVAL '7 days', time, (FROM tab SELECT min(time)))
"""
}
group {time_new} (
aggregate {
open_mean = average open,
high_mean = average high,
low_mean = average low,
close_mean = average close,
volume_mean = average volume,
open_sum = sum open,
high_sum = sum high,
low_sum = sum low,
close_sum = sum close,
volume_sum = sum volume
}
)
sort time_new
take 5
time_new | open_mean | high_mean | low_mean | close_mean | volume_mean | open_sum | high_sum | low_sum | close_sum | volume_sum |
---|
2021-01-01 | 32305.78 | 34706.05 | 31021.73 | 33807.14 | 117435.59 | 226140.5 | 242942.3 | 217152.1 | 236650.0 | 822049.1 |
2021-01-08 | 37869.80 | 39646.11 | 34623.33 | 37827.52 | 135188.30 | 265088.6 | 277522.7 | 242363.3 | 264792.6 | 946318.1 |
2021-01-15 | 36527.89 | 37412.20 | 33961.55 | 35343.85 | 94212.72 | 255695.2 | 261885.4 | 237730.9 | 247406.9 | 659489.0 |
2021-01-22 | 31888.55 | 33498.81 | 30424.48 | 32248.01 | 89649.94 | 223219.8 | 234491.7 | 212971.4 | 225736.0 | 627549.6 |
2021-01-29 | 34511.48 | 36411.44 | 33450.08 | 35022.31 | 102728.42 | 241580.4 | 254880.1 | 234150.6 | 245156.2 | 719099.0 |
-- Weekly (7-day) buckets anchored at min(time); compute both the mean
-- and the sum of every non-time column per bucket.
WITH _tab1 AS (
FROM tab
SELECT
* REPLACE (time_bucket(INTERVAL '7 days', time, (FROM tab SELECT min(time)))) AS time
)
FROM _tab1
SELECT
time,
avg(COLUMNS(x -> x NOT IN ('time'))),
sum(COLUMNS(x -> x NOT IN ('time')))
GROUP BY time
ORDER BY time
LIMIT 5
time | avg(_tab1.open) | avg(_tab1.high) | avg(_tab1.low) | avg(_tab1."close") | avg(_tab1.volume) | sum(_tab1.open) | sum(_tab1.high) | sum(_tab1.low) | sum(_tab1."close") | sum(_tab1.volume) |
---|
2021-01-01 | 32305.78 | 34706.05 | 31021.73 | 33807.14 | 117435.59 | 226140.5 | 242942.3 | 217152.1 | 236650.0 | 822049.1 |
2021-01-08 | 37869.80 | 39646.11 | 34623.33 | 37827.52 | 135188.30 | 265088.6 | 277522.7 | 242363.3 | 264792.6 | 946318.1 |
2021-01-15 | 36527.89 | 37412.20 | 33961.55 | 35343.85 | 94212.72 | 255695.2 | 261885.4 | 237730.9 | 247406.9 | 659489.0 |
2021-01-22 | 31888.55 | 33498.81 | 30424.48 | 32248.01 | 89649.94 | 223219.8 | 234491.7 | 212971.4 | 225736.0 | 627549.6 |
2021-01-29 | 34511.48 | 36411.44 | 33450.08 | 35022.31 | 102728.42 | 241580.4 | 254880.1 | 234150.6 | 245156.2 | 719099.0 |
# Floor each date onto a 7-day grid of explicit breaks starting at the
# earliest date, then compute both the mean and the sum of each column,
# naming the outputs "<column>_<function>".
df |>
mutate(
time = time |>
(\(x) lubridate::floor_date(x, seq(min(x), max(x), by = 7)))()
) |>
summarise(
across(
everything(),
list(mean = mean, sum = sum),
.names = "{.col}_{.fn}"
),
.by = time
) |>
head(5)
time | open_mean | open_sum | high_mean | high_sum | low_mean | low_sum | close_mean | close_sum | volume_mean | volume_sum |
---|
2021-01-01 | 32305.78 | 226140.5 | 34706.05 | 242942.3 | 31021.73 | 217152.1 | 33807.14 | 236650.0 | 117435.59 | 822049.1 |
2021-01-08 | 37869.80 | 265088.6 | 39646.11 | 277522.7 | 34623.33 | 242363.3 | 37827.52 | 264792.6 | 135188.30 | 946318.1 |
2021-01-15 | 36527.89 | 255695.2 | 37412.20 | 261885.4 | 33961.55 | 237730.9 | 35343.85 | 247406.9 | 94212.72 | 659489.0 |
2021-01-22 | 31888.55 | 223219.8 | 33498.81 | 234491.7 | 30424.48 | 212971.4 | 32248.01 | 225736.0 | 89649.94 | 627549.6 |
2021-01-29 | 34511.48 | 241580.4 | 36411.44 | 254880.1 | 33450.08 | 234150.6 | 35022.31 | 245156.2 | 102728.42 | 719099.0 |
(
# Weekly dynamic windows; the default offset again shifts the first
# window start (here 2020-12-28), unlike the DuckDB/R results.
lf.sort("time")
.groupby_dynamic("time", every="1w")
.agg(
[
# Mean and sum of every Float64 column, suffixed per aggregation.
pl.col(pl.Float64).mean().suffix("_mean"),
pl.col(pl.Float64).sum().suffix("_sum"),
]
)
.head(5)
.collect()
)
shape: (5, 11)
time | open_mean | high_mean | low_mean | close_mean | volume_mean | open_sum | high_sum | low_sum | close_sum | volume_sum |
---|
date | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
2020-12-28 | 30143.926667 | 32559.37 | 29844.696667 | 31503.356667 | 101711.455041 | 90431.78 | 97678.11 | 89534.09 | 94510.07 | 305134.365123 |
2021-01-04 | 36545.192857 | 38563.458571 | 33992.73 | 37280.132857 | 121528.60945 | 255816.35 | 269944.21 | 237949.11 | 260960.93 | 850700.266152 |
2021-01-11 | 36694.077143 | 38199.0 | 33664.004286 | 36362.485714 | 128191.382064 | 256858.54 | 267393.0 | 235648.03 | 254537.4 | 897339.67445 |
2021-01-18 | 34242.515714 | 35384.095714 | 32179.317143 | 33732.027143 | 91460.990826 | 239697.61 | 247688.67 | 225255.22 | 236124.19 | 640226.935785 |
2021-01-25 | 32745.312857 | 34563.677143 | 31249.037143 | 32865.862857 | 106780.501216 | 229217.19 | 241945.74 | 218743.26 | 230061.04 | 747463.508509 |
Upsampling
The way to use a function like generate_series
to generate sequential values and then join them is general-purpose.
In R, we can also use dedicated functions like timetk::pad_by_time
.
- PRQL DuckDB
- SQL DuckDB
- dplyr R
- Python Polars
This example does not work with prql-compiler 0.11.1. (PRQL/prql#3129)
# Build a regular 6-hour time grid spanning the data with generate_series,
# then left-join the daily rows onto it; missing slots become NULL.
let _tab1 = s"""
SELECT
generate_series(
(SELECT min(time)),
(SELECT max(time)),
INTERVAL '6 hours'
).unnest() AS time
FROM tab
"""
from _tab1
join side:left tab (==time)
sort tab.time
select !{tab.time}
take 5
-- Build a regular 6-hour time grid spanning the data with generate_series,
-- then left-join the daily rows onto it; missing slots become NULL.
WITH _tab1 AS (
SELECT
generate_series(
(FROM tab SELECT min(time)),
(FROM tab SELECT max(time)),
INTERVAL '6 hours'
).unnest() AS time
)
FROM _tab1
LEFT JOIN tab USING (time)
ORDER BY time
LIMIT 5
time | open | high | low | close | volume |
---|
2021-01-01 00:00:00 | 28923.63 | 29600 | 28624.57 | 29331.69 | 54182.93 |
2021-01-01 06:00:00 | NA | NA | NA | NA | NA |
2021-01-01 12:00:00 | NA | NA | NA | NA | NA |
2021-01-01 18:00:00 | NA | NA | NA | NA | NA |
2021-01-02 00:00:00 | 29331.70 | 33300 | 28946.53 | 32178.33 | 129993.87 |
# Build a regular 6-hour datetime grid spanning the data, then left-join
# the daily rows onto it; rows with no match become NA.
.grid <- df$time |>
lubridate::as_datetime() |>
(\(x) seq(min(x), max(x), by = "6 hours"))() |>
tibble::tibble(time = _)
.grid |>
left_join(df, by = "time") |>
head(5)
time | open | high | low | close | volume |
---|
2021-01-01 00:00:00 | 28923.63 | 29600 | 28624.57 | 29331.69 | 54182.93 |
2021-01-01 06:00:00 | NA | NA | NA | NA | NA |
2021-01-01 12:00:00 | NA | NA | NA | NA | NA |
2021-01-01 18:00:00 | NA | NA | NA | NA | NA |
2021-01-02 00:00:00 | 29331.70 | 33300 | 28946.53 | 32178.33 | 129993.87 |
lf.collect().sort("time").upsample("time", every="6h").head(5)
shape: (5, 6)
time | open | high | low | close | volume |
---|
date | f64 | f64 | f64 | f64 | f64 |
2021-01-01 | 28923.63 | 29600.0 | 28624.57 | 29331.69 | 54182.925011 |
2021-01-01 | null | null | null | null | null |
2021-01-01 | null | null | null | null | null |
2021-01-01 | null | null | null | null | null |
2021-01-02 | 29331.7 | 33300.0 | 28946.53 | 32178.33 | 129993.873362 |
Window Functions
Care is needed with how each window function behaves when the width of the window is less than the specified value.
Moving Average, Cumulative Average
PRQL has a dedicated way of applying the window to the entire table. For the others, use an individual function for each column.
In R, base R has some window functions like cumsum
, but none for the cumulative average. dplyr complements this with several functions, including cummean
.
Polars does not yet have a dedicated function to compute cumulative averages, so we must use cumulative sums to compute them.
- PRQL DuckDB
- SQL DuckDB
- dplyr R
- Python Polars
# 28-row rolling average and an expanding (cumulative) average of `close`,
# using PRQL's table-wide window blocks.
from tab
sort this.time
window rolling:28 (
derive {`28D MA` = average close}
)
# rows:..0 means "all rows up to and including the current one".
window rows:..0 (
derive {`Expanding Average` = average close}
)
select {
this.time,
Raw = close,
`28D MA`,
`Expanding Average`
}
take 26..30
time | Raw | 28D MA | Expanding Average |
---|
2021-01-26 | 32467.77 | 35032.87 | 35032.87 |
2021-01-27 | 30366.15 | 34860.03 | 34860.03 |
2021-01-28 | 33364.86 | 34806.63 | 34806.63 |
2021-01-29 | 34252.20 | 34982.36 | 34787.51 |
2021-01-30 | 34262.88 | 35056.81 | 34770.02 |
-- 28-row rolling average (27 preceding + current row) and expanding
-- average (unbounded preceding) of `close` as window functions.
FROM tab
SELECT
time,
close AS "Raw",
avg(close) OVER (
ORDER BY time
ROWS BETWEEN 27 PRECEDING AND CURRENT ROW
) AS "28D MA",
avg(close) OVER (
ORDER BY time
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS "Expanding Average"
LIMIT 5 OFFSET 25
time | Raw | 28D MA | Expanding Average |
---|
2021-01-26 | 32467.77 | 35032.87 | 35032.87 |
2021-01-27 | 30366.15 | 34860.03 | 34860.03 |
2021-01-28 | 33364.86 | 34806.63 | 34806.63 |
2021-01-29 | 34252.20 | 34982.36 | 34787.51 |
2021-01-30 | 34262.88 | 35056.81 | 34770.02 |
# 28-day rolling mean (NA until a full window of 28 values is available,
# because .complete = TRUE) and cumulative (expanding) mean of `close`.
roll_and_expand <- df |>
arrange(time) |>
mutate(
time,
Raw = close,
`28D MA` = close |>
slider::slide_vec(mean, .before = 27, .complete = TRUE),
`Expanding Average` = cummean(close),
.keep = "none"
)
roll_and_expand |>
slice(26:30)
time | Raw | 28D MA | Expanding Average |
---|
2021-01-26 | 32467.77 | NA | 35032.87 |
2021-01-27 | 30366.15 | NA | 34860.03 |
2021-01-28 | 33364.86 | 34806.63 | 34806.63 |
2021-01-29 | 34252.20 | 34982.36 | 34787.51 |
2021-01-30 | 34262.88 | 35056.81 | 34770.02 |
close = pl.col("close")
# 28-day rolling mean (null until 28 observations are available) and an
# expanding (cumulative) mean of the close price.
roll_and_expand = lf.sort("time").select(
    [
        pl.col("time"),
        close.alias("Raw"),
        close.rolling_mean(28).alias("28D MA"),
        # Expanding mean = running sum / running non-null count.
        # `cum_count()` already starts at 1 for the first non-null value,
        # so dividing by `cum_count() + 1` (as before) over-counted by one
        # row and understated the average versus the DuckDB/R results.
        close.alias("Expanding Average").cum_sum() / close.cum_count(),
    ]
)
roll_and_expand.head(30).tail(5).collect()
shape: (5, 4)
time | Raw | 28D MA | Expanding Average |
---|
date | f64 | f64 | f64 |
2021-01-26 | 32467.77 | null | 33735.354074 |
2021-01-27 | 30366.15 | null | 33615.025357 |
2021-01-28 | 33364.86 | 34806.6275 | 33606.398966 |
2021-01-29 | 34252.2 | 34982.36 | 33627.925667 |
2021-01-30 | 34262.88 | 35056.808214 | 33648.408065 |
Here, DuckDB also calculates the average for cases where the window width is less than 28 for the 28D MA
column, whereas R slider::slide_vec(.complete = TRUE)
and Polars rolling_mean
make them missing values. If we are using DuckDB and need to make replacement for NULL
, we need to add further processing.
Plotting the results of dplyr shows the following.
Code
library(ggplot2)
# Pivot to long format (one row per time/series) so each of Raw, 28D MA
# and Expanding Average is drawn as its own coloured line.
roll_and_expand |>
tidyr::pivot_longer(cols = !time) |>
ggplot(aes(time, value, colour = name)) +
geom_line() +
theme_linedraw() +
labs(y = "Close ($)") +
scale_x_date(
date_breaks = "month",
labels = scales::label_date_short()
)
Combining Rolling Aggregations
- PRQL DuckDB
- SQL DuckDB
- dplyr R
- Python Polars
# Centered 30-row window (15 rows before through 14 rows after the
# current row) computing the mean and standard deviation of `close`.
from tab
sort this.time
window rows:-15..14 (
select {
this.time,
mean = average close,
std = stddev close
}
)
take 13..17
time | mean | std |
---|
2021-01-13 | 34860.03 | 3092.516 |
2021-01-14 | 34806.63 | 3047.833 |
2021-01-15 | 34787.51 | 2994.683 |
2021-01-16 | 34770.02 | 2944.156 |
2021-01-17 | 34895.40 | 2780.095 |
-- Centered 30-row window (15 preceding + current + 14 following) for the
-- mean and standard deviation of `close`.
FROM tab
SELECT
time,
avg(close) OVER (
ORDER BY time
ROWS BETWEEN 15 PRECEDING AND 14 FOLLOWING
) AS mean,
stddev(close) OVER (
ORDER BY time
ROWS BETWEEN 15 PRECEDING AND 14 FOLLOWING
) AS std
ORDER BY time
LIMIT 5 OFFSET 12
time | mean | std |
---|
2021-01-13 | 34860.03 | 3092.516 |
2021-01-14 | 34806.63 | 3047.833 |
2021-01-15 | 34787.51 | 2994.683 |
2021-01-16 | 34770.02 | 2944.156 |
2021-01-17 | 34895.40 | 2780.095 |
# Helper: centered 30-row sliding window (15 before through 14 after the
# current row); incomplete windows yield NA because of .complete = TRUE.
.slide_func <- function(.x, .fn) {
slider::slide_vec(.x, .fn, .before = 15, .after = 14, .complete = TRUE)
}
mean_std <- df |>
arrange(time) |>
mutate(
time,
# Rolling mean and standard deviation of `close`, named "mean"/"std".
across(
close,
.fns = list(mean = \(x) .slide_func(x, mean), std = \(x) .slide_func(x, sd)),
.names = "{.fn}"
),
.keep = "none"
)
time | mean | std |
---|
2021-01-13 | NA | NA |
2021-01-14 | NA | NA |
2021-01-15 | NA | NA |
2021-01-16 | 34770.02 | 2944.156 |
2021-01-17 | 34895.40 | 2780.095 |
# rolling_mean/rolling_std with window_size=30 and center=True reproduce
# the 15-before/14-after window above; incomplete windows yield null.
mean_std = lf.sort("time").select(
time=pl.col("time"),
mean=pl.col("close").rolling_mean(30, center=True),
std=pl.col("close").rolling_std(30, center=True),
)
mean_std.head(17).tail(5).collect()
shape: (5, 3)
time | mean | std |
---|
date | f64 | f64 |
2021-01-13 | null | null |
2021-01-14 | null | null |
2021-01-15 | null | null |
2021-01-16 | 34770.021667 | 2944.155675 |
2021-01-17 | 34895.398 | 2780.095304 |
As in Section 5.1, here too the DuckDB results differ from the others.
Plotting the results of dplyr shows the following.
Code
library(ggplot2)
# Draw the rolling mean as a line with a +/- one-standard-deviation ribbon.
mean_std |>
ggplot(aes(time)) +
geom_ribbon(
aes(ymin = mean - std, ymax = mean + std),
alpha = 0.3, fill = "blue"
) +
geom_line(aes(y = mean), color = "blue") +
theme_linedraw() +
labs(y = "Close ($)") +
scale_x_date(
date_breaks = "month",
labels = scales::label_date_short()
)
Timezones
In DuckDB, the icu DuckDB extension is needed for time zones support. If the DuckDB client that we are using does not contain the extension, we need to install and load it.
- PRQL DuckDB
- SQL DuckDB
- dplyr R
- Python Polars
# Wrap DuckDB's timezone() in a PRQL function, attach UTC to the clock
# time, then convert it to US/Eastern.
let timezone = tz col -> s"timezone({tz}, {col})"
from tab
derive {
time_new = (this.time | timezone "UTC" | timezone "US/Eastern")
}
select !{this.time}
take 5
open | high | low | close | volume | time_new |
---|
28923.63 | 29600.00 | 28624.57 | 29331.69 | 54182.93 | 2020-12-31 19:00:00 |
29331.70 | 33300.00 | 28946.53 | 32178.33 | 129993.87 | 2021-01-01 19:00:00 |
32176.45 | 34778.11 | 31962.99 | 33000.05 | 120957.57 | 2021-01-02 19:00:00 |
33000.05 | 33600.00 | 28130.00 | 31988.71 | 140899.89 | 2021-01-03 19:00:00 |
31989.75 | 34360.00 | 29900.00 | 33949.53 | 116050.00 | 2021-01-04 19:00:00 |
-- Attach UTC to the clock time, then convert it to US/Eastern.
FROM tab
SELECT
* REPLACE timezone('US/Eastern', timezone('UTC', time)) AS time
LIMIT 5
time | open | high | low | close | volume |
---|
2020-12-31 19:00:00 | 28923.63 | 29600.00 | 28624.57 | 29331.69 | 54182.93 |
2021-01-01 19:00:00 | 29331.70 | 33300.00 | 28946.53 | 32178.33 | 129993.87 |
2021-01-02 19:00:00 | 32176.45 | 34778.11 | 31962.99 | 33000.05 | 120957.57 |
2021-01-03 19:00:00 | 33000.05 | 33600.00 | 28130.00 | 31988.71 | 140899.89 |
2021-01-04 19:00:00 | 31989.75 | 34360.00 | 29900.00 | 33949.53 | 116050.00 |
# force_tz attaches UTC to the clock time without shifting it;
# with_tz then converts the instant to US/Eastern.
df |>
mutate(
time = time |>
lubridate::force_tz("UTC") |>
lubridate::with_tz("US/Eastern")
) |>
head(5)
time | open | high | low | close | volume |
---|
2020-12-31 19:00:00 | 28923.63 | 29600.00 | 28624.57 | 29331.69 | 54182.93 |
2021-01-01 19:00:00 | 29331.70 | 33300.00 | 28946.53 | 32178.33 | 129993.87 |
2021-01-02 19:00:00 | 32176.45 | 34778.11 | 31962.99 | 33000.05 | 120957.57 |
2021-01-03 19:00:00 | 33000.05 | 33600.00 | 28130.00 | 31988.71 | 140899.89 |
2021-01-04 19:00:00 | 31989.75 | 34360.00 | 29900.00 | 33949.53 | 116050.00 |
(
# Cast Date -> Datetime, attach UTC (replace_time_zone keeps the clock
# time), then convert the instant to US/Eastern.
lf.with_columns(
pl.col("time")
.cast(pl.Datetime)
.dt.replace_time_zone("UTC")
.dt.convert_time_zone("US/Eastern")
)
.head(5)
.collect()
)
shape: (5, 6)
time | open | high | low | close | volume |
---|
datetime[μs, US/Eastern] | f64 | f64 | f64 | f64 | f64 |
2020-12-31 19:00:00 EST | 28923.63 | 29600.0 | 28624.57 | 29331.69 | 54182.925011 |
2021-01-01 19:00:00 EST | 29331.7 | 33300.0 | 28946.53 | 32178.33 | 129993.873362 |
2021-01-02 19:00:00 EST | 32176.45 | 34778.11 | 31962.99 | 33000.05 | 120957.56675 |
2021-01-03 19:00:00 EST | 33000.05 | 33600.0 | 28130.0 | 31988.71 | 140899.88569 |
2021-01-04 19:00:00 EST | 31989.75 | 34360.0 | 29900.0 | 33949.53 | 116049.997038 |
Note that each system may keep time zone information in a different way. Here, the time
column (and the time_new
column) in DuckDB results is of the TIMESTAMP type, which has no time zone information.