library(tidyverse)
library(palmerpenguins)
library(ggpmisc)
theme_set(theme_bw())
RAdelaide 2025
July 10, 2025
R
is explicitly designed to work with vectors
BUT
summarise
lm
as a list
\(\implies\) a list column
But how do we get the slopes & standard errors now?
R
has special tricks for this which speed things up profoundly
data/benchmarks
are 6 very similar files
phoenix
(UofA HPC)
[1] "/home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n10.benchmark.tsv"
[2] "/home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n100.benchmark.tsv"
[3] "/home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n1000.benchmark.tsv"
[4] "/home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n250.benchmark.tsv"
[5] "/home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n50.benchmark.tsv"
[6] "/home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n500.benchmark.tsv"
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 11.2 00'11" 4172. 17559. 829. 1194. 0 0 385. 49.8
snakemake
when you benchmark a rule
for
loop
seq_along()
will create a sequence of integers along a vector
for (i in seq_along(f))
will step through the vector one at a time
[1] 1 2 3 4 5 6
For i = 1, open /home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n10.benchmark.tsv
For i = 2, open /home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n100.benchmark.tsv
For i = 3, open /home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n1000.benchmark.tsv
For i = 4, open /home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n250.benchmark.tsv
For i = 5, open /home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n50.benchmark.tsv
For i = 6, open /home/stevie/TKI/RAdelaide25/data/benchmarks/AHR.poisson.n500.benchmark.tsv
for
loop works
list
list()
## Label every list element with its file name only (directory part removed)
df_list <- setNames(df_list, basename(f))
df_list
$AHR.poisson.n10.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 11.2 00'11" 4172. 17559. 829. 1194. 0 0 385. 49.8
$AHR.poisson.n100.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 65.2 01'05" 4950. 18337. 3986. 4086. 0 0 496. 329.
$AHR.poisson.n1000.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 594. 09'54" 15535. 28940. 9731. 10368. 0 0 700. 4382.
$AHR.poisson.n250.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 154. 02'34" 6462. 19867. 4845. 5017. 0 0 667. 1034.
$AHR.poisson.n50.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 35.1 00'35" 4435. 17841. 914. 1264. 0 0 258. 94.6
$AHR.poisson.n500.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 300. 04'59" 9814. 23201. 6805. 7132. 0 0 706. 2121.
bind_rows()
to form a single tibble
.id
argument will add list names to a column
# A tibble: 6 × 11
file s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load
<chr> <dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 AHR.pois… 11.2 00'11" 4172. 17559. 829. 1194. 0 0 385.
2 AHR.pois… 65.2 01'05" 4950. 18337. 3986. 4086. 0 0 496.
3 AHR.pois… 594. 09'54" 15535. 28940. 9731. 10368. 0 0 700.
4 AHR.pois… 154. 02'34" 6462. 19867. 4845. 5017. 0 0 667.
5 AHR.pois… 35.1 00'35" 4435. 17841. 914. 1264. 0 0 258.
6 AHR.pois… 300. 04'59" 9814. 23201. 6805. 7132. 0 0 706.
# ℹ 1 more variable: cpu_time <dbl>
mutate()
to extract n = 10, 50, …
R
-Style Approach
R
offers an approach using lapply()
[[1]]
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 11.2 00'11" 4172. 17559. 829. 1194. 0 0 385. 49.8
[[2]]
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 65.2 01'05" 4950. 18337. 3986. 4086. 0 0 496. 329.
[[3]]
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 594. 09'54" 15535. 28940. 9731. 10368. 0 0 700. 4382.
[[4]]
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 154. 02'34" 6462. 19867. 4845. 5017. 0 0 667. 1034.
[[5]]
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 35.1 00'35" 4435. 17841. 914. 1264. 0 0 258. 94.6
[[6]]
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 300. 04'59" 9814. 23201. 6805. 7132. 0 0 706. 2121.
R
-Style Approach
f |>
setNames(basename(f)) |> # Add names here to ensure the list has names
lapply(read_tsv) |>
bind_rows(.id = "file")
# A tibble: 6 × 11
file s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load
<chr> <dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 AHR.pois… 11.2 00'11" 4172. 17559. 829. 1194. 0 0 385.
2 AHR.pois… 65.2 01'05" 4950. 18337. 3986. 4086. 0 0 496.
3 AHR.pois… 594. 09'54" 15535. 28940. 9731. 10368. 0 0 700.
4 AHR.pois… 154. 02'34" 6462. 19867. 4845. 5017. 0 0 667.
5 AHR.pois… 35.1 00'35" 4435. 17841. 914. 1264. 0 0 258.
6 AHR.pois… 300. 04'59" 9814. 23201. 6805. 7132. 0 0 706.
# ℹ 1 more variable: cpu_time <dbl>
R
-Style Approach
f |>
setNames(basename(f)) |> # Add names here to ensure the list has names
lapply(read_tsv) |>
bind_rows(.id = "file") |>
mutate(bg_size = str_extract(file, "[0-9]+") |> as.integer())
# A tibble: 6 × 12
file s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load
<chr> <dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 AHR.pois… 11.2 00'11" 4172. 17559. 829. 1194. 0 0 385.
2 AHR.pois… 65.2 01'05" 4950. 18337. 3986. 4086. 0 0 496.
3 AHR.pois… 594. 09'54" 15535. 28940. 9731. 10368. 0 0 700.
4 AHR.pois… 154. 02'34" 6462. 19867. 4845. 5017. 0 0 667.
5 AHR.pois… 35.1 00'35" 4435. 17841. 914. 1264. 0 0 258.
6 AHR.pois… 300. 04'59" 9814. 23201. 6805. 7132. 0 0 706.
# ℹ 2 more variables: cpu_time <dbl>, bg_size <int>
R
-Style Approach
f |>
setNames(basename(f)) |>
lapply(read_tsv) |>
bind_rows(.id = "file") |>
mutate(
bg_size = file |>
str_extract("[0-9]+") |>
as.integer()
) |>
ggplot(aes(bg_size, s)) +
geom_point(size = 3) +
geom_smooth(method = "lm") +
stat_poly_eq(use_label("eq")) +
labs(
x = "BG Size",
y = "Time Taken (sec)"
)
lapply()
apply()
apply()
including the MARGIN
argument
MARGIN = 1
MARGIN = 2
MARGIN = 3
apply()
AirPassengers
is a time-series object
## Reshape the time series into a plain matrix: 12 columns filled row-by-row,
## with rows labelled by year and columns by month abbreviation
AirPassengers |>
  matrix(
    ncol = 12,
    byrow = TRUE,
    dimnames = list(1949:1960, month.abb)
  )
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
1949 112 118 132 129 121 135 148 148 136 119 104 118
1950 115 126 141 135 125 149 170 170 158 133 114 140
1951 145 150 178 163 172 178 199 199 184 162 146 166
1952 171 180 193 181 183 218 230 242 209 191 172 194
1953 196 196 236 235 229 243 264 272 237 211 180 201
1954 204 188 235 227 234 264 302 293 259 229 203 229
1955 242 233 267 269 270 315 364 347 312 274 237 278
1956 284 277 317 313 318 374 413 405 355 306 271 306
1957 315 301 356 348 355 422 465 467 404 347 305 336
1958 340 318 362 348 363 435 491 505 404 359 310 337
1959 360 342 406 396 420 472 548 559 463 407 362 405
1960 417 391 419 461 472 535 622 606 508 461 390 432
apply()
## Average passenger count for each month: MARGIN = 2 runs mean() over
## every column of the year-by-month matrix
AirPassengers |>
  matrix(
    ncol = 12,
    byrow = TRUE,
    dimnames = list(1949:1960, month.abb)
  ) |>
  apply(MARGIN = 2, FUN = mean)
Jan Feb Mar Apr May Jun Jul Aug
241.7500 235.0000 270.1667 267.0833 271.8333 311.6667 351.3333 351.0833
Sep Oct Nov Dec
302.4167 266.5833 232.8333 261.8333
apply()
with MARGIN = 1
rowSums()
, colSums()
, rowMeans()
and colMeans()
all exist
lapply()
lapply()
is sapply()
x
as names
lapply()
vapply()
is for when you need a vector as output
## Count the rows of each file, returned as an integer vector named by file
f |>
  setNames(basename(f)) |>
  lapply(read_tsv) |>
  # vapply() checks that every call to `nrow` returns exactly one integer
  vapply(FUN = nrow, FUN.VALUE = integer(1))
AHR.poisson.n10.benchmark.tsv AHR.poisson.n100.benchmark.tsv
1 1
AHR.poisson.n1000.benchmark.tsv AHR.poisson.n250.benchmark.tsv
1 1
AHR.poisson.n50.benchmark.tsv AHR.poisson.n500.benchmark.tsv
1 1
purrr
tidyverse
package purrr
reimplements these using map()
map()
mostly replicates lapply()
$AHR.poisson.n10.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 11.2 00'11" 4172. 17559. 829. 1194. 0 0 385. 49.8
$AHR.poisson.n100.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 65.2 01'05" 4950. 18337. 3986. 4086. 0 0 496. 329.
$AHR.poisson.n1000.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 594. 09'54" 15535. 28940. 9731. 10368. 0 0 700. 4382.
$AHR.poisson.n250.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 154. 02'34" 6462. 19867. 4845. 5017. 0 0 667. 1034.
$AHR.poisson.n50.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 35.1 00'35" 4435. 17841. 914. 1264. 0 0 258. 94.6
$AHR.poisson.n500.benchmark.tsv
# A tibble: 1 × 10
s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load cpu_time
<dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 300. 04'59" 9814. 23201. 6805. 7132. 0 0 706. 2121.
purrr
map_int()
is like vapply()
setting integer(1)
as the output
map_dbl()
, map_chr()
, map_lgl()
map_dfr()
is a little different \(\implies\) will perform the bind_rows()
operation
# A tibble: 6 × 11
file s `h:m:s` max_rss max_vms max_uss max_pss io_in io_out mean_load
<chr> <dbl> <time> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 AHR.pois… 11.2 00'11" 4172. 17559. 829. 1194. 0 0 385.
2 AHR.pois… 65.2 01'05" 4950. 18337. 3986. 4086. 0 0 496.
3 AHR.pois… 594. 09'54" 15535. 28940. 9731. 10368. 0 0 700.
4 AHR.pois… 154. 02'34" 6462. 19867. 4845. 5017. 0 0 667.
5 AHR.pois… 35.1 00'35" 4435. 17841. 914. 1264. 0 0 258.
6 AHR.pois… 300. 04'59" 9814. 23201. 6805. 7132. 0 0 706.
# ℹ 1 more variable: cpu_time <dbl>
# A tibble: 3 × 2
species lm
<fct> <list>
1 Adelie <lm>
2 Gentoo <lm>
3 Chinstrap <lm>
## Fit bill length against body mass separately for each species, then pull
## the slope and its standard error out of every fitted model
penguins |>
  summarise(
    ## Wrapping the fit in list() stores each model in a list column
    lm = list(lm(bill_length_mm ~ body_mass_g)),
    .by = species
  ) |>
  mutate(
    ## One summary object per model, kept in a second list column
    summary = map(lm, summary),
    ## Slope: the fitted coefficient for body mass
    slope = map_dbl(lm, \(fit) coef(fit)[["body_mass_g"]]),
    ## Standard error: the matching cell of the summary's coefficient table
    se = map_dbl(
      summary,
      \(smry) coef(smry)["body_mass_g", "Std. Error"]
    )
  )
# A tibble: 3 × 5
species lm summary slope se
<fct> <list> <list> <dbl> <dbl>
1 Adelie <lm> <smmry.lm> 0.00319 0.000398
2 Gentoo <lm> <smmry.lm> 0.00409 0.000413
3 Chinstrap <lm> <smmry.lm> 0.00446 0.000918
bind_rows()
then dplyr::filter()
geom_errorbar_h()