library(tidyverse)
library(palmerpenguins)
RAdelaide 2025
July 10, 2025
R
ggplot()
across()
in dplyr
rename_with()
dplyr
allows you to rename columns of a data.frame
using rename_with()
# A tibble: 344 × 8
Species Island Bill_length_mm Bill_depth_mm Flipper_length_mm Body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 2 more variables: Sex <fct>, Year <int>
across()
across()
is very powerful for this type of operation
across()
ends_with()
as_tibble(iris) |>
  summarise(
    ## Columns are picked with a tidy-select helper, then each selected column
    ## is handed to the function given in `.fns`
    across(.cols = ends_with("th"), .fns = mean),
    ## One row of summaries per species
    .by = Species
  )
# A tibble: 3 × 5
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
<fct> <dbl> <dbl> <dbl> <dbl>
1 setosa 5.01 3.43 1.46 0.246
2 versicolor 5.94 2.77 4.26 1.33
3 virginica 6.59 2.97 5.55 2.03
across()
as_tibble(iris) |>
  summarise(
    ## A named list of functions is applied to every selected column; the list
    ## names become the suffixes of the new columns (e.g. Sepal.Length_mn)
    across(.cols = ends_with("th"), .fns = list(mn = mean, sd = sd)),
    ## One row of summaries per species
    .by = Species
  )
# A tibble: 3 × 9
Species Sepal.Length_mn Sepal.Length_sd Sepal.Width_mn Sepal.Width_sd
<fct> <dbl> <dbl> <dbl> <dbl>
1 setosa 5.01 0.352 3.43 0.379
2 versicolor 5.94 0.516 2.77 0.314
3 virginica 6.59 0.636 2.97 0.322
# ℹ 4 more variables: Petal.Length_mn <dbl>, Petal.Length_sd <dbl>,
# Petal.Width_mn <dbl>, Petal.Width_sd <dbl>
across()
pivot_*()
functions
iris |>
  as_tibble() |>
  summarise(
    ## Mean & SD of every column ending in "th", calculated for each species
    across(.cols = ends_with("th"), .fns = list(mn = mean, sd = sd)),
    .by = Species
  ) |>
  ## Stack every summary column: `name` holds e.g. "Sepal.Length_mn" & `value`
  ## holds the matching number
  pivot_longer(cols = contains("_")) |>
  ## Split `name` at the underscore into the measured feature & the statistic
  ## separate_wider_delim() is the current replacement for the superseded
  ## separate()
  separate_wider_delim(
    cols = name, delim = "_", names = c("feature", "stat")
  ) |>
  ## Spread the statistics back out so `mn` & `sd` are columns again
  pivot_wider(names_from = stat, values_from = value)
# A tibble: 12 × 4
Species feature mn sd
<fct> <chr> <dbl> <dbl>
1 setosa Sepal.Length 5.01 0.352
2 setosa Sepal.Width 3.43 0.379
3 setosa Petal.Length 1.46 0.174
4 setosa Petal.Width 0.246 0.105
5 versicolor Sepal.Length 5.94 0.516
6 versicolor Sepal.Width 2.77 0.314
7 versicolor Petal.Length 4.26 0.470
8 versicolor Petal.Width 1.33 0.198
9 virginica Sepal.Length 6.59 0.636
10 virginica Sepal.Width 2.97 0.322
11 virginica Petal.Length 5.55 0.552
12 virginica Petal.Width 2.03 0.275
across()
mean()
(& sd()
) to produce NA
NA
values may appear differently in different columns
if_any() and if_all() are similar to across(), but apply logical tests
## Find all rows with a missing value in at least one column
as_tibble(penguins) |>
  dplyr::filter(
    ## if_any() works like across(), but for logical tests: is.na() is applied
    ## to every column & a row is kept when any of the tests is TRUE
    if_any(.cols = everything(), .fns = is.na)
  )
# A tibble: 11 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen NA NA NA NA
2 Adelie Torgersen 34.1 18.1 193 3475
3 Adelie Torgersen 42 20.2 190 4250
4 Adelie Torgersen 37.8 17.1 186 3300
5 Adelie Torgersen 37.8 17.3 180 3700
6 Adelie Dream 37.5 18.9 179 2975
7 Gentoo Biscoe 44.5 14.3 216 4100
8 Gentoo Biscoe 46.2 14.4 214 4650
9 Gentoo Biscoe 47.3 13.8 216 4725
10 Gentoo Biscoe 44.5 15.7 217 4875
11 Gentoo Biscoe NA NA NA NA
# ℹ 2 more variables: sex <fct>, year <int>
across()
With penguins
NA
values for the other species
penguins |>
  as_tibble() |>
  summarise(
    ## Select all numeric columns using `where()`
    ## This applies a logical test to each column & selects it if TRUE
    ## mean() is called with its defaults here, so missing values are not
    ## removed before averaging
    across(where(is.numeric), mean),
    .by = species
  )
# A tibble: 3 × 6
species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
<fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Adelie NA NA NA NA 2008.
2 Gentoo NA NA NA NA 2008.
3 Chinstrap 48.8 18.4 196. 3733. 2008.
across()
With penguins
mean()
can take the argument na.rm = TRUE
mean
here?
\(x)
as_tibble(penguins) |>
  summarise(
    ## The anonymous function \(x) wraps mean() so that `na.rm = TRUE` can be
    ## supplied for every numeric column
    across(where(is.numeric), \(x) mean(x, na.rm = TRUE)),
    ## One row of summaries per species
    .by = species
  )
# A tibble: 3 × 6
species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
<fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Adelie 38.8 18.3 190. 3701. 2008.
2 Gentoo 47.5 15.0 217. 5076. 2008.
3 Chinstrap 48.8 18.4 196. 3733. 2008.
R
Functions have three key components
formals()
body()
GlobalEnvironment
sd()
is a beautifully simple one
?sd
x: a numeric vector
na.rm: a logical value
na.rm is visible, but x is empty
x
sd
is passed to the internal environment as x
Global Environment
body()
## If x is not a vector or a factor, try coercing x to being a double
## If not possible, the function will error by default here
if (!(is.vector(x) || is.factor(x))) {
  x <- as.double(x)
}
## Now we have x in a suitable type of vector, find the square root of the variance
## `na.rm` is an argument of var(): sqrt() only accepts a single argument
sqrt(var(x, na.rm = na.rm))
x
can be manipulated as x
inside the function’s environment
z_score
R
object
R
object are some R
code
## Convert a vector of numbers to Z-scores
## x: a numeric vector
## na.rm: a logical value, passed to both mean() & sd()
## Returns each value of x with the mean subtracted, divided by the SD
z_score <- function(x, na.rm = FALSE) {
  ## The key elements we need for a Z-score are the mean & SD of a vector
  mn <- mean(x, na.rm = na.rm)
  sd <- sd(x, na.rm = na.rm)
  ## To calculate the z-score we subtract the mean, then divide by the SD
  ## The last line executed is what the function returns
  (x - mn) / sd # No need to assign this internally to an object
}
browser()
## z_score() with a call to browser() added as its first statement, so the
## body of the function can be stepped through interactively
## x: the vector to convert; na.rm: passed to both mean() & sd()
z_score <- function(x, na.rm = FALSE) {
browser() # Pause execution as soon as we call the function
## The key elements we need for a Z-score are the mean & SD of a vector
## Both are stored as objects inside the function only
mn <- mean(x, na.rm = na.rm)
sd <- sd(x, na.rm = na.rm)
## To calculate the z-score we subtract the mean, then divide by the SD
## The last line executed is what the function returns
(x - mn) / sd # No need to assign this internally to an object
}
## Calling the function is what triggers the browser() pause
## NOTE(review): `some_num` is not defined in the visible source - confirm it
## is created before this call
z_score(some_num)
z_score()
na.rm
and x
Browse[1]>
ls()
na.rm
and x
x
and see what you get
some_num
ls()
mn
sd
Q
to exit the browser & return to the Global Environment
mn and sd no longer exist \(\implies\) some_num is unchanged
From R v4.1 onwards, the shorthand \(x)
is the same as function(x)
...
)
R has a unique feature using the syntax ...
mean()
...
)
## Convert a vector of numbers to Z-scores, with extra arguments for mean()
## x: a numeric vector
## na.rm: a logical value, passed to both mean() & sd()
## ...: any additional arguments; these are passed to mean() only
## Returns each value of x with the mean subtracted, divided by the SD
z_score <- function(x, na.rm = FALSE, ...) {
  ## The key elements we need for a Z-score are the mean & SD of a vector
  ## Include the ellipsis here for any additional arguments
  mn <- mean(x, na.rm = na.rm, ...)
  ## sd() is not given the ellipsis, only `na.rm`
  sd <- sd(x, na.rm = na.rm)
  ## To calculate the z-score we subtract the mean, then divide by the SD
  ## The last line executed is what the function returns
  (x - mn) / sd # No need to assign this internally to an object
}
## NOTE(review): `some_num` is not defined in the visible source - confirm it
## is created before this call
z_score(some_num)
[1] -1.11224480 -0.92107772 -0.72991065 -0.53874357 -0.34757650 -0.15640942
[7] 0.03475765 0.22592472 0.41709180 0.60825887 2.51992962
...
)
mean
can take an argument trim
[1] -0.9558354 -0.7646683 -0.5735012 -0.3823341 -0.1911671 0.0000000
[7] 0.1911671 0.3823341 0.5735012 0.7646683 2.6763390
mean
and the outermost 10% of observations excluded
sd()
internally?
sd()
can’t take an argument called trim
R
is the S3
class
mean
using body(mean)
UseMethod("mean")
mean
exist
mean
that exist
[1] mean.Date mean.default mean.difftime mean.POSIXct
[5] mean.POSIXlt mean.quosure* mean.vctrs_vctr*
see '?methods' for accessing help and source code
mean.
mean.default()
mean.default()
will be called
.Internal(mean(x))
is called
.Internal
means the function is built right into the core R
code
R
Core
penguins
in Title Case
mm
(or Mm
)