library(tidyverse)
RAdelaide 2024
July 10, 2024
R
ggplot()
\(x)
is shorthand for function(x)
(since R v4.1)x
\(x)
using x
as the underlying valuemutate()
to modify columnsA function really has multiple aspects
formals()
\(\implies\) the arguments we passbody()
\(\implies\) the code that does stuffenvironment()
where calculations take placeLet’s look through sd()
starting at the help page ?sd
na.rm
has a default value (FALSE
)"_"
to spaces
flagstats
as x
?x
GlobalEnvironment
but like “a separate bubble”modify_labels <- function(x) {
new_x <- str_replace_all(x, "_", " ") # Replace all '_' with spaces
new_x <- str_to_title(new_x) # Start each word with an uppercase letter
new_x <- str_wrap(new_x, width = 12) # Add line breaks after 12 characters
new_x # Return our final object
}
modify_labels(flagstats)
[1] "Properly\nPaired Reads" "Unique\nAlignments"
12
internallywidth
with default value of 12
modify_labels <- function(x, width = 12) {
new_x <- str_replace_all(x, "_", " ") # Replace all '_' with spaces
new_x <- str_to_title(new_x) # Start each word with an uppercase letter
new_x <- str_wrap(new_x, width = width) # Add line breaks where requested
new_x # Return our final object
}
modify_labels(flagstats)
[1] "Properly\nPaired Reads" "Unique\nAlignments"
[1] "Properly Paired Reads" "Unique Alignments"
...
as a function argument?str_wrap
width
, indent
, exdent
and whitespace_only
...
...
inside str_wrap
modify_labels <- function(x, ...) {
new_x <- str_replace_all(x, "_", " ") # Replace all '_' with spaces
new_x <- str_to_title(new_x) # Start each word with an upper-case letter
new_x <- str_wrap(new_x, ...) # Add line breaks where requested
new_x # Return our final object
}
modify_labels(flagstats)
modify_labels(flagstats, width = 12)
modify_labels(flagstats, width = 12, indent = 5)
R
sees everything as vectorsflagstats
python
, C
, C++
, perl
etc step through vectors[1] "properly_paired_reads"
[1] "unique_alignments"
x
is just a convention \(\implies\) can be anything (i
, bob
etc)R
works on vectors
Iteration is probably our first, best guess…
# Never do this. It's just an example...
len <- c() # Initialise an empty object
for (x in vals) { # Step through 'vals' calling each element 'x'
len <- c(len, length(x)) # Add the values as we step through
}
len
[1] 26 1000
The above:
len
x
x
and extends len
R
Way to IterateR
lapply
lapply(list, function)
R
Way to Iteratelapply(list, function, ...)
R
Way to Iteratelapply()
will always return a listhead
gave two elements of different typeslength
gave two integer
elementsR
Way to Iteratemap_*()
functionspurrr
\(\implies\) core tidyverse
packagemap_chr()
, map_lgl()
, map_dbl()
[1] 104 2001
# A tibble: 6 × 10
Population SNP1 SNP2 SNP3 SNP4 SNP5 SNP6 SNP7 SNP8 SNP9
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Control AB BB AB BB BB BB BB AB AB
2 Control AB AB AB AB BB <NA> AA AA AB
3 Control BB AB AA AA AA AB AB BB AB
4 Control AB AB BB AB AB AB AA AA BB
5 Control BB BB AB BB AB AA AA AB AB
6 Control AB AB AA AA AB <NA> AB BB AB
Our task is to:
lapply()
and functions will be our friendsacross()
dplyr
function
tidyselect
helpersis.na()
across all columns that start_with “SNP”# A tibble: 104 × 10
Population SNP1 SNP2 SNP3 SNP4 SNP5 SNP6 SNP7 SNP8 SNP9
<chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
2 Control FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
3 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
4 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
5 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
6 Control FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
7 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
8 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
9 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
10 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# ℹ 94 more rows
summarise()
we can count these accross all SNPs
pivot_longer()
might helpsnps %>%
summarise(
across(starts_with("SNP"), \(x) sum(is.na(x)))
) %>%
pivot_longer(everything(), names_to = "locus", values_to = "missing")
# A tibble: 2,000 × 2
locus missing
<chr> <int>
1 SNP1 3
2 SNP2 0
3 SNP3 1
4 SNP4 0
5 SNP5 2
6 SNP6 3
7 SNP7 1
8 SNP8 1
9 SNP9 1
10 SNP10 1
# ℹ 1,990 more rows
rmarkdown
reportA
allele acts in a dominant mannerA_TRUE | A_FALSE | |
---|---|---|
Control | a | b |
Treat | c | d |
A
alleleA
statusTRUE/FALSE
columns
snps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
arrange(locus)
tibble
for each locussnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_")))
list
columnsnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
slice(1) %>% pull(df)
lapply()
On Nested Columnslapply()
to perform an analysis on every nested dfsnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
mutate(
ft = lapply(df, \(x) fisher.test(x[, c("A_TRUE", "A_FALSE")]))
)
lapply()
On Nested Columnshtest
p.value
double
(i.e. numeric
)map_dbl()
to grab these valueslapply()
On Nested Columnssnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
mutate(
ft = lapply(df, \(x) fisher.test(x[, c("A_TRUE", "A_FALSE")])),
p = map_dbl(ft, \(x) x$p.value),
)
lapply()
On Nested Columnsestimate
snps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
mutate(
ft = lapply(df, \(x) fisher.test(x[, c("A_TRUE", "A_FALSE")])),
OR = map_dbl(ft, \(x) x$estimate),
p = map_dbl(ft, \(x) x$p.value),
)
lapply()
On Nested Columnsdf
againsnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
mutate(
ft = lapply(df, \(x) fisher.test(x[, c("A_TRUE", "A_FALSE")])),
Control = map_int(df, \(x) dplyr::filter(x, Population == "Control")[["A_TRUE"]]),
Treat = map_int(df, \(x) dplyr::filter(x, Population == "Treat")[["A_TRUE"]]),
OR = map_dbl(ft, \(x) x$estimate),
p = map_dbl(ft, \(x) x$p.value),
)
snps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
mutate(
ft = lapply(df, \(x) fisher.test(x[, c("A_TRUE", "A_FALSE")])),
Control = map_int(df, \(x) dplyr::filter(x, Population == "Control")[["A_TRUE"]]),
Treat = map_int(df, \(x) dplyr::filter(x, Population == "Treat")[["A_TRUE"]]),
OR = map_dbl(ft, \(x) x$estimate),
p = map_dbl(ft, \(x) x$p.value),
adj_p = p.adjust(p, "bonferroni")
) %>%
arrange(p)
# A tibble: 2,000 × 8
locus df ft Control Treat OR p adj_p
<chr> <list> <list> <int> <int> <dbl> <dbl> <dbl>
1 SNP1716 <tibble [2 × 3]> <htest> 47 8 38.7 2.17e-14 4.35e-11
2 SNP1236 <tibble [2 × 3]> <htest> 46 11 22.9 1.29e-11 2.57e- 8
3 SNP1618 <tibble [2 × 3]> <htest> 45 12 22.7 3.00e-11 6.00e- 8
4 SNP248 <tibble [2 × 3]> <htest> 44 10 18.8 7.91e-11 1.58e- 7
5 SNP1730 <tibble [2 × 3]> <htest> 43 10 17.0 3.27e-10 6.54e- 7
6 SNP311 <tibble [2 × 3]> <htest> 41 10 13.5 2.52e- 9 5.04e- 6
7 SNP1385 <tibble [2 × 3]> <htest> 45 14 14.0 3.70e- 9 7.40e- 6
8 SNP1647 <tibble [2 × 3]> <htest> 43 13 13.5 4.28e- 9 8.57e- 6
9 SNP8 <tibble [2 × 3]> <htest> 46 16 13.5 9.41e- 9 1.88e- 5
10 SNP1993 <tibble [2 × 3]> <htest> 40 11 11.7 1.74e- 8 3.47e- 5
# ℹ 1,990 more rows
For this we needed to understand
pivot_longer()
and pivot_wider()
list
, vector
and data.frame
?integer
and double
valuestidyselect
helper functions + dplyr
lapply()
with inline functionslapply()
using map_*()
to produce vector outputmap_*()
are sapply()
and vapply()
sapply()
is slightly unpredictablevapply()
is a bit more clunky but powerfulunlist(lapply(...))
Why didn’t we?