library(tidyverse)RAdelaide 2024
July 10, 2024
Rggplot()

\(x) is shorthand for function(x) (since R v4.1)x\(x) using x as the underlying valuemutate() to modify columnsA function really has multiple aspects
formals() \(\implies\) the arguments we passbody() \(\implies\) the code that does stuffenvironment() where calculations take placeLet’s look through sd() starting at the help page ?sd
na.rm has a default value (FALSE)"_" to spaces
flagstats as x?xGlobalEnvironment but like “a separate bubble”modify_labels <- function(x) {
new_x <- str_replace_all(x, "_", " ") # Replace all '_' with spaces
new_x <- str_to_title(new_x) # Start each word with an uppercase letter
new_x <- str_wrap(new_x, width = 12) # Add line breaks after 12 characters
new_x # Return our final object
}
modify_labels(flagstats)[1] "Properly\nPaired Reads" "Unique\nAlignments"
12 internallywidth with default value of 12
modify_labels <- function(x, width = 12) {
new_x <- str_replace_all(x, "_", " ") # Replace all '_' with spaces
new_x <- str_to_title(new_x) # Start each word with an uppercase letter
new_x <- str_wrap(new_x, width = width) # Add line breaks where requested
new_x # Return our final object
}
modify_labels(flagstats)[1] "Properly\nPaired Reads" "Unique\nAlignments"
[1] "Properly Paired Reads" "Unique Alignments"
... as a function argument?str_wrapwidth, indent, exdent and whitespace_only...... inside str_wrapmodify_labels <- function(x, ...) {
new_x <- str_replace_all(x, "_", " ") # Replace all '_' with spaces
new_x <- str_to_title(new_x) # Start each word with an upper-case letter
new_x <- str_wrap(new_x, ...) # Add line breaks where requested
new_x # Return our final object
}
modify_labels(flagstats)
modify_labels(flagstats, width = 12)
modify_labels(flagstats, width = 12, indent = 5)R sees everything as vectorsflagstats
python, C, C++, perl etc step through vectors[1] "properly_paired_reads"
[1] "unique_alignments"
x is just a convention \(\implies\) can be anything (i, bob etc)R works on vectors
Iteration is probably our first, best guess…
# Never do this. It's just an example...
len <- c() # Initialise an empty object
for (x in vals) { # Step through 'vals' calling each element 'x'
len <- c(len, length(x)) # Add the values as we step through
}
len[1] 26 1000
The above:
lenxx and extends lenR Way to IterateRlapply
lapply(list, function)R Way to Iteratelapply(list, function, ...)R Way to Iteratelapply() will always return a listhead gave two elements of different typeslength gave two integer elementsR Way to Iteratemap_*() functionspurrr \(\implies\) core tidyverse packagemap_chr(), map_lgl(), map_dbl()[1] 104 2001
# A tibble: 6 × 10
Population SNP1 SNP2 SNP3 SNP4 SNP5 SNP6 SNP7 SNP8 SNP9
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Control AB BB AB BB BB BB BB AB AB
2 Control AB AB AB AB BB <NA> AA AA AB
3 Control BB AB AA AA AA AB AB BB AB
4 Control AB AB BB AB AB AB AA AA BB
5 Control BB BB AB BB AB AA AA AB AB
6 Control AB AB AA AA AB <NA> AB BB AB
Our task is to:
lapply() and functions will be our friendsacross()dplyr function
tidyselect helpersis.na() across all columns that start_with “SNP”# A tibble: 104 × 10
Population SNP1 SNP2 SNP3 SNP4 SNP5 SNP6 SNP7 SNP8 SNP9
<chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
2 Control FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
3 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
4 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
5 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
6 Control FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
7 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
8 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
9 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
10 Control FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# ℹ 94 more rows
summarise() we can count these accross all SNPs
pivot_longer() might helpsnps %>%
summarise(
across(starts_with("SNP"), \(x) sum(is.na(x)))
) %>%
pivot_longer(everything(), names_to = "locus", values_to = "missing")# A tibble: 2,000 × 2
locus missing
<chr> <int>
1 SNP1 3
2 SNP2 0
3 SNP3 1
4 SNP4 0
5 SNP5 2
6 SNP6 3
7 SNP7 1
8 SNP8 1
9 SNP9 1
10 SNP10 1
# ℹ 1,990 more rows
rmarkdown reportA allele acts in a dominant manner| A_TRUE | A_FALSE | |
|---|---|---|
| Control | a | b |
| Treat | c | d |
A alleleA statusTRUE/FALSE columns
snps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
arrange(locus)tibble for each locussnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_")))list columnsnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
slice(1) %>% pull(df)lapply() On Nested Columnslapply() to perform an analysis on every nested dfsnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
mutate(
ft = lapply(df, \(x) fisher.test(x[, c("A_TRUE", "A_FALSE")]))
)lapply() On Nested Columnshtestp.valuedouble (i.e. numeric)map_dbl() to grab these valueslapply() On Nested Columnssnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
mutate(
ft = lapply(df, \(x) fisher.test(x[, c("A_TRUE", "A_FALSE")])),
p = map_dbl(ft, \(x) x$p.value),
)lapply() On Nested Columnsestimatesnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
mutate(
ft = lapply(df, \(x) fisher.test(x[, c("A_TRUE", "A_FALSE")])),
OR = map_dbl(ft, \(x) x$estimate),
p = map_dbl(ft, \(x) x$p.value),
)lapply() On Nested Columnsdf againsnps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
mutate(
ft = lapply(df, \(x) fisher.test(x[, c("A_TRUE", "A_FALSE")])),
Control = map_int(df, \(x) dplyr::filter(x, Population == "Control")[["A_TRUE"]]),
Treat = map_int(df, \(x) dplyr::filter(x, Population == "Treat")[["A_TRUE"]]),
OR = map_dbl(ft, \(x) x$estimate),
p = map_dbl(ft, \(x) x$p.value),
)snps %>%
pivot_longer(starts_with("SNP"), names_to = "locus", values_to = "genotype") %>%
dplyr::filter(!is.na(genotype)) %>%
mutate(A = str_detect(genotype, "A")) %>%
summarise(n = dplyr::n(), .by = c(Population, locus, A)) %>%
pivot_wider(
names_from = "A", values_from = "n", values_fill = 0, names_prefix = "A_"
) %>%
nest(df = c(Population, starts_with("A_"))) %>%
mutate(
ft = lapply(df, \(x) fisher.test(x[, c("A_TRUE", "A_FALSE")])),
Control = map_int(df, \(x) dplyr::filter(x, Population == "Control")[["A_TRUE"]]),
Treat = map_int(df, \(x) dplyr::filter(x, Population == "Treat")[["A_TRUE"]]),
OR = map_dbl(ft, \(x) x$estimate),
p = map_dbl(ft, \(x) x$p.value),
adj_p = p.adjust(p, "bonferroni")
) %>%
arrange(p)# A tibble: 2,000 × 8
locus df ft Control Treat OR p adj_p
<chr> <list> <list> <int> <int> <dbl> <dbl> <dbl>
1 SNP1716 <tibble [2 × 3]> <htest> 47 8 38.7 2.17e-14 4.35e-11
2 SNP1236 <tibble [2 × 3]> <htest> 46 11 22.9 1.29e-11 2.57e- 8
3 SNP1618 <tibble [2 × 3]> <htest> 45 12 22.7 3.00e-11 6.00e- 8
4 SNP248 <tibble [2 × 3]> <htest> 44 10 18.8 7.91e-11 1.58e- 7
5 SNP1730 <tibble [2 × 3]> <htest> 43 10 17.0 3.27e-10 6.54e- 7
6 SNP311 <tibble [2 × 3]> <htest> 41 10 13.5 2.52e- 9 5.04e- 6
7 SNP1385 <tibble [2 × 3]> <htest> 45 14 14.0 3.70e- 9 7.40e- 6
8 SNP1647 <tibble [2 × 3]> <htest> 43 13 13.5 4.28e- 9 8.57e- 6
9 SNP8 <tibble [2 × 3]> <htest> 46 16 13.5 9.41e- 9 1.88e- 5
10 SNP1993 <tibble [2 × 3]> <htest> 40 11 11.7 1.74e- 8 3.47e- 5
# ℹ 1,990 more rows
For this we needed to understand
pivot_longer() and pivot_wider()list, vector and data.frame?integer and double valuestidyselect helper functions + dplyrlapply() with inline functionslapply() using map_*() to produce vector outputmap_*() are sapply() and vapply()
sapply() is slightly unpredictablevapply() is a bit more clunky but powerfulunlist(lapply(...))Why didn’t we?
