---
title: "05 Visualization. Crash Course in Statistics (Summer 2025)"
subtitle: "Neuroscience Center Zurich, University of Zurich"
author: "Zofia Baranczuk"
date: "2025-08-25"
output: pdf_document
---
```{r setup, include=FALSE}
# Default chunk option for the whole document: show the source code of
# every chunk in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
```

## 0. Load required packages
```{r}
library(ggplot2)  # plotting
library(dplyr)    # data wrangling
library(tidyr)    # more data wrangling
library(GGally)   # for ggpairs - a convenient way for a look at the whole data set
library(ggthemes) # if we would like to use some specific theme for a plot
library(lme4)     # sleepstudy data set

```


## 1. Our first ggplot: Reaction vs Days (sleepstudy)
A ggplot needs three things: a call to `ggplot()`, the data, and the aesthetic mappings (e.g. x and y). Then add a geom to actually draw something.

Pattern: `ggplot(data = df, aes(x = ..., y = ...)) + geom_*()`
```{r}
# Make the sleepstudy data set from the lme4 package available.
data("sleepstudy", package = "lme4")

# Scatter plot of Reaction against Days: data and x/y mapping go into
# ggplot(), the point layer is added with `+`.
ggplot(sleepstudy, mapping = aes(x = Days, y = Reaction)) +
  geom_point()

# Task: change the color of the points to green.

```

## 2. Subject-dependent color
```{r}
# Mapping color to Subject inside the point layer gives every subject
# its own color.
ggplot(sleepstudy, aes(x = Days, y = Reaction)) +
  geom_point(mapping = aes(color = Subject))

# Both plots start from the same data and x/y mapping, so that common part
# can be stored once and then combined with whatever layers we need.
ggbase <- ggplot(sleepstudy, aes(x = Days, y = Reaction))
ggbase +
  geom_point(mapping = aes(color = Subject))

```
## 3. Grouping per Subject
ggplot draws each layer separately for every group.
For geoms whose shape depends on the grouping (such as lines), ggplot determines the groups as follows:

If you set group = ..., that’s the grouping.

Otherwise, it infers groups from any discrete aesthetics you mapped (e.g., color, linetype, fill, shape), i.e. discrete aesthetics imply grouping.

```{r}
# Lines and points share the mapping set in ggplot(): grouped by Subject
# and colored by Subject.
ggplot(
  data = sleepstudy,
  mapping = aes(x = Days, y = Reaction, group = Subject, color = Subject)
) +
  # alpha sets the opacity: 1 = fully opaque, 0 = fully transparent
  geom_line(alpha = 0.7) +
  geom_point(alpha = 0.5)

# Tasks:
# 1. Check what happens if you remove "group = Subject" or "color = Subject".
# 2. Color only the points by subject and keep the lines black.
# 3. Make the point size proportional to the reaction time.
```

## 4. Add a Trend Line.
`geom_smooth()` fits a model and draws the fitted line (plus an optional confidence band). Because our plot already groups by Subject, we override that for the smoother with `aes(group = 1)` to get one overall trend. Choose `method = "lm"` (linear) or `method = "loess"` (smooth). The `se` argument toggles the confidence ribbon, and `level` sets its confidence level (0.99 in the example below).
```{r}
ggplot(
  data = sleepstudy,
  mapping = aes(x = Days, y = Reaction, group = Subject, color = Subject)
) +
  # alpha sets the opacity: 1 = fully opaque, 0 = fully transparent
  geom_line(alpha = 0.7) +
  geom_point(alpha = 0.5) +
  # aes(group = 1) overrides the per-subject grouping for this layer only,
  # so one straight line is fitted to all observations; the ribbon around
  # it is drawn at the 0.99 confidence level.
  geom_smooth(
    mapping = aes(group = 1),
    method = "lm",
    formula = y ~ x,
    level = 0.99,
    se = TRUE,
    color = "red"
  )
# We will come back to linear regression later;
# here it is used only in the context of plotting.

```
## 5. Labels and themes: an example
```{r}
# Same plot as in the previous section, now with a theme and proper labels.
ggplot(sleepstudy, aes(Days, Reaction, color = Subject, group = Subject)) +
  geom_line(alpha = 0.25) +
  geom_point(alpha = 0.6) +
  # One overall linear trend across all subjects. The formula is passed as
  # a formula object (not a string), consistent with the trend-line example
  # in the previous section.
  geom_smooth(
    aes(group = 1),
    method = "lm",
    formula = y ~ x,
    se = TRUE,
    color = "red"
  ) +
  theme_minimal() +
  # Must come after theme_minimal(): a complete theme added later would
  # reset this setting.
  theme(legend.position = "none") +
  labs(
    title = "Reaction time across days of sleep restriction",
    subtitle = "Each line = one subject; red line = overall linear trend",
    x = "Days of restriction",
    y = "Reaction time (ms)",
    caption = "Data: lme4::sleepstudy"
  )

# Task: How to add (or: not remove) the legend about which color corresponds to which subject?

```


## 6. ToothGrowth: Violin + jitter
```{r}
# View(ToothGrowth)

# Tooth length by dose, split by supplement type: the violins show the
# distribution, the jittered points show the individual observations.
ggplot(ToothGrowth, aes(x = factor(dose), y = len, fill = supp)) +
  geom_violin(trim = FALSE, alpha = 0.6) +
  geom_jitter(
    shape = 21, # fillable circle: fill follows supp, outline is set below
    color = "black",
    position = position_jitterdodge(
      jitter.width = 0.1,
      # Match the dodge of the default-width violins (0.9) so that the
      # points are centered on their violin.
      dodge.width = 0.9,
      # Fixed seed: the jitter, and therefore the figure, is identical
      # on every knit.
      seed = 1
    )
  ) +
  theme_minimal() +
  labs(
    title = "Tooth Growth by Supplement and Dose",
    x = "Dose (mg)",
    y = "Tooth Length"
  ) +
  scale_fill_brewer(palette = "Set2")


# Task: Swap the violin for just a geom_boxplot() and add coord_flip()—which communicates better?
```




## 7. Sleep time vs brain weight for each feeding type ("-vore")
Plot `sleep_total` against `brainwt` and add one smoothed line per feeding group showing the relationship between the two variables. The code below produces warnings; see the tasks at the end of the chunk.
```{r}
data(msleep)
# View(msleep)

# Keep only the four columns used below and drop every row that has a
# missing value in any of them.
msleep_small <- drop_na(
  select(msleep, sleep_total, brainwt, bodywt, vore)
)

ggplot(
  data = msleep_small,
  mapping = aes(x = brainwt, y = sleep_total, color = vore)
) +
  geom_point(size = 3, alpha = 0.7) +    # scatter plot
  geom_smooth(alpha = 0.2, se = TRUE) +  # one smoother per vore
  scale_x_log10() +                      # x axis on a log scale
  labs(
    title = "Sleep Time vs Brain Weight (log scale)",
    x = "Brain Weight (log)",
    y = "Total Sleep (hrs)"
  ) +
  theme_minimal()

# Tasks:
# 1. You get many warnings and overfitted models per group.
#    How could you change the smoother to lm for each group?
# 2. Which part of the code tells R that it needs to plot models per group?
# 3. Check the plot without the log scale.
#    Which version (with or without log scale) is easier to read?

```
## 8. Histograms in ggplot
Plot overlaid histograms and density curves of sleep duration for each group.
```{r}
# Overlaid histograms: position = "identity" draws the bars of all groups
# on top of each other instead of stacking them.
ggplot(
  data = msleep_small,
  mapping = aes(x = sleep_total, color = vore, fill = vore)
) +
  geom_histogram(bins = 16, position = "identity") +
  labs(
    title = "Sleep duration — overlaid histograms by feeding type",
    x = "Total sleep (hours/day)",
    y = "Count",
    color = "Vore",
    fill = "Vore"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")


# Overlaid density curves, one per feeding type.
ggplot(
  data = msleep_small,
  mapping = aes(x = sleep_total, fill = vore, color = vore)
) +
  geom_density(linewidth = 0.9) +
  labs(
    title = "Sleep duration — overlaid density curves by feeding type",
    x = "Total sleep (hours/day)",
    y = "Density",
    fill = "Vore",
    color = "Vore"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Task: add transparency to both plots.
```
## 9. Q-Q plots in ggplot
Prepare a Q-Q plot for `sleep_total`.
```{r}

# Q-Q plot of sleep_total: the points are the sample quantiles, the line
# is the reference line added by stat_qq_line().
ggplot(data = msleep, mapping = aes(sample = sleep_total)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Q–Q plot: total sleep") +
  theme_minimal()


# Task: Prepare a Q-Q plot for brainwt. Does it look normally distributed?
# How about log(brainwt)?

```



## 10. Pairwise explorer (GGally::ggpairs)
Prepare a pairs plot for `sleep_total`, `brainwt`, `bodywt`, and `vore` in `msleep`, using `vore` as the grouping factor. In the code below, `brainwt` and `bodywt` are log-transformed first.
```{r}

# Same four columns as before, with brainwt and bodywt on the log scale;
# rows with missing values are dropped afterwards.
logsmsleep_small <- msleep %>%
  select(sleep_total, brainwt, bodywt, vore) %>%
  mutate(
    brainwt = log(brainwt),
    bodywt = log(bodywt)
  ) %>%
  drop_na()

# 1. All four columns with the default panels, colored by vore.
ggpairs(
  data = logsmsleep_small,
  mapping = aes(color = vore),
  progress = FALSE
)

# 2. Numeric columns only: points below the diagonal, densities on the
#    diagonal, correlations (as text) above it.
ggpairs(
  data = logsmsleep_small,
  mapping = aes(color = vore),
  columns = c("sleep_total", "brainwt", "bodywt"),
  lower = list(continuous = wrap("points", alpha = 0.5)),
  diag = list(continuous = wrap("densityDiag", alpha = 0.3)),
  upper = list(continuous = wrap("cor", size = 4)),
  progress = FALSE
) +
  theme_minimal()


# 3. As above, but with smoothers instead of correlations above the
#    diagonal.
ggpairs(
  data = logsmsleep_small,
  mapping = aes(color = vore),
  columns = c("sleep_total", "brainwt", "bodywt"),
  lower = list(continuous = wrap("points", alpha = 0.5)),
  diag = list(continuous = wrap("densityDiag", alpha = 0.3)),
  upper = list(continuous = wrap("smooth", se = TRUE, alpha = 0.1)),
  progress = FALSE
) +
  theme_minimal()



```
