---
title: "01 Worksheet. Crash Course in Statistics (Summer 2025)"
subtitle: "Neuroscience Center Zurich, University of Zurich"
author: "Zofia Baranczuk"
date: "2025-08-25"
output: pdf_document
---

```{r setup, include=FALSE}
# Set the default chunk option echo = TRUE for the chunks in this document,
# so the R code of each chunk is shown in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
```

  
## 1. GDP.
Read the data set GDP.
We will focus on the year 2023.
Which country has the highest and which one has the lowest GDP in 2023? What are the GDPs of these countries?
What is the GDP of Switzerland?
Plot the histogram and the boxplot of GDP.
How many columns and how many rows does this data set have?
(Extra:) What is the GDP of all the countries whose names start with the letter "S"?
 

```{r gdp}
library(readr)
library(here)

# Read the GDP table from Data/GDP.csv (path resolved relative to the project
# root by here()). The columns used below are `Country Name` and `2023`.
GDP <- read_csv(here("Data", "GDP.csv"), show_col_types = FALSE)

# --- Highest GDP in 2023 ----
# which.max() returns the row index of the (first, if several) maximal element
# of the column; missing values are discarded.
ind_max <- which.max(GDP$`2023`)
GDP$`Country Name`[ind_max] # country name in that row
GDP$`2023`[ind_max]         # GDP value in that row

# If you run into issues with NAs:
# 1. na.omit() on a smaller data set.
#    Careful: if you keep more columns than the one you are interested in,
#    you can lose more data (a row is dropped if any of its columns is NA).
# 2. m <- max(GDP$`2023`, na.rm = TRUE)

# --- Lowest GDP in 2023 ----
ind_min <- which.min(GDP$`2023`)
GDP$`Country Name`[ind_min]
GDP$`2023`[ind_min]

# --- GDP of Switzerland ----
# %in% never returns NA, so a missing country name cannot add a spurious NA
# element to the result (which `==` would do).
GDP$`2023`[GDP$`Country Name` %in% "Switzerland"]

# --- Distribution of GDP across countries ----
hist(
  GDP$`2023`,
  col = "darkseagreen2",
  main = "Histogram of GDP per country in 2023"
)
boxplot(GDP$`2023`, col = "goldenrod2", main = "GDP per country, 2023")

# --- Size of the data set ----
print("nrow:")
nrow(GDP)
print("ncol:")
ncol(GDP)

# --- (Extra) Countries whose name starts with "S" ----
# startsWith() yields NA for a missing name; an NA in a logical row index would
# select an all-NA row, so missing names are explicitly mapped to FALSE.
idx <- startsWith(GDP$`Country Name`, "S") & !is.na(GDP$`Country Name`)
S_GDP <- GDP[idx, c("Country Name", "2023")]
S_GDP
```

## 2. Choose one data set of interest from 02DataSets (or use your own data set).
Create 1–2 simple plots (e.g., histogram, boxplot, scatter).
For each plot, add a one-sentence rationale: What do you learn from this view?
```{r}
# Restore the objects saved in Data/Diag1.RData into the workspace; the code
# below uses an object named Diag1 with the components Age and Sex.
load(here("Data", "Diag1.RData"))
# Boxplots of Age, one box per value of Sex, to compare the groups at a glance.
boxplot(Diag1$Age~Diag1$Sex)


# Separate histograms of Age for the rows with Sex equal to 1 and to 0.
# NOTE(review): presumably Sex is coded 0/1 -- confirm against the data set.
# Each call chooses its own bins and axis limits, so the two plots are not
# directly comparable.
hist(Diag1$Age[Diag1$Sex==1])
hist(Diag1$Age[Diag1$Sex==0])

# Extra: overlay both groups in one histogram. This is more useful than two
# separate plots, but takes a bit more code.
x0 <- Diag1$Age[Diag1$Sex == 0]
x1 <- Diag1$Age[Diag1$Sex == 1]

# Semi-transparent fill colours (alpha = 0.4), defined once so that the bars
# and the legend always agree.
col_sex0 <- rgb(0.2, 0.6, 0.9, 0.4)
col_sex1 <- rgb(0.9, 0.3, 0.3, 0.4)

# Bin edges shared by both groups (change n for more or fewer bins).
rng <- range(x0, x1, na.rm = TRUE)
breaks <- pretty(rng, n = 20)

# Compute both histograms on the common bins without drawing them yet.
h0 <- hist(x0, breaks = breaks, plot = FALSE)
h1 <- hist(x1, breaks = breaks, plot = FALSE)

# Vertical axis tall enough for the highest density bar of either group.
ylim <- c(0, max(c(h0$density, h1$density), na.rm = TRUE))

# Draw the Sex == 0 group first; it sets up titles and axes.
plot(
  h0,
  freq = FALSE,
  main = "Age distribution by Sex",
  xlab = "Age",
  ylab = "Density",
  ylim = ylim,
  col = col_sex0
)

# Add the Sex == 1 group on top of the existing plot.
plot(h1, freq = FALSE, add = TRUE, col = col_sex1)

legend(
  "topright",
  legend = c("Sex = 0", "Sex = 1"),
  fill = c(col_sex0, col_sex1),
  border = "grey30"
)

```