lab-1-review-nberzinsCP/Lab1RMD.Rmd at master · Cal-Poly-Advanced-R/lab-1-review-nberzinsCP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
---
title: "Lab1RMD"
author: "Brandon Kim, Shreya Ravilla"
date: '2023-04-06'
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(tidyverse)
```


```{r qqplot function in tidy}
# Draw a normal Q-Q plot of a numeric vector with ggplot2.
#
# The sorted sample values are paired with the quantiles of a normal
# distribution that has the sample's mean and standard deviation, and the line
# y = x is overlaid. Points lying close to the line suggest the sample is
# approximately normal.
#
# x: numeric vector; missing values are dropped before plotting.
# Returns a ggplot object (sample values on x, normal quantiles on y).
qqcreatetidy <- function(x) {
  # Drop NAs explicitly: sort() discards them silently, which would otherwise
  # leave the two plotted columns with different lengths.
  observed <- sort(x[!is.na(x)])

  # Use theoretical quantiles rather than random rnorm() draws so the plot is
  # reproducible and not blurred by simulation noise. ppoints() is increasing,
  # so the result is already sorted to match `observed`.
  normal_quantiles <- qnorm(
    ppoints(length(observed)),
    mean = mean(observed),
    sd = sd(observed)
  )

  # Column names are kept as `vector` / `normalized_values` so the default
  # axis labels of the plot stay the same.
  df <- data.frame(vector = observed, normalized_values = normal_quantiles)

  # Scatterplot of the matched quantiles with the reference line y = x.
  ggplot(df, aes(vector, normalized_values)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1) +
    ggtitle("qqplot")
}
```


```{r qqplot function in base}
# Draw a normal Q-Q plot of a numeric vector with base graphics.
#
# The sorted sample values are plotted against the quantiles of a normal
# distribution that has the sample's mean and standard deviation, with the
# line y = x overlaid for reference.
#
# x: numeric vector; missing values are dropped before plotting.
# Called for its side effect (drawing on the active graphics device);
# returns NULL invisibly.
qqcreatebase <- function(x) {
  # Drop NAs explicitly: sort() discards them silently, which would otherwise
  # make the x and y vectors passed to plot() differ in length.
  observed <- sort(x[!is.na(x)])

  # Theoretical quantiles (not random rnorm() draws) keep the plot
  # reproducible; ppoints() is increasing, so no extra sort is needed.
  normal_quantiles <- qnorm(
    ppoints(length(observed)),
    mean = mean(observed),
    sd = sd(observed)
  )

  # Axis labels are set explicitly so they match the original plot.
  plot(observed, normal_quantiles, xlab = "vector", ylab = "normalized_values")
  # add a y = x line
  abline(a = 0, b = 1)
  title("qqplot")

  # plot() itself returns NULL; hand that back invisibly so a top-level call
  # does not print a stray "NULL" next to the figure.
  invisible(NULL)
}
```


```{r readcsv}
# Resolve the data file relative to the project root, load it, and preview
# the first rows.
df_oscars <- "Oscars-demographics-DFE.csv" %>%
  here::here() %>%
  read_csv()
head(df_oscars)
```


```{r warmup 1 tidy}
# Which movie(s) won the most distinct awards among the five directing and
# acting categories? Reduce to unique movie/award pairs, count pairs per
# movie, and keep every movie tied for the maximum.
df_oscars %>%
  filter(
    award %in% c(
      "Best Director",
      "Best Actor",
      "Best Actress",
      "Best Supporting Actor",
      "Best Supporting Actress"
    )
  ) %>%
  distinct(movie, award) %>%
  count(movie, name = "unique") %>%
  filter(unique == max(unique))
```


```{r warmup 1 base}
# Base-R version: keep the five directing/acting categories, count the
# distinct awards won by each movie, then show the movie(s) with the most.
oscars_filtered <- subset(
  df_oscars,
  award %in% c(
    "Best Director",
    "Best Actor",
    "Best Actress",
    "Best Supporting Actor",
    "Best Supporting Actress"
  )
)

oscars_grouped <- aggregate(
  award ~ movie,
  data = oscars_filtered,
  FUN = function(awards) length(unique(awards))
)

subset(oscars_grouped, award == max(award))
```


```{r warmup 2 tidy}
# Most common first name among Best Actress winners: strip everything from
# the first space onward, tally each first name, and keep the top name(s).
df_oscars %>%
  filter(award == "Best Actress") %>%
  mutate(first_name = sub(" .*", "", person)) %>%
  group_by(first_name) %>%
  summarise(n = n()) %>%
  filter(n == max(n))
```


```{r warmup 2 base}

# Base-R version: tabulate Best Actress winners by first name and show the
# name(s) with the highest count.
oscars_actress <- df_oscars[df_oscars$award %in% "Best Actress", ]
oscars_actress$first_name <- sub(" .*", "", oscars_actress$person)
num_name <- table(oscars_actress$first_name)
max_name <- max(num_name)
as.data.frame(num_name)[as.vector(num_name) == max_name, ]
```

```{r warmup 3 tidy}
# Most common birth region: "New York City" has no ", <region>" suffix, so it
# is recoded to "Ny" first; the region is whatever follows the last ", ".
df_oscars %>%
  mutate(
    birthplace = replace(birthplace, birthplace %in% "New York City", "Ny")
  ) %>%
  mutate(region = sub(".*, ", "", birthplace)) %>%
  group_by(region) %>%
  tally() %>%
  filter(n == max(n))
```


```{r warmup 3 base}
# Base-R version. Note that this chunk modifies df_oscars in place: it recodes
# "New York City" to "Ny" and adds a `region` column (text after the last ", ").
df_oscars$birthplace[df_oscars$birthplace %in% "New York City"] <- "Ny"
df_oscars$region <- sub(".*, ", "", df_oscars$birthplace)
region_counts <- table(df_oscars$region)
max_region <- names(which(region_counts == max(region_counts)))
data.frame(region = max_region, n = max(region_counts))
```


```{r age and gender data cleaning}
library(lubridate)

# Derive the columns used by the age regressions below:
#   award_date            - 1 February of the award year (a stand-in date;
#                           presumably approximates the ceremony - confirm)
#   date_of_birth         - parsed from day-month-year text into a Date
#   age                   - whole years from date_of_birth to award_date
#   years_since_inception - award year minus 1927
df_oscars <- df_oscars %>%
  mutate(
    award_date = as.Date(paste0(year_of_award, "-02-01")),
    date_of_birth = as.Date(date_of_birth, "%d-%b-%Y"),
    # Birth dates written with a two-digit year parse as years 0-99; shift
    # those into the 1900s. The cutoff is `< 100` (not `< 99`) so that a
    # two-digit year of 99 is corrected as well.
    date_of_birth = update(
      date_of_birth,
      year = year(date_of_birth) + ifelse(year(date_of_birth) < 100, 1900, 0)
    ),
    age = floor(time_length(difftime(award_date, date_of_birth), "years")),
    years_since_inception = year_of_award - 1927
  )
```


```{r malelm}
# Winners of the two male acting categories, then a simple linear regression
# of winner age on years since the first awards.
df_males <- filter(
  df_oscars,
  award %in% c("Best Actor", "Best Supporting Actor")
)

lm_male <- lm(formula = age ~ years_since_inception, data = df_males)
summary(lm_male)
```
For male actors, our linear model is:
$$
\hat{age}=43.47876+0.07471(year)
$$
where year is the number of years since the inception of the Oscars. With a p-value of 0.05551, there is moderate evidence of a relationship between years since inception and the age of male award winners. The slope indicates that for each additional year since inception, the predicted age of Best Actor and Best Supporting Actor winners increases by 0.07471 years.


```{r femalelm}
# Winners of the two female acting categories, then a simple linear regression
# of winner age on years since the first awards.
df_females <- filter(
  df_oscars,
  award %in% c("Best Actress", "Best Supporting Actress")
)

lm_female <- lm(formula = age ~ years_since_inception, data = df_females)
summary(lm_female)
```

For female actors, our linear model is:
$$
\hat{age}=35.88847+0.03270(year)
$$
where year is the number of years since the inception of the Oscars. With a p-value of 0.4011, there is very weak evidence to suggest that there is a relationship between years since inception and the age of female award winners. However, if such a relationship did exist, the slope would indicate that for each additional year since inception, the predicted age of Best Actress and Best Supporting Actress winners increases by 0.03270 years.