Classwork 5

Author

Diya Bijoy

Published

October 6, 2025

library(tidyverse) # Tidy data processing and plotting
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggformula)# Formula based plots
Loading required package: scales

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

Loading required package: ggridges

New to ggformula?  Try the tutorials: 
    learnr::run_tutorial("introduction", package = "ggformula")
    learnr::run_tutorial("refining", package = "ggformula")
library(ggplot2)
library(mosaic) # Our go-to package
Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2

The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.

Attaching package: 'mosaic'

The following object is masked from 'package:Matrix':

    mean

The following object is masked from 'package:scales':

    rescale

The following objects are masked from 'package:dplyr':

    count, do, tally

The following object is masked from 'package:purrr':

    cross

The following object is masked from 'package:ggplot2':

    stat

The following objects are masked from 'package:stats':

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var

The following objects are masked from 'package:base':

    max, mean, min, prod, range, sample, sum
library(skimr) # Another Data inspection package

Attaching package: 'skimr'

The following object is masked from 'package:mosaic':

    n_missing
library(GGally) # Corr plots
library(broom) # Clean reports from Stats / ML outputs

# library(devtools)
# devtools::install_github("rpruim/Lock5withR")
library(Lock5withR) # Datasets

library(easystats) # Easy Statistical Analysis and Charts
# Attaching packages: easystats 0.7.5 (red = needs update)
✔ bayestestR  0.17.0   ✔ correlation 0.8.8 
✖ datawizard  1.2.0    ✔ effectsize  1.0.1 
✔ insight     1.4.2    ✔ modelbased  0.13.0
✖ performance 0.15.1   ✔ parameters  0.28.2
✖ report      0.6.1    ✔ see         0.12.0

Restart the R-Session and update packages with `easystats::easystats_update()`.
library(correlation) # Different Types of Correlations

library(janitor) # Data cleaning and tidying package

Attaching package: 'janitor'

The following object is masked from 'package:insight':

    clean_names

The following objects are masked from 'package:datawizard':

    remove_empty, remove_empty_rows

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(visdat) # Visualize whole dataframes for missing data
library(naniar) # Clean missing data

Attaching package: 'naniar'

The following object is masked from 'package:skimr':

    n_complete
library(DT) # Interactive Tables for our data
library(tinytable) # Elegant Tables for our data

Attaching package: 'tinytable'

The following object is masked from 'package:ggplot2':

    theme_void
library(ggrepel) # Repelled Text Labels in ggplot
library(marquee) # Marquee Text Labels in ggplot
library(dplyr)
# Load the HollywoodMovies2011 dataset from the Lock5withR package.
data(HollywoodMovies2011, package = "Lock5withR")

# Cleaned copy of the data: snake_case column names, fully empty rows and
# columns removed, any character columns converted to factors, and the factor
# columns relocated (to the front, by relocate()'s default).
movies_modified <- HollywoodMovies2011 %>%
  janitor::clean_names(case = "snake") %>%
  janitor::remove_empty(which = c("rows", "cols")) %>%
  dplyr::mutate(dplyr::across(where(is.character), as.factor)) %>%
  dplyr::relocate(where(is.factor))

# Numeric columns only, restricted to rows with no missing values.
movies_quant <- movies_modified %>%
  tidyr::drop_na() %>%
  dplyr::select(where(is.numeric))

# Print the names of the quantitative variables.
names(movies_quant)
 [1] "rotten_tomatoes"      "audience_score"       "theaters_open_week"  
 [4] "bo_average_open_week" "domestic_gross"       "foreign_gross"       
 [7] "world_gross"          "budget"               "profitability"       
[10] "opening_weekend"     
# Scatter plot of profitability against budget on complete cases, with the
# default smoother drawn on top.
gf_point(profitability ~ budget, data = tidyr::drop_na(movies_modified)) %>%
  gf_smooth() %>%
  gf_labs(
    subtitle = "Movie profitability: Does budget affect profitability?",
    title = "Profitability vs budget"
  )
`geom_smooth()` using method = 'loess'

# Same scatter plot, with points coloured by `story` and linear fits added
# by gf_lm().
gf_point(
  profitability ~ budget,
  color = ~story,
  data = tidyr::drop_na(movies_modified)
) %>%
  gf_lm() %>%
  gf_labs(
    subtitle = "Movie profitability: Does budget affect profitability?",
    title = "Profitability vs budget"
  )

# Studios with the fewest films in the data.
# `count` is namespace-qualified because mosaic masks dplyr::count when it is
# attached; calling dplyr's version directly removes the dependence on that
# masking. `with_ties = TRUE` (slice_min's default) is spelled out because
# every studio tied at the cut-off is kept, so more than 10 rows can come back.
movies_modified %>%
  dplyr::count(lead_studio) %>%
  dplyr::slice_min(order_by = n, n = 10, with_ties = TRUE)
                 lead_studio n
1         Aardman Animations 1
2                  CBS Films 1
3       DreamWorks Animation 1
4  Happy Madison Productions 1
5              Miramax Films 1
6   Morgan Creek Productions 1
7            New Line Cinema 1
8                      Pixar 1
9        Regency Enterprises 1
10               Relativity  1
11    Reliance Entertainment 1
12   Sony Pictures Animation 1
13     Vertigo Entertainment 1
14 Village Roadshow Pictures 1
15                    Virgin 1
# Five studios with the most films: tabulate by studio, sort by descending
# count, keep the first five rows.
movies_modified %>%
  count(lead_studio) %>%
  arrange(desc(n)) %>%
  slice_head(n = 5)
       lead_studio  n
1      Independent 32
2      Warner Bros 12
3 20th Century Fox  9
4        Universal  9
5           Disney  8
# Pairs plot of six quantitative variables on complete cases: density curves
# on the diagonal, smoothed scatter plots (no confidence band) below it.
GGally::ggpairs(
  data = tidyr::drop_na(movies_modified),
  columns = c(
    "profitability",
    "budget",
    "domestic_gross",
    "theaters_open_week",
    "opening_weekend",
    "rotten_tomatoes"
  ),
  title = "Movies Data Correlations Plot #1",
  lower = list(continuous = GGally::wrap("smooth", alpha = 0.3, se = FALSE)),
  diag = list(continuous = "densityDiag"),
  switch = "both",
  progress = FALSE
)

# Second pairs plot: swaps `opening_weekend` for the categorical `genre` and
# uses histograms ("barDiag") on the diagonal for continuous variables.
# The title is numbered #2 so it is distinguishable from the first pairs plot
# (it previously repeated "Plot #1").
# NOTE(review): no bin count is supplied, so the histogram panels fall back to
# `stat_bin()`'s default of 30 bins and emit a message per panel; consider
# `wrap("barDiag", bins = ...)` to choose the binning deliberately.
GGally::ggpairs(
  movies_modified %>% drop_na(),
  columns = c(
    "profitability", "budget", "domestic_gross", "theaters_open_week",
    "genre", "rotten_tomatoes"
  ),
  switch = "both",
  progress = FALSE,
  diag = list(continuous = "barDiag"),
  lower = list(continuous = wrap("smooth", alpha = 0.3, se = FALSE)),
  title = "Movies Data Correlations Plot #2"
)
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Correlation test of domestic gross against budget, tidied into a one-row
# data frame and rendered as a table rounded to two decimals.
# Note: with digits = 2 a very small p-value is displayed as 0.
mosaic::cor_test(
  domestic_gross ~ budget,
  data = movies_modified
) %>%
  broom::tidy() %>%
  knitr::kable(caption = "Movie Domestic Gross vs budget", digits = 2)
Movie Domestic Gross vs budget
estimate statistic p.value parameter conf.low conf.high method alternative
0.7 11.06 0 131 0.6 0.77 Pearson’s product-moment correlation two.sided