Classwork 5

Author

Diya Bijoy

Published

October 6, 2025

library(tidyverse) # Tidy data processing and plotting

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggformula)# Formula based plots

Loading required package: scales

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

Loading required package: ggridges

New to ggformula?  Try the tutorials: 
    learnr::run_tutorial("introduction", package = "ggformula")
    learnr::run_tutorial("refining", package = "ggformula")

library(ggplot2)
library(mosaic) # Our go-to package

Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2

The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.

Attaching package: 'mosaic'

The following object is masked from 'package:Matrix':

    mean

The following object is masked from 'package:scales':

    rescale

The following objects are masked from 'package:dplyr':

    count, do, tally

The following object is masked from 'package:purrr':

    cross

The following object is masked from 'package:ggplot2':

    stat

The following objects are masked from 'package:stats':

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var

The following objects are masked from 'package:base':

    max, mean, min, prod, range, sample, sum

library(skimr) # Another Data inspection package


Attaching package: 'skimr'

The following object is masked from 'package:mosaic':

    n_missing

library(GGally) # Corr plots
library(broom) # Clean reports from Stats / ML outputs

# library(devtools)
# devtools::install_github("rpruim/Lock5withR")
library(Lock5withR) # Datasets

library(easystats) # Easy Statistical Analysis and Charts

# Attaching packages: easystats 0.7.5 (red = needs update)
✔ bayestestR  0.17.0   ✔ correlation 0.8.8 
✖ datawizard  1.2.0    ✔ effectsize  1.0.1 
✔ insight     1.4.2    ✔ modelbased  0.13.0
✖ performance 0.15.1   ✔ parameters  0.28.2
✖ report      0.6.1    ✔ see         0.12.0

Restart the R-Session and update packages with `easystats::easystats_update()`.

library(correlation) # Different Types of Correlations

library(janitor) # Data cleaning and tidying package


Attaching package: 'janitor'

The following object is masked from 'package:insight':

    clean_names

The following objects are masked from 'package:datawizard':

    remove_empty, remove_empty_rows

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

library(visdat) # Visualize whole dataframes for missing data
library(naniar) # Clean missing data


Attaching package: 'naniar'

The following object is masked from 'package:skimr':

    n_complete

library(DT) # Interactive Tables for our data
library(tinytable) # Elegant Tables for our data


Attaching package: 'tinytable'

The following object is masked from 'package:ggplot2':

    theme_void

library(ggrepel) # Repelled Text Labels in ggplot
library(marquee) # Marquee Text Labels in ggplot
library(dplyr)

# Load the HollywoodMovies2011 dataset from the Lock5withR package.
data(HollywoodMovies2011, package = "Lock5withR")

# Build the cleaned analysis table:
#   1. convert column names to snake_case,
#   2. discard rows and columns that are entirely empty,
#   3. turn every character column into a factor,
#   4. move the factor columns to the front of the table.
movies_modified <-
  HollywoodMovies2011 %>%
  janitor::clean_names(case = "snake") %>%
  janitor::remove_empty(which = c("rows", "cols")) %>%
  dplyr::mutate(across(where(is.character), as.factor)) %>%
  dplyr::relocate(where(is.factor))

# Quantitative view of the data: first drop every row that has an NA in any
# column (factor columns included), then keep only the numeric variables.
movies_quant <- select(drop_na(movies_modified), where(is.numeric))

# Show which numeric variables are available.
names(movies_quant)

 [1] "rotten_tomatoes"      "audience_score"       "theaters_open_week"  
 [4] "bo_average_open_week" "domestic_gross"       "foreign_gross"       
 [7] "world_gross"          "budget"               "profitability"       
[10] "opening_weekend"

# Scatter plot of profitability (y) against budget (x), using complete cases
# only, with gf_smooth()'s default smoother drawn on top.
gf_point(profitability ~ budget, data = drop_na(movies_modified)) %>%
  gf_smooth() %>%
  gf_labs(
    title = "Profitability vs budget",
    subtitle = "Movie profitability: Does budget affect profitability?"
  )

`geom_smooth()` using method = 'loess'

# Same profitability-vs-budget scatter on complete cases, but with points
# coloured by the `story` variable and a linear-model fit layer (gf_lm)
# in place of the smoother.
gf_point(
  profitability ~ budget,
  data = drop_na(movies_modified),
  color = ~story
) %>%
  gf_lm() %>%
  gf_labs(
    title = "Profitability vs budget",
    subtitle = "Movie profitability: Does budget affect profitability?"
  )

# Studios with the fewest movies. slice_min() keeps ties by default, so more
# than 10 rows are returned when several studios share the cut-off count.
movies_modified %>%
  count(lead_studio) %>%
  slice_min(order_by = n, n = 10)

                 lead_studio n
1         Aardman Animations 1
2                  CBS Films 1
3       DreamWorks Animation 1
4  Happy Madison Productions 1
5              Miramax Films 1
6   Morgan Creek Productions 1
7            New Line Cinema 1
8                      Pixar 1
9        Regency Enterprises 1
10               Relativity  1
11    Reliance Entertainment 1
12   Sony Pictures Animation 1
13     Vertigo Entertainment 1
14 Village Roadshow Pictures 1
15                    Virgin 1

# The five studios with the most movies: count per studio, sort by
# descending count, keep the first five rows.
movies_modified %>%
  count(lead_studio) %>%
  arrange(desc(n)) %>%
  head(n = 5)

       lead_studio  n
1      Independent 32
2      Warner Bros 12
3 20th Century Fox  9
4        Universal  9
5           Disney  8

# Pairs plot #1: six quantitative variables on complete cases.
# Diagonal panels show densities; lower panels show scatter plots with a
# smoothing line (semi-transparent, no confidence band).
GGally::ggpairs(
  drop_na(movies_modified),
  columns = c(
    "profitability",
    "budget",
    "domestic_gross",
    "theaters_open_week",
    "opening_weekend",
    "rotten_tomatoes"
  ),
  title = "Movies Data Correlations Plot #1",
  diag = list(continuous = "densityDiag"),
  lower = list(continuous = wrap("smooth", alpha = 0.3, se = FALSE)),
  switch = "both",   # move facet strip labels to the bottom and left
  progress = FALSE   # no progress bar while panels are built
)

# Pairs plot #2: like plot #1, but `opening_weekend` is replaced by `genre`
# and the continuous diagonal panels use bar charts ("barDiag") instead of
# densities. Rows with any missing value are dropped first.
#
# Fix: the title previously read "... Plot #1", duplicating the title of the
# first pairs plot; it now identifies this figure as plot #2.
GGally::ggpairs(
  movies_modified %>% drop_na(),
  columns = c(
    "profitability", "budget", "domestic_gross",
    "theaters_open_week", "genre", "rotten_tomatoes"
  ),
  switch = "both",   # move facet strip labels to the bottom and left
  progress = FALSE,  # no progress bar while panels are built
  diag = list(continuous = "barDiag"),
  lower = list(continuous = wrap("smooth", alpha = 0.3, se = FALSE)),
  title = "Movies Data Correlations Plot #2"
)

`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Correlation test between domestic gross and budget, tidied into a one-row
# table and rendered with two-digit rounding.
# NOTE(review): unlike the plots above, this uses movies_modified without
# drop_na() -- confirm that is intended.
knitr::kable(
  broom::tidy(
    mosaic::cor_test(domestic_gross ~ budget, data = movies_modified)
  ),
  digits = 2,
  caption = "Movie Domestic Gross vs budget"
)

Movie Domestic Gross vs budget
estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
0.7	11.06	0	131	0.6	0.77	Pearson’s product-moment correlation	two.sided