Classwork 3

Author

Diya Bijoy

Published

September 22, 2025

Setup

library(tidyverse) # Sine qua non

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(mosaic) # Out all-in-one package

Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2

The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.

Attaching package: 'mosaic'

The following object is masked from 'package:Matrix':

    mean

The following objects are masked from 'package:dplyr':

    count, do, tally

The following object is masked from 'package:purrr':

    cross

The following object is masked from 'package:ggplot2':

    stat

The following objects are masked from 'package:stats':

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var

The following objects are masked from 'package:base':

    max, mean, min, prod, range, sample, sum

library(ggformula) # Graphing package
library(skimr) # Looking at Data


Attaching package: 'skimr'

The following object is masked from 'package:mosaic':

    n_missing

library(janitor) # Clean the data


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

library(naniar) # Handle missing data


Attaching package: 'naniar'

The following object is masked from 'package:skimr':

    n_complete

library(visdat) # Visualise missing data
library(tinytable) # Printing Static Tables for our data


Attaching package: 'tinytable'

The following object is masked from 'package:ggplot2':

    theme_void

library(DT) # Interactive Tables for our data
library(crosstable) # Multiple variable summaries


Attaching package: 'crosstable'

The following object is masked from 'package:purrr':

    compact

library(ggplot2)

docVisits <- read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/AER/DoctorVisits.csv")

Rows: 5190 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): gender, private, freepoor, freerepat, nchronic, lchronic
dbl (7): rownames, visits, age, income, illness, reduced, health

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(docVisits)

Rows: 5,190
Columns: 13
$ rownames  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
$ visits    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, …
$ gender    <chr> "female", "female", "male", "male", "male", "female", "femal…
$ age       <dbl> 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, …
$ income    <dbl> 0.55, 0.45, 0.90, 0.15, 0.45, 0.35, 0.55, 0.15, 0.65, 0.15, …
$ illness   <dbl> 1, 1, 3, 1, 2, 5, 4, 3, 2, 1, 1, 2, 3, 4, 3, 2, 1, 1, 1, 1, …
$ reduced   <dbl> 4, 2, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 13, 7, 1, 0, 0, 1, 0, 0,…
$ health    <dbl> 1, 1, 0, 0, 1, 9, 2, 6, 5, 0, 0, 2, 1, 6, 0, 7, 5, 0, 0, 0, …
$ private   <chr> "yes", "yes", "no", "no", "no", "no", "no", "no", "yes", "ye…
$ freepoor  <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
$ freerepat <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
$ nchronic  <chr> "no", "no", "no", "no", "yes", "yes", "no", "no", "no", "no"…
$ lchronic  <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …

docVisits_modified <- docVisits %>%
  # Replace common NA strings and numbers with actual NA
  naniar::replace_with_na_all(condition = ~ .x %in% common_na_strings) %>%
  naniar::replace_with_na_all(condition = ~ .x %in% common_na_numbers) %>%
  # Clean variable names
  janitor::clean_names(case = "snake") %>% # clean names

  # Convert character variables to factors
  mutate(
    gender = as_factor(gender),
    private = as_factor(private),
    freepoor = as_factor(freepoor),
    freerepat = as_factor(freerepat),
    nchronic = as_factor(nchronic),
    lchronic = as_factor(lchronic)
  ) %>%
  # arrange the character variables first
  dplyr::relocate(where(is.factor), .after = rownames)


docVisits_modified %>% glimpse()

Rows: 5,190
Columns: 13
$ rownames  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
$ gender    <fct> female, female, male, male, male, female, female, female, fe…
$ private   <fct> yes, yes, no, no, no, no, no, no, yes, yes, no, no, no, no, …
$ freepoor  <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
$ freerepat <fct> no, no, no, no, no, no, no, no, no, no, no, yes, no, no, no,…
$ nchronic  <fct> no, no, no, no, yes, yes, no, no, no, no, no, no, yes, yes, …
$ lchronic  <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
$ visits    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, …
$ age       <dbl> 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, …
$ income    <dbl> 0.55, 0.45, 0.90, 0.15, 0.45, 0.35, 0.55, 0.15, 0.65, 0.15, …
$ illness   <dbl> 1, 1, 3, 1, 2, 5, 4, 3, 2, 1, 1, 2, 3, 4, 3, 2, 1, 1, 1, 1, …
$ reduced   <dbl> 4, 2, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 13, 7, 1, 0, 0, 1, 0, 0,…
$ health    <dbl> 1, 1, 0, 0, 1, 9, 2, 6, 5, 0, 0, 2, 1, 6, 0, 7, 5, 0, 0, 0, …

docVisits_modified %>%
  DT::datatable(
    caption = htmltools::tags$caption(
      style = "caption-side: top; text-align: left; color: black; font-size: 150%;",
      "Doctor Visits Dataset (Clean)"
    ),
    options = list(pageLength = 10, autoWidth = TRUE)
  ) %>%
  DT::formatStyle(
    columns = names(docVisits_modified),
    fontFamily = "Roboto Condensed",
    fontSize = "12px"
  )

Summarising Qual Variables

docVisits_modified %>% count(gender) %>% tt()

gender	n
female	2702
male	2488

docVisits_modified %>% count(private) %>% tt()

private	n
yes	2298
no	2892

docVisits_modified %>% count(across(.cols = c(gender,private))) %>% tt()

gender	private	n
female	yes	1269
female	no	1433
male	yes	1029
male	no	1459

docVisits_modified %>% count(across(where(is.character))) %>% tt()

n
5190

Summarising Quant Variables

docVisits_modified %>% dplyr::summarise(mean_income = mean(income, na.rm = T))

# A tibble: 1 × 1
  mean_income
        <dbl>
1       0.583

Single Variable, Multiple Summaries

docVisits_modified %>%
  dplyr::summarise(
    mean_visits = mean(visits, na.rm = T),
    sd_visits = sd(visits, na.rm = T),
    min_visits = min(visits, na.rm = T),
    max_visits = max(visits, na.rm = T)
  )

# A tibble: 1 × 4
  mean_visits sd_visits min_visits max_visits
        <dbl>     <dbl>      <dbl>      <dbl>
1       0.302     0.798          0          9

Multiple Variables, Multiple Summaries

docVisits_modified %>%
  dplyr::summarise(across(.cols = c(visits, income), # select columns

    .fns = list(
      mean = ~ mean(., na.rm = T),
      sd = sd,
      min = min, max = max
    )
  ))

# A tibble: 1 × 8
  visits_mean visits_sd visits_min visits_max income_mean income_sd income_min
        <dbl>     <dbl>      <dbl>      <dbl>       <dbl>     <dbl>      <dbl>
1       0.302     0.798          0          9       0.583     0.369          0
# ℹ 1 more variable: income_max <dbl>

Grouped Summaries

One qual variable

docVisits_modified %>%
  group_by(gender) %>%
  summarize(average_visits = mean(visits), count = n())

# A tibble: 2 × 3
  gender average_visits count
  <fct>           <dbl> <int>
1 female          0.362  2702
2 male            0.236  2488

Two qual variable

docVisits_modified %>%
  group_by(age, gender) %>%
  summarize(average_visits = mean(visits), 
            max_visits = max(visits),
            min_visits = min(visits),count = n())

`summarise()` has grouped output by 'age'. You can override using the `.groups`
argument.

# A tibble: 24 × 6
# Groups:   age [12]
     age gender average_visits max_visits min_visits count
   <dbl> <fct>           <dbl>      <dbl>      <dbl> <int>
 1  0.19 female          0.276          8          0   351
 2  0.19 male            0.157          4          0   401
 3  0.22 female          0.229          5          0   484
 4  0.22 male            0.182          7          0   729
 5  0.27 female          0.425          7          0   186
 6  0.27 male            0.166          5          0   337
 7  0.32 female          0.356          7          0   104
 8  0.32 male            0.183          3          0   197
 9  0.37 female          0.377          3          0    61
10  0.37 male            0.153          3          0    85
# ℹ 14 more rows

Multiple Variables, Multiple Summaries

docVisits_modified %>%
  dplyr::summarise(across(
    .cols = c(visits, income),

    .fns = list(
      mean = ~ mean(., na.rm = T),
      sd = sd,
      min = min, max = max
    )
  ))

# A tibble: 1 × 8
  visits_mean visits_sd visits_min visits_max income_mean income_sd income_min
        <dbl>     <dbl>      <dbl>      <dbl>       <dbl>     <dbl>      <dbl>
1       0.302     0.798          0          9       0.583     0.369          0
# ℹ 1 more variable: income_max <dbl>

Cross Table

crosstable(visits + income ~ gender + freepoor,
  data = docVisits_modified
) %>%
  crosstable::as_flextable()

freepoor		no		yes
gender		female	male	female	male
visits	Min / Max	0 / 8.0	0 / 9.0	0 / 5.0	0 / 7.0
	Med [IQR]	0 [0;0]	0 [0;0]	0 [0;0]	0 [0;0]
	Mean (std)	0.4 (0.9)	0.2 (0.7)	0.2 (0.8)	0.1 (0.6)
	N (NA)	2618 (0)	2350 (0)	84 (0)	138 (0)
income	Min / Max	0 / 1.5	0 / 1.5	0 / 1.1	0 / 1.1
	Med [IQR]	0.3 [0.2;0.6]	0.6 [0.3;0.9]	0.2 [0.1;0.3]	0.2 [0.1;0.4]
	Mean (std)	0.5 (0.3)	0.7 (0.4)	0.2 (0.2)	0.3 (0.2)
	N (NA)	2618 (0)	2350 (0)	84 (0)	138 (0)

Using Qualitative variables

crosstable(freerepat + private ~ gender + freepoor,
  data = docVisits_modified
) %>%
  crosstable::as_flextable()

freepoor		no		yes
gender		female	male	female	male
freerepat	no	1801 (43.94%)	2076 (50.65%)	84 (2.05%)	138 (3.37%)
freerepat	yes	817 (74.89%)	274 (25.11%)	0 (0%)	0 (0%)
private	yes	1269 (55.22%)	1029 (44.78%)	0 (0%)	0 (0%)
private	no	1349 (46.65%)	1321 (45.68%)	84 (2.90%)	138 (4.77%)

glimpse(docVisits_modified)

Rows: 5,190
Columns: 13
$ rownames  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
$ gender    <fct> female, female, male, male, male, female, female, female, fe…
$ private   <fct> yes, yes, no, no, no, no, no, no, yes, yes, no, no, no, no, …
$ freepoor  <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
$ freerepat <fct> no, no, no, no, no, no, no, no, no, no, no, yes, no, no, no,…
$ nchronic  <fct> no, no, no, no, yes, yes, no, no, no, no, no, no, yes, yes, …
$ lchronic  <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
$ visits    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, …
$ age       <dbl> 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, …
$ income    <dbl> 0.55, 0.45, 0.90, 0.15, 0.45, 0.35, 0.55, 0.15, 0.65, 0.15, …
$ illness   <dbl> 1, 1, 3, 1, 2, 5, 4, 3, 2, 1, 1, 2, 3, 4, 3, 2, 1, 1, 1, 1, …
$ reduced   <dbl> 4, 2, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 13, 7, 1, 0, 0, 1, 0, 0,…
$ health    <dbl> 1, 1, 0, 0, 1, 9, 2, 6, 5, 0, 0, 2, 1, 6, 0, 7, 5, 0, 0, 0, …

Graphs

set.seed(1947)
diamonds %>%
  slice_sample(n = 150, weight_by = cut) %>%
  gf_point(price ~ carat,
    colour = ~cut,
    shape = ~cut,
    size = 2, data = .
  ) %>%
  gf_labs(
    title = "Plot Title = DIAMONDS ARE FOREVER",
    subtitle = "Plot Subtitle = AND A GIRL'S BEST FRIEND",
    caption = "Plot Caption = From the diamonds dataset",
    x = "x-Axis Title = CARAT",
    y = "y-Axis Title = PRICE"
  ) %>%
  # Use same name for scales to merge legends
  gf_refine(
    scale_color_brewer(
      name = "Legend = DIAMOND QUALITY",
      palette = "Set1"
    ),
    scale_shape_manual(
      name = "Legend = DIAMOND QUALITY",
      values = c(15:21)
    )
  ) %>%
  gf_annotate("text",
    x = 1.0, y = 16000,
    label = "These DIAMONDS are\n Super Affordable!!",
    fontface = "bold",
    size = 2
  ) %>%
  gf_annotate("curve",
    x = 0.9,
    y = 14500,
    yend = 8000,
    xend = 0.95,
    linewidth = 0.5,
    curvature = 0.5,
    arrow = arrow(length = unit(0.25, "cm"))
  ) %>%
  gf_annotate(
    "rect",
    xmin = 1,
    xmax = 1.25,
    ymin = 2250,
    ymax = 10000,
    alpha = 0.5,
    fill = "grey80",
    col = "black"
  )

diamonds %>% gf_bar( ~ clarity | depth, fill=~clarity)

glimpse(diamonds)

Rows: 53,940
Columns: 10
$ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
$ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
$ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
$ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
$ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
$ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
$ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
$ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
$ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
$ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…

taxi <- read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/modeldata/taxi.csv")

Rows: 10000 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): tip, company, local, dow, month
dbl (3): rownames, distance, hour

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

taxi_modified <- taxi %>%
  naniar::replace_with_na_all(condition = ~ .x %in% common_na_strings) %>%
  naniar::replace_with_na_all(condition = ~ .x %in% common_na_numbers) %>%
  janitor::clean_names(case = "snake") %>%
  janitor::remove_empty()

value for "which" not specified, defaulting to c("rows", "cols")

taxi_modified

# A tibble: 10,000 × 8
   rownames tip   distance company                      local dow   month  hour
      <dbl> <chr>    <dbl> <chr>                        <chr> <chr> <chr> <dbl>
 1        1 yes      17.2  Chicago Independents         no    Thu   Feb      16
 2        2 yes       0.88 City Service                 yes   Thu   Mar       8
 3        3 yes      18.1  other                        no    Mon   Feb      18
 4        4 yes      20.7  Chicago Independents         no    Mon   Apr       8
 5        5 yes      12.2  Chicago Independents         no    Sun   Mar      21
 6        6 yes       0.94 Sun Taxi                     yes   Sat   Apr      23
 7        7 yes      17.5  Flash Cab                    no    Fri   Mar      12
 8        8 yes      17.7  other                        no    Sun   Jan       6
 9        9 yes       1.85 Taxicab Insurance Agency Llc no    Fri   Apr      12
10       10 yes       1.47 City Service                 no    Tue   Mar      14
# ℹ 9,990 more rows

taxi_modified %>%
  gf_bar(~local,
    position = "dodge"
  ) %>%
  gf_labs(title = "Plot 2A: Dodged Bar Chart") %>%
  gf_refine(scale_fill_brewer(palette = "Set1"))