Classwork 3

Author

Diya Bijoy

Published

September 22, 2025

Setup

library(tidyverse) # Sine qua non
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mosaic) # Out all-in-one package
Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2

The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.

Attaching package: 'mosaic'

The following object is masked from 'package:Matrix':

    mean

The following objects are masked from 'package:dplyr':

    count, do, tally

The following object is masked from 'package:purrr':

    cross

The following object is masked from 'package:ggplot2':

    stat

The following objects are masked from 'package:stats':

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var

The following objects are masked from 'package:base':

    max, mean, min, prod, range, sample, sum
library(ggformula) # Graphing package
library(skimr) # Looking at Data

Attaching package: 'skimr'

The following object is masked from 'package:mosaic':

    n_missing
library(janitor) # Clean the data

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(naniar) # Handle missing data

Attaching package: 'naniar'

The following object is masked from 'package:skimr':

    n_complete
library(visdat) # Visualise missing data
library(tinytable) # Printing Static Tables for our data

Attaching package: 'tinytable'

The following object is masked from 'package:ggplot2':

    theme_void
library(DT) # Interactive Tables for our data
library(crosstable) # Multiple variable summaries

Attaching package: 'crosstable'

The following object is masked from 'package:purrr':

    compact
library(ggplot2)
docVisits <- read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/AER/DoctorVisits.csv")
Rows: 5190 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): gender, private, freepoor, freerepat, nchronic, lchronic
dbl (7): rownames, visits, age, income, illness, reduced, health

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(docVisits)
Rows: 5,190
Columns: 13
$ rownames  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
$ visits    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, …
$ gender    <chr> "female", "female", "male", "male", "male", "female", "femal…
$ age       <dbl> 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, …
$ income    <dbl> 0.55, 0.45, 0.90, 0.15, 0.45, 0.35, 0.55, 0.15, 0.65, 0.15, …
$ illness   <dbl> 1, 1, 3, 1, 2, 5, 4, 3, 2, 1, 1, 2, 3, 4, 3, 2, 1, 1, 1, 1, …
$ reduced   <dbl> 4, 2, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 13, 7, 1, 0, 0, 1, 0, 0,…
$ health    <dbl> 1, 1, 0, 0, 1, 9, 2, 6, 5, 0, 0, 2, 1, 6, 0, 7, 5, 0, 0, 0, …
$ private   <chr> "yes", "yes", "no", "no", "no", "no", "no", "no", "yes", "ye…
$ freepoor  <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
$ freerepat <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
$ nchronic  <chr> "no", "no", "no", "no", "yes", "yes", "no", "no", "no", "no"…
$ lchronic  <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
docVisits_modified <- docVisits %>%
  # Replace common NA strings and numbers with actual NA
  naniar::replace_with_na_all(condition = ~ .x %in% common_na_strings) %>%
  naniar::replace_with_na_all(condition = ~ .x %in% common_na_numbers) %>%
  # Clean variable names
  janitor::clean_names(case = "snake") %>% # clean names

  # Convert character variables to factors
  mutate(
    gender = as_factor(gender),
    private = as_factor(private),
    freepoor = as_factor(freepoor),
    freerepat = as_factor(freerepat),
    nchronic = as_factor(nchronic),
    lchronic = as_factor(lchronic)
  ) %>%
  # arrange the character variables first
  dplyr::relocate(where(is.factor), .after = rownames)


docVisits_modified %>% glimpse()
Rows: 5,190
Columns: 13
$ rownames  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
$ gender    <fct> female, female, male, male, male, female, female, female, fe…
$ private   <fct> yes, yes, no, no, no, no, no, no, yes, yes, no, no, no, no, …
$ freepoor  <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
$ freerepat <fct> no, no, no, no, no, no, no, no, no, no, no, yes, no, no, no,…
$ nchronic  <fct> no, no, no, no, yes, yes, no, no, no, no, no, no, yes, yes, …
$ lchronic  <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
$ visits    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, …
$ age       <dbl> 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, …
$ income    <dbl> 0.55, 0.45, 0.90, 0.15, 0.45, 0.35, 0.55, 0.15, 0.65, 0.15, …
$ illness   <dbl> 1, 1, 3, 1, 2, 5, 4, 3, 2, 1, 1, 2, 3, 4, 3, 2, 1, 1, 1, 1, …
$ reduced   <dbl> 4, 2, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 13, 7, 1, 0, 0, 1, 0, 0,…
$ health    <dbl> 1, 1, 0, 0, 1, 9, 2, 6, 5, 0, 0, 2, 1, 6, 0, 7, 5, 0, 0, 0, …
docVisits_modified %>%
  DT::datatable(
    caption = htmltools::tags$caption(
      style = "caption-side: top; text-align: left; color: black; font-size: 150%;",
      "Doctor Visits Dataset (Clean)"
    ),
    options = list(pageLength = 10, autoWidth = TRUE)
  ) %>%
  DT::formatStyle(
    columns = names(docVisits_modified),
    fontFamily = "Roboto Condensed",
    fontSize = "12px"
  )

Summarising Qual Variables

docVisits_modified %>% count(gender) %>% tt()
gender n
female 2702
male 2488
docVisits_modified %>% count(private) %>% tt()
private n
yes 2298
no 2892
docVisits_modified %>% count(across(.cols = c(gender,private))) %>% tt()
gender private n
female yes 1269
female no 1433
male yes 1029
male no 1459
docVisits_modified %>% count(across(where(is.character))) %>% tt()
n
5190

Summarising Quant Variables

docVisits_modified %>% dplyr::summarise(mean_income = mean(income, na.rm = T))
# A tibble: 1 × 1
  mean_income
        <dbl>
1       0.583

Single Variable, Multiple Summaries

docVisits_modified %>%
  dplyr::summarise(
    mean_visits = mean(visits, na.rm = T),
    sd_visits = sd(visits, na.rm = T),
    min_visits = min(visits, na.rm = T),
    max_visits = max(visits, na.rm = T)
  )
# A tibble: 1 × 4
  mean_visits sd_visits min_visits max_visits
        <dbl>     <dbl>      <dbl>      <dbl>
1       0.302     0.798          0          9

Multiple Variables, Multiple Summaries

docVisits_modified %>%
  dplyr::summarise(across(.cols = c(visits, income), # select columns

    .fns = list(
      mean = ~ mean(., na.rm = T),
      sd = sd,
      min = min, max = max
    )
  ))
# A tibble: 1 × 8
  visits_mean visits_sd visits_min visits_max income_mean income_sd income_min
        <dbl>     <dbl>      <dbl>      <dbl>       <dbl>     <dbl>      <dbl>
1       0.302     0.798          0          9       0.583     0.369          0
# ℹ 1 more variable: income_max <dbl>

Grouped Summaries

One qual variable

docVisits_modified %>%
  group_by(gender) %>%
  summarize(average_visits = mean(visits), count = n())
# A tibble: 2 × 3
  gender average_visits count
  <fct>           <dbl> <int>
1 female          0.362  2702
2 male            0.236  2488

Two qual variable

docVisits_modified %>%
  group_by(age, gender) %>%
  summarize(average_visits = mean(visits), 
            max_visits = max(visits),
            min_visits = min(visits),count = n())
`summarise()` has grouped output by 'age'. You can override using the `.groups`
argument.
# A tibble: 24 × 6
# Groups:   age [12]
     age gender average_visits max_visits min_visits count
   <dbl> <fct>           <dbl>      <dbl>      <dbl> <int>
 1  0.19 female          0.276          8          0   351
 2  0.19 male            0.157          4          0   401
 3  0.22 female          0.229          5          0   484
 4  0.22 male            0.182          7          0   729
 5  0.27 female          0.425          7          0   186
 6  0.27 male            0.166          5          0   337
 7  0.32 female          0.356          7          0   104
 8  0.32 male            0.183          3          0   197
 9  0.37 female          0.377          3          0    61
10  0.37 male            0.153          3          0    85
# ℹ 14 more rows

Multiple Variables, Multiple Summaries

docVisits_modified %>%
  dplyr::summarise(across(
    .cols = c(visits, income),

    .fns = list(
      mean = ~ mean(., na.rm = T),
      sd = sd,
      min = min, max = max
    )
  ))
# A tibble: 1 × 8
  visits_mean visits_sd visits_min visits_max income_mean income_sd income_min
        <dbl>     <dbl>      <dbl>      <dbl>       <dbl>     <dbl>      <dbl>
1       0.302     0.798          0          9       0.583     0.369          0
# ℹ 1 more variable: income_max <dbl>

Cross Table

crosstable(visits + income ~ gender + freepoor,
  data = docVisits_modified
) %>%
  crosstable::as_flextable()

freepoor

no

yes

gender

female

male

female

male

visits

Min / Max

0 / 8.0

0 / 9.0

0 / 5.0

0 / 7.0

Med [IQR]

0 [0;0]

0 [0;0]

0 [0;0]

0 [0;0]

Mean (std)

0.4 (0.9)

0.2 (0.7)

0.2 (0.8)

0.1 (0.6)

N (NA)

2618 (0)

2350 (0)

84 (0)

138 (0)

income

Min / Max

0 / 1.5

0 / 1.5

0 / 1.1

0 / 1.1

Med [IQR]

0.3 [0.2;0.6]

0.6 [0.3;0.9]

0.2 [0.1;0.3]

0.2 [0.1;0.4]

Mean (std)

0.5 (0.3)

0.7 (0.4)

0.2 (0.2)

0.3 (0.2)

N (NA)

2618 (0)

2350 (0)

84 (0)

138 (0)

Using Qualitative variables
crosstable(freerepat + private ~ gender + freepoor,
  data = docVisits_modified
) %>%
  crosstable::as_flextable()

freepoor

no

yes

gender

female

male

female

male

freerepat

no

1801 (43.94%)

2076 (50.65%)

84 (2.05%)

138 (3.37%)

yes

817 (74.89%)

274 (25.11%)

0 (0%)

0 (0%)

private

yes

1269 (55.22%)

1029 (44.78%)

0 (0%)

0 (0%)

no

1349 (46.65%)

1321 (45.68%)

84 (2.90%)

138 (4.77%)

glimpse(docVisits_modified)
Rows: 5,190
Columns: 13
$ rownames  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
$ gender    <fct> female, female, male, male, male, female, female, female, fe…
$ private   <fct> yes, yes, no, no, no, no, no, no, yes, yes, no, no, no, no, …
$ freepoor  <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
$ freerepat <fct> no, no, no, no, no, no, no, no, no, no, no, yes, no, no, no,…
$ nchronic  <fct> no, no, no, no, yes, yes, no, no, no, no, no, no, yes, yes, …
$ lchronic  <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
$ visits    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, …
$ age       <dbl> 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, …
$ income    <dbl> 0.55, 0.45, 0.90, 0.15, 0.45, 0.35, 0.55, 0.15, 0.65, 0.15, …
$ illness   <dbl> 1, 1, 3, 1, 2, 5, 4, 3, 2, 1, 1, 2, 3, 4, 3, 2, 1, 1, 1, 1, …
$ reduced   <dbl> 4, 2, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0, 13, 7, 1, 0, 0, 1, 0, 0,…
$ health    <dbl> 1, 1, 0, 0, 1, 9, 2, 6, 5, 0, 0, 2, 1, 6, 0, 7, 5, 0, 0, 0, …

Graphs

set.seed(1947)
diamonds %>%
  slice_sample(n = 150, weight_by = cut) %>%
  gf_point(price ~ carat,
    colour = ~cut,
    shape = ~cut,
    size = 2, data = .
  ) %>%
  gf_labs(
    title = "Plot Title = DIAMONDS ARE FOREVER",
    subtitle = "Plot Subtitle = AND A GIRL'S BEST FRIEND",
    caption = "Plot Caption = From the diamonds dataset",
    x = "x-Axis Title = CARAT",
    y = "y-Axis Title = PRICE"
  ) %>%
  # Use same name for scales to merge legends
  gf_refine(
    scale_color_brewer(
      name = "Legend = DIAMOND QUALITY",
      palette = "Set1"
    ),
    scale_shape_manual(
      name = "Legend = DIAMOND QUALITY",
      values = c(15:21)
    )
  ) %>%
  gf_annotate("text",
    x = 1.0, y = 16000,
    label = "These DIAMONDS are\n Super Affordable!!",
    fontface = "bold",
    size = 2
  ) %>%
  gf_annotate("curve",
    x = 0.9,
    y = 14500,
    yend = 8000,
    xend = 0.95,
    linewidth = 0.5,
    curvature = 0.5,
    arrow = arrow(length = unit(0.25, "cm"))
  ) %>%
  gf_annotate(
    "rect",
    xmin = 1,
    xmax = 1.25,
    ymin = 2250,
    ymax = 10000,
    alpha = 0.5,
    fill = "grey80",
    col = "black"
  )

diamonds %>% gf_bar( ~ clarity | depth, fill=~clarity)

glimpse(diamonds)
Rows: 53,940
Columns: 10
$ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
$ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
$ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
$ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
$ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
$ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
$ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
$ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
$ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
$ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
taxi <- read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/modeldata/taxi.csv")
Rows: 10000 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): tip, company, local, dow, month
dbl (3): rownames, distance, hour

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
taxi_modified <- taxi %>%
  naniar::replace_with_na_all(condition = ~ .x %in% common_na_strings) %>%
  naniar::replace_with_na_all(condition = ~ .x %in% common_na_numbers) %>%
  janitor::clean_names(case = "snake") %>%
  janitor::remove_empty()
value for "which" not specified, defaulting to c("rows", "cols")
taxi_modified
# A tibble: 10,000 × 8
   rownames tip   distance company                      local dow   month  hour
      <dbl> <chr>    <dbl> <chr>                        <chr> <chr> <chr> <dbl>
 1        1 yes      17.2  Chicago Independents         no    Thu   Feb      16
 2        2 yes       0.88 City Service                 yes   Thu   Mar       8
 3        3 yes      18.1  other                        no    Mon   Feb      18
 4        4 yes      20.7  Chicago Independents         no    Mon   Apr       8
 5        5 yes      12.2  Chicago Independents         no    Sun   Mar      21
 6        6 yes       0.94 Sun Taxi                     yes   Sat   Apr      23
 7        7 yes      17.5  Flash Cab                    no    Fri   Mar      12
 8        8 yes      17.7  other                        no    Sun   Jan       6
 9        9 yes       1.85 Taxicab Insurance Agency Llc no    Fri   Apr      12
10       10 yes       1.47 City Service                 no    Tue   Mar      14
# ℹ 9,990 more rows
taxi_modified %>%
  gf_bar(~local,
    position = "dodge"
  ) %>%
  gf_labs(title = "Plot 2A: Dodged Bar Chart") %>%
  gf_refine(scale_fill_brewer(palette = "Set1"))