Chi-squared test example using nycflights13 flights data

Chester Ismay

2018-05-14

Data preparation

library(nycflights13)
library(dplyr)
library(ggplot2)
library(stringr)
library(infer)
# Fix the RNG seed so the 500-row sample drawn below is reproducible.
set.seed(2017)

# Build a small analysis data set: drop rows containing any missing value,
# sample 500 flights, then derive two categorical variables and keep only
# the columns used in the examples.
#   season   - "winter" for months 10-12 and 1-3, "summer" for months 4-9
#   day_hour - "morning" for hours 1-12, "not morning" for hours 13-24;
#              an hour matching neither range (e.g. 0) is left as NA
fli_small <- flights %>%
  na.omit() %>%
  sample_n(size = 500) %>%
  mutate(
    season = case_when(
      month %in% c(1:3, 10:12) ~ "winter",
      month %in% 4:9 ~ "summer"
    ),
    day_hour = case_when(
      hour >= 1 & hour <= 12 ~ "morning",
      hour >= 13 & hour <= 24 ~ "not morning"
    )
  ) %>%
  select(arr_delay, dep_delay, season, day_hour, origin, carrier)

Two categorical variables (origin and season)

Calculate observed statistic

Using chisq_test in infer

# Run the chi-squared test of origin against season and extract the test
# statistic column as a bare value.
obs_chisq <- fli_small %>%
  chisq_test(formula = origin ~ season) %>%
  dplyr::pull(statistic)

The observed \(\chi^2\) statistic is 0.571898.

Or using another shortcut function in infer:

# Same observed statistic, obtained with the chisq_stat() shortcut; the data
# frame is passed as the first argument instead of being piped in.
obs_chisq <- chisq_stat(fli_small, formula = origin ~ season)

The observed \(\chi^2\) statistic is 0.571898.

Randomization approach to \(\chi^2\)-statistic

# Null distribution of the chi-squared statistic: declare the variables,
# state the independence null, draw 1000 permutation replicates, and compute
# the statistic for each replicate.
chisq_null_distn <- fli_small %>%
  specify(response = origin, explanatory = season) %>% # alt: origin ~ season
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")

# Plot the null distribution with the observed statistic marked on it.
visualize(chisq_null_distn, obs_stat = obs_chisq, direction = "greater")

Calculate the randomization-based \(p\)-value

# Proportion of permuted statistics at least as large as the observed one.
mean(chisq_null_distn$stat >= obs_chisq)
## [1] 0.748

Theoretical distribution

# Theoretical-only plot: there is no generate() step in this pipeline because
# no resampling is needed for method = "theoretical".
fli_small %>%
  specify(response = origin, explanatory = season) %>% # alt: origin ~ season
  hypothesize(null = "independence") %>%
  calculate(stat = "Chisq") %>%
  visualize(
    method = "theoretical",
    obs_stat = obs_chisq,
    direction = "right"
  )
## Warning: Check to make sure the conditions have been met for the
## theoretical method. `infer` currently does not check these for you.

Overlay appropriate \(\chi^2\) distribution on top of permuted statistics

# Draw 1000 permutation replicates, compute the statistic for each, and plot
# them with method = "both" so the theoretical curve is shown alongside.
fli_small %>%
  specify(response = origin, explanatory = season) %>% # alt: origin ~ season
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq") %>%
  visualize(
    method = "both",
    obs_stat = obs_chisq,
    direction = "right"
  )
## Warning: Check to make sure the conditions have been met for the
## theoretical method. `infer` currently does not check these for you.

Compute theoretical p-value

# Run the chi-squared test of origin against season and extract its p-value
# column as a bare value.
fli_small %>%
  chisq_test(formula = origin ~ season) %>%
  dplyr::pull(p_value)
## [1] 0.7513009