# Reading tidytuesday data 
coffee_survey <- tt_load("2024-05-14")
## --- Compiling #TidyTuesday Information for 2024-05-14 ----
## --- There is 1 file available ---
## --- Starting Download ---
## 
##  Downloading file 1 of 1: `coffee_survey.csv`
## --- Download complete ---
coffee_data <- coffee_survey$coffee_survey
head(coffee_data)
## # A tibble: 6 × 57
##   submission_id age   cups  where_drink brew  brew_other purchase purchase_other
##   <chr>         <chr> <chr> <chr>       <chr> <chr>      <chr>    <chr>         
## 1 gMR29l        18-2… <NA>  <NA>        <NA>  <NA>       <NA>     <NA>          
## 2 BkPN0e        25-3… <NA>  <NA>        Pod/… <NA>       <NA>     <NA>          
## 3 W5G8jj        25-3… <NA>  <NA>        Bean… <NA>       <NA>     <NA>          
## 4 4xWgGr        35-4… <NA>  <NA>        Coff… <NA>       <NA>     <NA>          
## 5 QD27Q8        25-3… <NA>  <NA>        Pour… <NA>       <NA>     <NA>          
## 6 V0LPeM        55-6… <NA>  <NA>        Pod/… <NA>       <NA>     <NA>          
## # ℹ 49 more variables: favorite <chr>, favorite_specify <chr>, additions <chr>,
## #   additions_other <chr>, dairy <chr>, sweetener <chr>, style <chr>,
## #   strength <chr>, roast_level <chr>, caffeine <chr>, expertise <dbl>,
## #   coffee_a_bitterness <dbl>, coffee_a_acidity <dbl>,
## #   coffee_a_personal_preference <dbl>, coffee_a_notes <chr>,
## #   coffee_b_bitterness <dbl>, coffee_b_acidity <dbl>,
## #   coffee_b_personal_preference <dbl>, coffee_b_notes <chr>, …
# Exploring the dataset
summary(coffee_data)
##  submission_id          age                cups           where_drink       
##  Length:4042        Length:4042        Length:4042        Length:4042       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      brew            brew_other          purchase         purchase_other    
##  Length:4042        Length:4042        Length:4042        Length:4042       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    favorite         favorite_specify    additions         additions_other   
##  Length:4042        Length:4042        Length:4042        Length:4042       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     dairy            sweetener            style             strength        
##  Length:4042        Length:4042        Length:4042        Length:4042       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  roast_level          caffeine           expertise      coffee_a_bitterness
##  Length:4042        Length:4042        Min.   : 1.000   Min.   :1.000      
##  Class :character   Class :character   1st Qu.: 5.000   1st Qu.:1.000      
##  Mode  :character   Mode  :character   Median : 6.000   Median :2.000      
##                                        Mean   : 5.694   Mean   :2.141      
##                                        3rd Qu.: 7.000   3rd Qu.:3.000      
##                                        Max.   :10.000   Max.   :5.000      
##                                        NA's   :104      NA's   :244        
##  coffee_a_acidity coffee_a_personal_preference coffee_a_notes    
##  Min.   :1.000    Min.   :1.000                Length:4042       
##  1st Qu.:3.000    1st Qu.:2.000                Class :character  
##  Median :4.000    Median :3.000                Mode  :character  
##  Mean   :3.635    Mean   :3.311                                  
##  3rd Qu.:4.000    3rd Qu.:4.000                                  
##  Max.   :5.000    Max.   :5.000                                  
##  NA's   :263      NA's   :253                                    
##  coffee_b_bitterness coffee_b_acidity coffee_b_personal_preference
##  Min.   :1.000       Min.   :1.000    Min.   :1.000               
##  1st Qu.:2.000       1st Qu.:2.000    1st Qu.:2.000               
##  Median :3.000       Median :2.000    Median :3.000               
##  Mean   :3.013       Mean   :2.224    Mean   :3.069               
##  3rd Qu.:4.000       3rd Qu.:3.000    3rd Qu.:4.000               
##  Max.   :5.000       Max.   :5.000    Max.   :5.000               
##  NA's   :262         NA's   :275      NA's   :269                 
##  coffee_b_notes     coffee_c_bitterness coffee_c_acidity
##  Length:4042        Min.   :1.000       Min.   :1.000   
##  Class :character   1st Qu.:2.000       1st Qu.:2.000   
##  Mode  :character   Median :3.000       Median :2.000   
##                     Mean   :3.072       Mean   :2.367   
##                     3rd Qu.:4.000       3rd Qu.:3.000   
##                     Max.   :5.000       Max.   :5.000   
##                     NA's   :278         NA's   :291     
##  coffee_c_personal_preference coffee_c_notes     coffee_d_bitterness
##  Min.   :1.000                Length:4042        Min.   :1.000      
##  1st Qu.:2.000                Class :character   1st Qu.:1.000      
##  Median :3.000                Mode  :character   Median :2.000      
##  Mean   :3.065                                   Mean   :2.163      
##  3rd Qu.:4.000                                   3rd Qu.:3.000      
##  Max.   :5.000                                   Max.   :5.000      
##  NA's   :276                                     NA's   :275        
##  coffee_d_acidity coffee_d_personal_preference coffee_d_notes    
##  Min.   :1.000    Min.   :1.000                Length:4042       
##  1st Qu.:3.000    1st Qu.:2.000                Class :character  
##  Median :4.000    Median :4.000                Mode  :character  
##  Mean   :3.858    Mean   :3.376                                  
##  3rd Qu.:5.000    3rd Qu.:5.000                                  
##  Max.   :5.000    Max.   :5.000                                  
##  NA's   :277      NA's   :278                                    
##   prefer_abc         prefer_ad         prefer_overall         wfh           
##  Length:4042        Length:4042        Length:4042        Length:4042       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  total_spend         why_drink         why_drink_other       taste          
##  Length:4042        Length:4042        Length:4042        Length:4042       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  know_source         most_paid         most_willing        value_cafe       
##  Length:4042        Length:4042        Length:4042        Length:4042       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  spent_equipment    value_equipment       gender          gender_specify    
##  Length:4042        Length:4042        Length:4042        Length:4042       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  education_level    ethnicity_race     ethnicity_race_specify
##  Length:4042        Length:4042        Length:4042           
##  Class :character   Class :character   Class :character      
##  Mode  :character   Mode  :character   Mode  :character      
##                                                              
##                                                              
##                                                              
##                                                              
##  employment_status  number_children    political_affiliation
##  Length:4042        Length:4042        Length:4042          
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character     
##                                                             
##                                                             
##                                                             
## 

In October 2023, “world champion barista” James Hoffmann and coffee company Cometeer held the “Great American Coffee Taste Test” on YouTube, during which viewers were asked to fill out a survey about 4 coffees they ordered from Cometeer for the tasting. Let’s look at some charts that discribes the coffee consumptions.

# Setting the color palettes
brown_palette5 <- c("#553d2a", "#7e634e", "#907761", "#ab9680", "#c7b69f")
brown_palette7 <- c("#553d2a", "#69503c", "#7e634e", "#907761", "#ab9680", "#c7b69f", "#e3d7bf")
brown_palette10 <- c("#553d2a", "#5f4633", "#69503c", "#735945", "#7e634e", "#907761", "#ab9680", "#c7b69f", "#e3d7bf", "#fff9e1")
brown_palette12 <- c("#553d2a", "#5f4633", "#69503c", "#735945", "#7e634e", "#907761", "#ab9680", "#c7b69f","#d7cfc6", "#e3d7bf", "#fff9e1", "#efefec")
# Grouping and reordering the data to display the age distribution
coffee_data_filt_age <- coffee_data %>% 
  dplyr::filter(!is.na(age)) %>%
  group_by(age) %>%
  summarise(count = n())

new_order <- c("<18 years old", "18-24 years old", "25-34 years old", "35-44 years old", "45-54 years old", "55-64 years old", ">65 years old")
coffee_data_filt_age$age <- factor(coffee_data_filt_age$age, levels = new_order)
print(head(coffee_data_filt_age))
## # A tibble: 6 × 2
##   age             count
##   <fct>           <int>
## 1 18-24 years old   461
## 2 25-34 years old  1986
## 3 35-44 years old   960
## 4 45-54 years old   302
## 5 55-64 years old   187
## 6 <18 years old      20
# Plot to display the age distribution
Ageplot <- ggplot(coffee_data_filt_age, aes(x = age, y = count, fill = age)) +
  geom_bar(stat = "identity") +
  labs(title = "Age Distribution of Coffee Testers", x = "Age Group", y = "Number of Testers") +
  scale_fill_manual(values = brown_palette7) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(Ageplot)

# Grouping and reordering the data to display the brewing methods
coffee_data_cleaned_brew <- coffee_data %>%
  separate_rows(brew, sep = ", ") %>%
  mutate(brew = gsub(" \\(.*\\)$", "", brew)) %>%
  dplyr::filter(!is.na(brew)) %>%
  group_by(brew) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) 
coffee_data_cleaned_brew$brew <- factor(coffee_data_cleaned_brew$brew, levels = coffee_data_cleaned_brew$brew)
print(head(coffee_data_cleaned_brew))
## # A tibble: 6 × 2
##   brew                   count
##   <fct>                  <int>
## 1 Pour over               2295
## 2 Espresso                1518
## 3 French press             735
## 4 Other                    677
## 5 Coffee brewing machine   663
## 6 Cold brew                525
# Plot to display the brewing methods
Brewplot <- ggplot(coffee_data_cleaned_brew , aes(x = reorder(brew, -count), y = count, fill = brew)) +
  geom_bar(stat = "identity") +
  labs(title = "How do people like to brew their coffee?", x = "Brewing Method", y = "Testers") +
  scale_fill_manual(values = brown_palette10) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(Brewplot)

# Grouping and reordering the data to display the coffee likes
coffee_data_favorite <- coffee_data %>%
  mutate(favorite = if_else(favorite == "Other", favorite_specify, favorite)) %>%
  dplyr::filter(!is.na(favorite)) %>%
  group_by(favorite) %>%
  summarise(count = n()) %>%
  mutate(count = as.numeric(count))

coffee_data_favorite <- coffee_data_favorite %>%  
  mutate(favorite = if_else(count < 10, "Others", favorite)) %>%
  group_by(favorite) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

coffee_data_favorite$favorite <- factor(coffee_data_favorite$favorite, levels = coffee_data_favorite$favorite)
print(head(coffee_data_favorite))
## # A tibble: 6 × 2
##   favorite            count
##   <fct>               <dbl>
## 1 Pourover             1084
## 2 Latte                 680
## 3 Regular drip coffee   442
## 4 Cappuccino            341
## 5 Espresso              330
## 6 Cortado               313
# Plot to display the favorite coffees
Favoriteplot <- ggplot(coffee_data_favorite, aes(x = reorder(favorite, -count), y = count, fill = favorite)) +
  geom_bar(stat = "identity") +
  labs(title = "How do people like to drink their coffee?", x = "Favorite Coffee", y = "Testers") +
  scale_fill_manual(values = brown_palette12) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(Favoriteplot)

# Grouping and reordering the data to display where people like to drink coffee
coffee_data_place <- coffee_data %>%
  separate_rows(where_drink, sep = ", ") %>%
  dplyr::filter(!is.na(where_drink)) %>%
  group_by(where_drink) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) 
coffee_data_place$where_drink <- factor(coffee_data_place$where_drink, levels = coffee_data_place$where_drink)
print(head(coffee_data_place))
## # A tibble: 5 × 2
##   where_drink   count
##   <fct>         <int>
## 1 At home        3644
## 2 At the office  1430
## 3 At a cafe      1170
## 4 On the go       705
## 5 None of these    36
# Plot to display where people like to drink coffee
Whereplot <- ggplot(coffee_data_place, aes(x = reorder(where_drink, -count), y = count, fill = where_drink)) +
  geom_bar(stat = "identity") +
  labs(title = "Where do people like to drink their coffee?", x = "Favorite places", y = "Testers") +
  scale_fill_manual(values = brown_palette5) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(Whereplot)

Now let’s explore if there are any gender differences in how much they are willing to spend on a coffee?

# Cleaning and grouping the data to examine if there any difference in how much they are willing to spend on a coffee.
coffee_data_spending <- coffee_data %>%
  dplyr::filter(!is.na(gender) & !is.na(most_paid) & !is.na(most_willing) & !is.na(spent_equipment)) %>%
  mutate(most_paid = gsub("[^0-9.]", "", most_paid), most_paid = as.numeric(most_paid)) %>%
  mutate(most_willing = gsub("[^0-9.]", "", most_willing), most_willing = as.numeric(most_willing)) %>%
  mutate(spent_equipment = gsub("[^0-9.]", "", spent_equipment), spent_equipment = as.numeric(spent_equipment))

gender_spending <- coffee_data_spending %>%
  group_by(gender) %>%
  summarise(average_most_paid = mean(most_paid, na.rm = TRUE), average_most_willing = mean(most_willing, na.rm = TRUE), average_spent_equipment = mean(spent_equipment, na.rm = TRUE))

print(gender_spending)
## # A tibble: 5 × 4
##   gender           average_most_paid average_most_willing average_spent_equipm…¹
##   <chr>                        <dbl>                <dbl>                  <dbl>
## 1 Female                        484.                 555.                628249.
## 2 Male                          519.                 549.               1116968.
## 3 Non-binary                    627.                 693.                916812.
## 4 Other (please s…              582.                 575.               1161700 
## 5 Prefer not to s…              556.                 516.                865200.
## # ℹ abbreviated name: ¹​average_spent_equipment
# Perform ANOVA's to compare means between genders
anova_most_paid <- aov(most_paid ~ gender, data = coffee_data_spending)
anova_most_willing <- aov(most_willing ~ gender, data = coffee_data_spending)
anova_spent_equipment <- aov(spent_equipment ~ gender, data = coffee_data_spending)
summary(anova_most_paid)
##               Df    Sum Sq Mean Sq F value Pr(>F)  
## gender         4   2181967  545492   2.451 0.0441 *
## Residuals   3445 766682576  222549                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(anova_most_willing)
##               Df    Sum Sq Mean Sq F value Pr(>F)  
## gender         4   2091644  522911   1.965 0.0972 .
## Residuals   3445 916937316  266165                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(anova_spent_equipment)
##               Df    Sum Sq   Mean Sq F value   Pr(>F)    
## gender         4 1.492e+14 3.729e+13   10.45 2.07e-08 ***
## Residuals   3445 1.230e+16 3.570e+12                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# If ANOVA is significant, perform post-hoc tests
tukey_most_paid <- TukeyHSD(anova_most_paid)
tukey_spent_equipment <- TukeyHSD(anova_spent_equipment)
print(tukey_most_paid)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = most_paid ~ gender, data = coffee_data_spending)
## 
## $gender
##                                               diff         lwr       upr
## Male-Female                               34.76598  -16.994130  86.52608
## Non-binary-Female                        142.34625    7.203237 277.48927
## Other (please specify)-Female             97.69593 -333.812666 529.20452
## Prefer not to say-Female                  72.25865 -163.295311 307.81261
## Non-binary-Male                          107.58028  -22.493629 237.65419
## Other (please specify)-Male               62.92995 -367.018014 492.87792
## Prefer not to say-Male                    37.49268 -195.190060 270.17541
## Other (please specify)-Non-binary        -44.65033 -492.355341 403.05469
## Prefer not to say-Non-binary             -70.08760 -334.144002 193.96880
## Prefer not to say-Other (please specify) -25.43728 -512.942982 462.06843
##                                              p adj
## Male-Female                              0.3545126
## Non-binary-Female                        0.0331391
## Other (please specify)-Female            0.9722822
## Prefer not to say-Female                 0.9190457
## Non-binary-Male                          0.1592508
## Other (please specify)-Male              0.9946399
## Prefer not to say-Male                   0.9922490
## Other (please specify)-Non-binary        0.9987979
## Prefer not to say-Non-binary             0.9509147
## Prefer not to say-Other (please specify) 0.9999076
print(tukey_spent_equipment)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = spent_equipment ~ gender, data = coffee_data_spending)
## 
## $gender
##                                                diff        lwr       upr
## Male-Female                               488718.96   281423.5  696014.4
## Non-binary-Female                         288562.57  -252675.3  829800.4
## Other (please specify)-Female             533450.80 -1194709.4 2261611.0
## Prefer not to say-Female                  236951.12  -706425.2 1180327.4
## Non-binary-Male                          -200156.39  -721092.8  320780.0
## Other (please specify)-Male                44731.84 -1677178.1 1766641.8
## Prefer not to say-Male                   -251767.83 -1183645.1  680109.4
## Other (please specify)-Non-binary         244888.24 -1548137.4 2037913.9
## Prefer not to say-Non-binary              -51611.44 -1109137.9 1005915.0
## Prefer not to say-Other (please specify) -296499.68 -2248924.2 1655924.8
##                                              p adj
## Male-Female                              0.0000000
## Non-binary-Female                        0.5918830
## Other (please specify)-Female            0.9173220
## Prefer not to say-Female                 0.9596551
## Non-binary-Male                          0.8325651
## Other (please specify)-Male              0.9999943
## Prefer not to say-Male                   0.9477545
## Other (please specify)-Non-binary        0.9958973
## Prefer not to say-Non-binary             0.9999292
## Prefer not to say-Other (please specify) 0.9938233
# Descriptive statistics of genders and money spent on coffee
summary(gender_spending)
##     gender          average_most_paid average_most_willing
##  Length:5           Min.   :484.2     Min.   :516.1       
##  Class :character   1st Qu.:519.0     1st Qu.:548.8       
##  Mode  :character   Median :556.5     Median :555.4       
##                     Mean   :553.6     Mean   :577.6       
##                     3rd Qu.:581.9     3rd Qu.:574.8       
##                     Max.   :626.5     Max.   :693.1       
##  average_spent_equipment
##  Min.   : 628249        
##  1st Qu.: 865200        
##  Median : 916812        
##  Mean   : 937786        
##  3rd Qu.:1116968        
##  Max.   :1161700
descriptive_stats <- coffee_data_spending %>%
  group_by(gender) %>%
  summarise(
    count = n(),
    mean_most_paid = mean(most_paid, na.rm = TRUE),
    sd_most_paid = sd(most_paid, na.rm = TRUE),
    mean_most_willing = mean(most_willing, na.rm = TRUE),
    sd_most_willing = sd(most_willing, na.rm = TRUE),
    mean_spent_equipment = mean(spent_equipment, na.rm = TRUE),
    sd_spent_equipment = sd(spent_equipment, na.rm = TRUE)
  )
print(descriptive_stats)
## # A tibble: 5 × 8
##   gender     count mean_most_paid sd_most_paid mean_most_willing sd_most_willing
##   <chr>      <int>          <dbl>        <dbl>             <dbl>           <dbl>
## 1 Female       824           484.         444.              555.            492.
## 2 Male        2484           519.         480.              549.            523.
## 3 Non-binary   102           627.         479.              693.            546.
## 4 Other (pl…     9           582.         534.              575.            410.
## 5 Prefer no…    31           556.         453.              516.            514.
## # ℹ 2 more variables: mean_spent_equipment <dbl>, sd_spent_equipment <dbl>
# Reporting ANOVA results
report(anova_most_paid)
## The ANOVA (formula: most_paid ~ gender) suggests that:
## 
##   - The main effect of gender is statistically significant and very small (F(4,
## 3445) = 2.45, p = 0.044; Eta2 = 2.84e-03, 95% CI [3.78e-05, 1.00])
## 
## Effect sizes were labelled following Field's (2013) recommendations.
report(anova_spent_equipment)
## The ANOVA (formula: spent_equipment ~ gender) suggests that:
## 
##   - The main effect of gender is statistically significant and small (F(4, 3445)
## = 10.45, p < .001; Eta2 = 0.01, 95% CI [5.94e-03, 1.00])
## 
## Effect sizes were labelled following Field's (2013) recommendations.

Results

An Analysis of Variance (ANOVA) was conducted to compare the means of spending across different gender groups: - There is a significant difference in what is the most they’ve ever paid for a cup of coffee, with non-binary individuals spent significantly more than females. F(4, 3445) = 2.45, p = 0.044. Post-hoc comparisons using the Tukey HSD test indicated that the mean spending for non-binary (M = 627, SD = 479) was significantly higher than for females (M = 484, SD = 444), with a mean difference of 142.346 (95% CI, 7.203 to 277.489), p = 0.033. - There are no significant gender difference in how much they are willing to spend on a coffee. - There is a significant difference in how much they’ve spent on equipments, with man spent significantly more than females. F(4, 3445) = 10.45, p < .001 . Post-hoc comparisons using the Tukey HSD test indicated that the mean spending for males (M = 519, SD = 480) was significantly higher than for females (M = 484, SD = 444), with a mean difference of 488,718.96 (95% CI, 281423.5 to 696014.4), p < 0.0001.

#Presenting the results in a table
anova_summary_most_paid <- summary(anova_most_paid)
anova_summary_spent_equipment <- summary(anova_spent_equipment)

# Creating a data frame for the ANOVA table
anova_results <- data.frame(
  Variable = c("most_paid", "spent_equipment"),
  Df = c(anova_summary_most_paid[[1]]$Df[1], anova_summary_spent_equipment[[1]]$Df[1]),
  `Sum Sq` = c(anova_summary_most_paid[[1]]$`Sum Sq`[1], anova_summary_spent_equipment[[1]]$`Sum Sq`[1]),
  `Mean Sq` = c(anova_summary_most_paid[[1]]$`Mean Sq`[1], anova_summary_spent_equipment[[1]]$`Mean Sq`[1]),
  `F value` = c(anova_summary_most_paid[[1]]$`F value`[1], anova_summary_spent_equipment[[1]]$`F value`[1]),
  `Pr(>F)` = c(anova_summary_most_paid[[1]]$`Pr(>F)`[1], anova_summary_spent_equipment[[1]]$`Pr(>F)`[1])
)

# Format and display the table
anova_results %>%
  kable(format = "html", digits = 4, col.names = c("Variable", "Df", "Sum Sq", "Mean Sq", "F value", "Pr(>F)"), align = 'c') %>%
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed")) %>%
  column_spec(1, bold = TRUE) %>%
  add_header_above(c("ANOVA Main Effect Results" = 6))
ANOVA Main Effect Results
Variable Df Sum Sq Mean Sq F value Pr(>F)
most_paid 4 2.181967e+06 5.454917e+05 2.4511 0.0441
spent_equipment 4 1.491541e+14 3.728853e+13 10.4462 0.0000

Discussion

The analysis indicates that non-binary people spent the most for a cup of coffee in their life, and man are spending more on coffee equipment then women.

Can we predict how much coffee a person is going to drink a day based on how strongly needs coffee?

We will examine if the number of children (which presumably causes fatigue) and how strongly they need caffeine (the level of caffeine and strengths of the coffee).

# Cleaning the data
# Selecting the columns
selected_data <- c("number_children", "caffeine", "strength", "cups")
coffee_data_select <- coffee_data %>% 
  select(all_of(selected_data))

# Remove missing values
coffee_data_clean <- coffee_data_select %>% drop_na()
# Recoding variables
coffee_data_clean$cups[coffee_data_clean$cups == "More than 4"] <- "5"
coffee_data_clean$cups[coffee_data_clean$cups == "Less than 1"] <- "0"
coffee_data_clean$number_children[coffee_data_clean$number_children == "More than 3"] <- "4"
coffee_data_clean$number_children[coffee_data_clean$number_children == "None"] <- "0"
coffee_data_clean$caffeine[coffee_data_clean$caffeine == "Decaf"] <- "0"
coffee_data_clean$caffeine[coffee_data_clean$caffeine == "Half caff"] <- "1"
coffee_data_clean$caffeine[coffee_data_clean$caffeine == "Full caffeine"] <- "2"
coffee_data_clean$strength[coffee_data_clean$strength == "Weak"] <- "1"
coffee_data_clean$strength[coffee_data_clean$strength == "Somewhat light"] <- "2"
coffee_data_clean$strength[coffee_data_clean$strength == "Medium"] <- "3"
coffee_data_clean$strength[coffee_data_clean$strength == "Somewhat strong"] <- "4"
coffee_data_clean$strength[coffee_data_clean$strength == "Very strong"] <- "5"

# Set variables numeric
coffee_data_clean$cups <- as.numeric(coffee_data_clean$cups)
coffee_data_clean$number_children <- as.numeric(coffee_data_clean$number_children)
coffee_data_clean$caffeine <- as.numeric(coffee_data_clean$caffeine)
coffee_data_clean$strength <- as.numeric(coffee_data_clean$strength)
complex_model <- lm(cups ~ ., data = coffee_data_clean)
summary(complex_model)
## 
## Call:
## lm(formula = cups ~ ., data = coffee_data_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3012 -0.7620  0.0456  0.4114  3.8782 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.07791    0.09714   0.802    0.423    
## number_children  0.17344    0.01770   9.798   <2e-16 ***
## caffeine         0.46682    0.03872  12.055   <2e-16 ***
## strength         0.19235    0.02064   9.319   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9396 on 3378 degrees of freedom
## Multiple R-squared:  0.1001, Adjusted R-squared:  0.09926 
## F-statistic: 125.2 on 3 and 3378 DF,  p-value: < 2.2e-16
# Check normality assumption
qqnorm(residuals(complex_model))
qqline(residuals(complex_model))

# Shapiro-Wilk test
shapiro.test(residuals(complex_model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(complex_model)
## W = 0.95554, p-value < 2.2e-16
# Rainbow test
raintest(complex_model)
## 
##  Rainbow test
## 
## data:  complex_model
## Rain = 1.1658, df1 = 1691, df2 = 1687, p-value = 0.0008164
# Breusch-Pagan test
bptest(complex_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  complex_model
## BP = 11.188, df = 3, p-value = 0.01075
# Check the multicollinearity assumption
vif(complex_model)
## number_children        caffeine        strength 
##        1.000317        1.025048        1.024905
# Calculate robust standard errors
robust_se <- vcovHC(complex_model, type = "HC1")

# Display coefficients with robust standard errors
coeftest(complex_model, vcov = robust_se)
## 
## t test of coefficients:
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.077906   0.088145  0.8838   0.3768    
## number_children 0.173438   0.018903  9.1752   <2e-16 ***
## caffeine        0.466816   0.036335 12.8475   <2e-16 ***
## strength        0.192346   0.020871  9.2160   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Results

We performed a linear regression analysis to examine the relationship between the selected variables. To account for that the residuals are not normally distributed and the heteroscedasticity we used robust standard errors. - The intercept is not significant. - The number of children are significant predictor of daily coffee consumption, 0.1734 (p < 0.000). - The quantity of caffeine in a cup of coffee are significant predictor of daily coffee consumption, 0.4668 (p < 0.000). - The strength of the coffee are significant predictor of daily coffee consumption, 0.1923 (p < 0.000).

# Presenting the results in a table
# Extract coefficients, robust standard errors, t-values, and p-values
coefficients <- coef(summary(complex_model))
robust_se_vals <- sqrt(diag(robust_se))
t_values <- coefficients[, "Estimate"] / robust_se_vals
p_values <- 2 * pt(-abs(t_values), df = df.residual(complex_model))

# Create a data frame for the table
regression_results <- data.frame(
  Estimate = coefficients[, "Estimate"],
  `Robust Std. Error` = robust_se_vals,
  `t value` = t_values,
  `Pr(>|t|)` = p_values
)

# Format and display the table
regression_results %>%
  kable(format = "html", digits = 4, col.names = c("Predictor", "Estimate", "Robust Std. Error", "t value", "Pr(>|t|)")) %>%
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed")) %>%
  column_spec(1, bold = TRUE) %>%
  add_header_above(c("Regression Coefficients" = 5))
Regression Coefficients
Predictor Estimate Robust Std. Error t value Pr(>|t|)
(Intercept) 0.0779 0.0881 0.8838 0.3768
number_children 0.1734 0.0189 9.1752 0.0000
caffeine 0.4668 0.0363 12.8475 0.0000
strength 0.1923 0.0209 9.2160 0.0000

Discussion

The analysis indicates that all of the tested factors contributes positively for the daily coffee consumption. The more children someone has and the stronger they like coffee, the more they will consume in a day.