# Reading tidytuesday data
coffee_survey <- tt_load("2024-05-14")
## --- Compiling #TidyTuesday Information for 2024-05-14 ----
## --- There is 1 file available ---
## --- Starting Download ---
##
## Downloading file 1 of 1: `coffee_survey.csv`
## --- Download complete ---
coffee_data <- coffee_survey$coffee_survey
head(coffee_data)
## # A tibble: 6 × 57
## submission_id age cups where_drink brew brew_other purchase purchase_other
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 gMR29l 18-2… <NA> <NA> <NA> <NA> <NA> <NA>
## 2 BkPN0e 25-3… <NA> <NA> Pod/… <NA> <NA> <NA>
## 3 W5G8jj 25-3… <NA> <NA> Bean… <NA> <NA> <NA>
## 4 4xWgGr 35-4… <NA> <NA> Coff… <NA> <NA> <NA>
## 5 QD27Q8 25-3… <NA> <NA> Pour… <NA> <NA> <NA>
## 6 V0LPeM 55-6… <NA> <NA> Pod/… <NA> <NA> <NA>
## # ℹ 49 more variables: favorite <chr>, favorite_specify <chr>, additions <chr>,
## # additions_other <chr>, dairy <chr>, sweetener <chr>, style <chr>,
## # strength <chr>, roast_level <chr>, caffeine <chr>, expertise <dbl>,
## # coffee_a_bitterness <dbl>, coffee_a_acidity <dbl>,
## # coffee_a_personal_preference <dbl>, coffee_a_notes <chr>,
## # coffee_b_bitterness <dbl>, coffee_b_acidity <dbl>,
## # coffee_b_personal_preference <dbl>, coffee_b_notes <chr>, …
# Exploring the dataset
summary(coffee_data)
## submission_id age cups where_drink
## Length:4042 Length:4042 Length:4042 Length:4042
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## brew brew_other purchase purchase_other
## Length:4042 Length:4042 Length:4042 Length:4042
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## favorite favorite_specify additions additions_other
## Length:4042 Length:4042 Length:4042 Length:4042
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## dairy sweetener style strength
## Length:4042 Length:4042 Length:4042 Length:4042
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## roast_level caffeine expertise coffee_a_bitterness
## Length:4042 Length:4042 Min. : 1.000 Min. :1.000
## Class :character Class :character 1st Qu.: 5.000 1st Qu.:1.000
## Mode :character Mode :character Median : 6.000 Median :2.000
## Mean : 5.694 Mean :2.141
## 3rd Qu.: 7.000 3rd Qu.:3.000
## Max. :10.000 Max. :5.000
## NA's :104 NA's :244
## coffee_a_acidity coffee_a_personal_preference coffee_a_notes
## Min. :1.000 Min. :1.000 Length:4042
## 1st Qu.:3.000 1st Qu.:2.000 Class :character
## Median :4.000 Median :3.000 Mode :character
## Mean :3.635 Mean :3.311
## 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000
## NA's :263 NA's :253
## coffee_b_bitterness coffee_b_acidity coffee_b_personal_preference
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :3.000 Median :2.000 Median :3.000
## Mean :3.013 Mean :2.224 Mean :3.069
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## NA's :262 NA's :275 NA's :269
## coffee_b_notes coffee_c_bitterness coffee_c_acidity
## Length:4042 Min. :1.000 Min. :1.000
## Class :character 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Median :3.000 Median :2.000
## Mean :3.072 Mean :2.367
## 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000
## NA's :278 NA's :291
## coffee_c_personal_preference coffee_c_notes coffee_d_bitterness
## Min. :1.000 Length:4042 Min. :1.000
## 1st Qu.:2.000 Class :character 1st Qu.:1.000
## Median :3.000 Mode :character Median :2.000
## Mean :3.065 Mean :2.163
## 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000
## NA's :276 NA's :275
## coffee_d_acidity coffee_d_personal_preference coffee_d_notes
## Min. :1.000 Min. :1.000 Length:4042
## 1st Qu.:3.000 1st Qu.:2.000 Class :character
## Median :4.000 Median :4.000 Mode :character
## Mean :3.858 Mean :3.376
## 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000
## NA's :277 NA's :278
## prefer_abc prefer_ad prefer_overall wfh
## Length:4042 Length:4042 Length:4042 Length:4042
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## total_spend why_drink why_drink_other taste
## Length:4042 Length:4042 Length:4042 Length:4042
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## know_source most_paid most_willing value_cafe
## Length:4042 Length:4042 Length:4042 Length:4042
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## spent_equipment value_equipment gender gender_specify
## Length:4042 Length:4042 Length:4042 Length:4042
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## education_level ethnicity_race ethnicity_race_specify
## Length:4042 Length:4042 Length:4042
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## employment_status number_children political_affiliation
## Length:4042 Length:4042 Length:4042
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
# Setting the color palettes
brown_palette5 <- c("#553d2a", "#7e634e", "#907761", "#ab9680", "#c7b69f")
brown_palette7 <- c("#553d2a", "#69503c", "#7e634e", "#907761", "#ab9680", "#c7b69f", "#e3d7bf")
brown_palette10 <- c("#553d2a", "#5f4633", "#69503c", "#735945", "#7e634e", "#907761", "#ab9680", "#c7b69f", "#e3d7bf", "#fff9e1")
brown_palette12 <- c("#553d2a", "#5f4633", "#69503c", "#735945", "#7e634e", "#907761", "#ab9680", "#c7b69f","#d7cfc6", "#e3d7bf", "#fff9e1", "#efefec")
# Grouping and reordering the data to display the age distribution
coffee_data_filt_age <- coffee_data %>%
dplyr::filter(!is.na(age)) %>%
group_by(age) %>%
summarise(count = n())
new_order <- c("<18 years old", "18-24 years old", "25-34 years old", "35-44 years old", "45-54 years old", "55-64 years old", ">65 years old")
coffee_data_filt_age$age <- factor(coffee_data_filt_age$age, levels = new_order)
print(head(coffee_data_filt_age))
## # A tibble: 6 × 2
## age count
## <fct> <int>
## 1 18-24 years old 461
## 2 25-34 years old 1986
## 3 35-44 years old 960
## 4 45-54 years old 302
## 5 55-64 years old 187
## 6 <18 years old 20
# Plot to display the age distribution
Ageplot <- ggplot(coffee_data_filt_age, aes(x = age, y = count, fill = age)) +
geom_bar(stat = "identity") +
labs(title = "Age Distribution of Coffee Testers", x = "Age Group", y = "Number of Testers") +
scale_fill_manual(values = brown_palette7) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(Ageplot)
# Grouping and reordering the data to display the brewing methods
coffee_data_cleaned_brew <- coffee_data %>%
separate_rows(brew, sep = ", ") %>%
mutate(brew = gsub(" \\(.*\\)$", "", brew)) %>%
dplyr::filter(!is.na(brew)) %>%
group_by(brew) %>%
summarise(count = n()) %>%
arrange(desc(count))
coffee_data_cleaned_brew$brew <- factor(coffee_data_cleaned_brew$brew, levels = coffee_data_cleaned_brew$brew)
print(head(coffee_data_cleaned_brew))
## # A tibble: 6 × 2
## brew count
## <fct> <int>
## 1 Pour over 2295
## 2 Espresso 1518
## 3 French press 735
## 4 Other 677
## 5 Coffee brewing machine 663
## 6 Cold brew 525
# Plot to display the brewing methods
Brewplot <- ggplot(coffee_data_cleaned_brew , aes(x = reorder(brew, -count), y = count, fill = brew)) +
geom_bar(stat = "identity") +
labs(title = "How do people like to brew their coffee?", x = "Brewing Method", y = "Testers") +
scale_fill_manual(values = brown_palette10) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(Brewplot)
# Grouping and reordering the data to display the coffee likes
coffee_data_favorite <- coffee_data %>%
mutate(favorite = if_else(favorite == "Other", favorite_specify, favorite)) %>%
dplyr::filter(!is.na(favorite)) %>%
group_by(favorite) %>%
summarise(count = n()) %>%
mutate(count = as.numeric(count))
coffee_data_favorite <- coffee_data_favorite %>%
mutate(favorite = if_else(count < 10, "Others", favorite)) %>%
group_by(favorite) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
coffee_data_favorite$favorite <- factor(coffee_data_favorite$favorite, levels = coffee_data_favorite$favorite)
print(head(coffee_data_favorite))
## # A tibble: 6 × 2
## favorite count
## <fct> <dbl>
## 1 Pourover 1084
## 2 Latte 680
## 3 Regular drip coffee 442
## 4 Cappuccino 341
## 5 Espresso 330
## 6 Cortado 313
# Plot to display the favorite coffees
Favoriteplot <- ggplot(coffee_data_favorite, aes(x = reorder(favorite, -count), y = count, fill = favorite)) +
geom_bar(stat = "identity") +
labs(title = "How do people like to drink their coffee?", x = "Favorite Coffee", y = "Testers") +
scale_fill_manual(values = brown_palette12) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(Favoriteplot)
# Grouping and reordering the data to display where people like to drink coffee
coffee_data_place <- coffee_data %>%
separate_rows(where_drink, sep = ", ") %>%
dplyr::filter(!is.na(where_drink)) %>%
group_by(where_drink) %>%
summarise(count = n()) %>%
arrange(desc(count))
coffee_data_place$where_drink <- factor(coffee_data_place$where_drink, levels = coffee_data_place$where_drink)
print(head(coffee_data_place))
## # A tibble: 5 × 2
## where_drink count
## <fct> <int>
## 1 At home 3644
## 2 At the office 1430
## 3 At a cafe 1170
## 4 On the go 705
## 5 None of these 36
# Plot to display where people like to drink coffee
Whereplot <- ggplot(coffee_data_place, aes(x = reorder(where_drink, -count), y = count, fill = where_drink)) +
geom_bar(stat = "identity") +
labs(title = "Where do people like to drink their coffee?", x = "Favorite places", y = "Testers") +
scale_fill_manual(values = brown_palette5) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(Whereplot)
# Cleaning and grouping the data to examine if there any difference in how much they are willing to spend on a coffee.
coffee_data_spending <- coffee_data %>%
dplyr::filter(!is.na(gender) & !is.na(most_paid) & !is.na(most_willing) & !is.na(spent_equipment)) %>%
mutate(most_paid = gsub("[^0-9.]", "", most_paid), most_paid = as.numeric(most_paid)) %>%
mutate(most_willing = gsub("[^0-9.]", "", most_willing), most_willing = as.numeric(most_willing)) %>%
mutate(spent_equipment = gsub("[^0-9.]", "", spent_equipment), spent_equipment = as.numeric(spent_equipment))
gender_spending <- coffee_data_spending %>%
group_by(gender) %>%
summarise(average_most_paid = mean(most_paid, na.rm = TRUE), average_most_willing = mean(most_willing, na.rm = TRUE), average_spent_equipment = mean(spent_equipment, na.rm = TRUE))
print(gender_spending)
## # A tibble: 5 × 4
## gender average_most_paid average_most_willing average_spent_equipm…¹
## <chr> <dbl> <dbl> <dbl>
## 1 Female 484. 555. 628249.
## 2 Male 519. 549. 1116968.
## 3 Non-binary 627. 693. 916812.
## 4 Other (please s… 582. 575. 1161700
## 5 Prefer not to s… 556. 516. 865200.
## # ℹ abbreviated name: ¹average_spent_equipment
# Perform ANOVA's to compare means between genders
anova_most_paid <- aov(most_paid ~ gender, data = coffee_data_spending)
anova_most_willing <- aov(most_willing ~ gender, data = coffee_data_spending)
anova_spent_equipment <- aov(spent_equipment ~ gender, data = coffee_data_spending)
summary(anova_most_paid)
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 4 2181967 545492 2.451 0.0441 *
## Residuals 3445 766682576 222549
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(anova_most_willing)
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 4 2091644 522911 1.965 0.0972 .
## Residuals 3445 916937316 266165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(anova_spent_equipment)
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 4 1.492e+14 3.729e+13 10.45 2.07e-08 ***
## Residuals 3445 1.230e+16 3.570e+12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# If ANOVA is significant, perform post-hoc tests
tukey_most_paid <- TukeyHSD(anova_most_paid)
tukey_spent_equipment <- TukeyHSD(anova_spent_equipment)
print(tukey_most_paid)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = most_paid ~ gender, data = coffee_data_spending)
##
## $gender
## diff lwr upr
## Male-Female 34.76598 -16.994130 86.52608
## Non-binary-Female 142.34625 7.203237 277.48927
## Other (please specify)-Female 97.69593 -333.812666 529.20452
## Prefer not to say-Female 72.25865 -163.295311 307.81261
## Non-binary-Male 107.58028 -22.493629 237.65419
## Other (please specify)-Male 62.92995 -367.018014 492.87792
## Prefer not to say-Male 37.49268 -195.190060 270.17541
## Other (please specify)-Non-binary -44.65033 -492.355341 403.05469
## Prefer not to say-Non-binary -70.08760 -334.144002 193.96880
## Prefer not to say-Other (please specify) -25.43728 -512.942982 462.06843
## p adj
## Male-Female 0.3545126
## Non-binary-Female 0.0331391
## Other (please specify)-Female 0.9722822
## Prefer not to say-Female 0.9190457
## Non-binary-Male 0.1592508
## Other (please specify)-Male 0.9946399
## Prefer not to say-Male 0.9922490
## Other (please specify)-Non-binary 0.9987979
## Prefer not to say-Non-binary 0.9509147
## Prefer not to say-Other (please specify) 0.9999076
print(tukey_spent_equipment)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = spent_equipment ~ gender, data = coffee_data_spending)
##
## $gender
## diff lwr upr
## Male-Female 488718.96 281423.5 696014.4
## Non-binary-Female 288562.57 -252675.3 829800.4
## Other (please specify)-Female 533450.80 -1194709.4 2261611.0
## Prefer not to say-Female 236951.12 -706425.2 1180327.4
## Non-binary-Male -200156.39 -721092.8 320780.0
## Other (please specify)-Male 44731.84 -1677178.1 1766641.8
## Prefer not to say-Male -251767.83 -1183645.1 680109.4
## Other (please specify)-Non-binary 244888.24 -1548137.4 2037913.9
## Prefer not to say-Non-binary -51611.44 -1109137.9 1005915.0
## Prefer not to say-Other (please specify) -296499.68 -2248924.2 1655924.8
## p adj
## Male-Female 0.0000000
## Non-binary-Female 0.5918830
## Other (please specify)-Female 0.9173220
## Prefer not to say-Female 0.9596551
## Non-binary-Male 0.8325651
## Other (please specify)-Male 0.9999943
## Prefer not to say-Male 0.9477545
## Other (please specify)-Non-binary 0.9958973
## Prefer not to say-Non-binary 0.9999292
## Prefer not to say-Other (please specify) 0.9938233
# Descriptive statistics of genders and money spent on coffee
summary(gender_spending)
## gender average_most_paid average_most_willing
## Length:5 Min. :484.2 Min. :516.1
## Class :character 1st Qu.:519.0 1st Qu.:548.8
## Mode :character Median :556.5 Median :555.4
## Mean :553.6 Mean :577.6
## 3rd Qu.:581.9 3rd Qu.:574.8
## Max. :626.5 Max. :693.1
## average_spent_equipment
## Min. : 628249
## 1st Qu.: 865200
## Median : 916812
## Mean : 937786
## 3rd Qu.:1116968
## Max. :1161700
descriptive_stats <- coffee_data_spending %>%
group_by(gender) %>%
summarise(
count = n(),
mean_most_paid = mean(most_paid, na.rm = TRUE),
sd_most_paid = sd(most_paid, na.rm = TRUE),
mean_most_willing = mean(most_willing, na.rm = TRUE),
sd_most_willing = sd(most_willing, na.rm = TRUE),
mean_spent_equipment = mean(spent_equipment, na.rm = TRUE),
sd_spent_equipment = sd(spent_equipment, na.rm = TRUE)
)
print(descriptive_stats)
## # A tibble: 5 × 8
## gender count mean_most_paid sd_most_paid mean_most_willing sd_most_willing
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Female 824 484. 444. 555. 492.
## 2 Male 2484 519. 480. 549. 523.
## 3 Non-binary 102 627. 479. 693. 546.
## 4 Other (pl… 9 582. 534. 575. 410.
## 5 Prefer no… 31 556. 453. 516. 514.
## # ℹ 2 more variables: mean_spent_equipment <dbl>, sd_spent_equipment <dbl>
# Reporting ANOVA results
report(anova_most_paid)
## The ANOVA (formula: most_paid ~ gender) suggests that:
##
## - The main effect of gender is statistically significant and very small (F(4,
## 3445) = 2.45, p = 0.044; Eta2 = 2.84e-03, 95% CI [3.78e-05, 1.00])
##
## Effect sizes were labelled following Field's (2013) recommendations.
report(anova_spent_equipment)
## The ANOVA (formula: spent_equipment ~ gender) suggests that:
##
## - The main effect of gender is statistically significant and small (F(4, 3445)
## = 10.45, p < .001; Eta2 = 0.01, 95% CI [5.94e-03, 1.00])
##
## Effect sizes were labelled following Field's (2013) recommendations.
An Analysis of Variance (ANOVA) was conducted to compare the means of spending across different gender groups: - There is a significant difference in what is the most they’ve ever paid for a cup of coffee, with non-binary individuals spent significantly more than females. F(4, 3445) = 2.45, p = 0.044. Post-hoc comparisons using the Tukey HSD test indicated that the mean spending for non-binary (M = 627, SD = 479) was significantly higher than for females (M = 484, SD = 444), with a mean difference of 142.346 (95% CI, 7.203 to 277.489), p = 0.033. - There are no significant gender difference in how much they are willing to spend on a coffee. - There is a significant difference in how much they’ve spent on equipments, with man spent significantly more than females. F(4, 3445) = 10.45, p < .001 . Post-hoc comparisons using the Tukey HSD test indicated that the mean spending for males (M = 519, SD = 480) was significantly higher than for females (M = 484, SD = 444), with a mean difference of 488,718.96 (95% CI, 281423.5 to 696014.4), p < 0.0001.
#Presenting the results in a table
anova_summary_most_paid <- summary(anova_most_paid)
anova_summary_spent_equipment <- summary(anova_spent_equipment)
# Creating a data frame for the ANOVA table
anova_results <- data.frame(
Variable = c("most_paid", "spent_equipment"),
Df = c(anova_summary_most_paid[[1]]$Df[1], anova_summary_spent_equipment[[1]]$Df[1]),
`Sum Sq` = c(anova_summary_most_paid[[1]]$`Sum Sq`[1], anova_summary_spent_equipment[[1]]$`Sum Sq`[1]),
`Mean Sq` = c(anova_summary_most_paid[[1]]$`Mean Sq`[1], anova_summary_spent_equipment[[1]]$`Mean Sq`[1]),
`F value` = c(anova_summary_most_paid[[1]]$`F value`[1], anova_summary_spent_equipment[[1]]$`F value`[1]),
`Pr(>F)` = c(anova_summary_most_paid[[1]]$`Pr(>F)`[1], anova_summary_spent_equipment[[1]]$`Pr(>F)`[1])
)
# Format and display the table
anova_results %>%
kable(format = "html", digits = 4, col.names = c("Variable", "Df", "Sum Sq", "Mean Sq", "F value", "Pr(>F)"), align = 'c') %>%
kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, bold = TRUE) %>%
add_header_above(c("ANOVA Main Effect Results" = 6))
Variable | Df | Sum Sq | Mean Sq | F value | Pr(>F) |
---|---|---|---|---|---|
most_paid | 4 | 2.181967e+06 | 5.454917e+05 | 2.4511 | 0.0441 |
spent_equipment | 4 | 1.491541e+14 | 3.728853e+13 | 10.4462 | 0.0000 |
The analysis indicates that non-binary people spent the most for a cup of coffee in their life, and man are spending more on coffee equipment then women.
We will examine if the number of children (which presumably causes fatigue) and how strongly they need caffeine (the level of caffeine and strengths of the coffee).
# Cleaning the data
# Selecting the columns
selected_data <- c("number_children", "caffeine", "strength", "cups")
coffee_data_select <- coffee_data %>%
select(all_of(selected_data))
# Remove missing values
coffee_data_clean <- coffee_data_select %>% drop_na()
# Recoding variables
coffee_data_clean$cups[coffee_data_clean$cups == "More than 4"] <- "5"
coffee_data_clean$cups[coffee_data_clean$cups == "Less than 1"] <- "0"
coffee_data_clean$number_children[coffee_data_clean$number_children == "More than 3"] <- "4"
coffee_data_clean$number_children[coffee_data_clean$number_children == "None"] <- "0"
coffee_data_clean$caffeine[coffee_data_clean$caffeine == "Decaf"] <- "0"
coffee_data_clean$caffeine[coffee_data_clean$caffeine == "Half caff"] <- "1"
coffee_data_clean$caffeine[coffee_data_clean$caffeine == "Full caffeine"] <- "2"
coffee_data_clean$strength[coffee_data_clean$strength == "Weak"] <- "1"
coffee_data_clean$strength[coffee_data_clean$strength == "Somewhat light"] <- "2"
coffee_data_clean$strength[coffee_data_clean$strength == "Medium"] <- "3"
coffee_data_clean$strength[coffee_data_clean$strength == "Somewhat strong"] <- "4"
coffee_data_clean$strength[coffee_data_clean$strength == "Very strong"] <- "5"
# Set variables numeric
coffee_data_clean$cups <- as.numeric(coffee_data_clean$cups)
coffee_data_clean$number_children <- as.numeric(coffee_data_clean$number_children)
coffee_data_clean$caffeine <- as.numeric(coffee_data_clean$caffeine)
coffee_data_clean$strength <- as.numeric(coffee_data_clean$strength)
complex_model <- lm(cups ~ ., data = coffee_data_clean)
summary(complex_model)
##
## Call:
## lm(formula = cups ~ ., data = coffee_data_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3012 -0.7620 0.0456 0.4114 3.8782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.07791 0.09714 0.802 0.423
## number_children 0.17344 0.01770 9.798 <2e-16 ***
## caffeine 0.46682 0.03872 12.055 <2e-16 ***
## strength 0.19235 0.02064 9.319 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9396 on 3378 degrees of freedom
## Multiple R-squared: 0.1001, Adjusted R-squared: 0.09926
## F-statistic: 125.2 on 3 and 3378 DF, p-value: < 2.2e-16
# Check normality assumption
qqnorm(residuals(complex_model))
qqline(residuals(complex_model))
# Shapiro-Wilk test
shapiro.test(residuals(complex_model))
##
## Shapiro-Wilk normality test
##
## data: residuals(complex_model)
## W = 0.95554, p-value < 2.2e-16
# Rainbow test
raintest(complex_model)
##
## Rainbow test
##
## data: complex_model
## Rain = 1.1658, df1 = 1691, df2 = 1687, p-value = 0.0008164
# Breusch-Pagan test
bptest(complex_model)
##
## studentized Breusch-Pagan test
##
## data: complex_model
## BP = 11.188, df = 3, p-value = 0.01075
# Check the multicollinearity assumption
vif(complex_model)
## number_children caffeine strength
## 1.000317 1.025048 1.024905
# Calculate robust standard errors
robust_se <- vcovHC(complex_model, type = "HC1")
# Display coefficients with robust standard errors
coeftest(complex_model, vcov = robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.077906 0.088145 0.8838 0.3768
## number_children 0.173438 0.018903 9.1752 <2e-16 ***
## caffeine 0.466816 0.036335 12.8475 <2e-16 ***
## strength 0.192346 0.020871 9.2160 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We performed a linear regression analysis to examine the relationship between the selected variables. To account for that the residuals are not normally distributed and the heteroscedasticity we used robust standard errors. - The intercept is not significant. - The number of children are significant predictor of daily coffee consumption, 0.1734 (p < 0.000). - The quantity of caffeine in a cup of coffee are significant predictor of daily coffee consumption, 0.4668 (p < 0.000). - The strength of the coffee are significant predictor of daily coffee consumption, 0.1923 (p < 0.000).
# Presenting the results in a table
# Extract coefficients, robust standard errors, t-values, and p-values
coefficients <- coef(summary(complex_model))
robust_se_vals <- sqrt(diag(robust_se))
t_values <- coefficients[, "Estimate"] / robust_se_vals
p_values <- 2 * pt(-abs(t_values), df = df.residual(complex_model))
# Create a data frame for the table
regression_results <- data.frame(
Estimate = coefficients[, "Estimate"],
`Robust Std. Error` = robust_se_vals,
`t value` = t_values,
`Pr(>|t|)` = p_values
)
# Format and display the table
regression_results %>%
kable(format = "html", digits = 4, col.names = c("Predictor", "Estimate", "Robust Std. Error", "t value", "Pr(>|t|)")) %>%
kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(1, bold = TRUE) %>%
add_header_above(c("Regression Coefficients" = 5))
Predictor | Estimate | Robust Std. Error | t value | Pr(>|t|) |
---|---|---|---|---|
(Intercept) | 0.0779 | 0.0881 | 0.8838 | 0.3768 |
number_children | 0.1734 | 0.0189 | 9.1752 | 0.0000 |
caffeine | 0.4668 | 0.0363 | 12.8475 | 0.0000 |
strength | 0.1923 | 0.0209 | 9.2160 | 0.0000 |
The analysis indicates that all of the tested factors contributes positively for the daily coffee consumption. The more children someone has and the stronger they like coffee, the more they will consume in a day.