We want to study the effect of 3 fertilizers in plant growth (example from wikipedia). So the factor fertilizer has 3 levels or groups.
library(tidyverse)
g1 <- c(6, 8, 4, 5, 3, 4) # growth under fertilizer 1
g2 <- c(8, 12, 9, 11, 6, 8) # growth under fertilizer 2
g3 <- c(13, 9, 11, 8, 7, 12) # growth under fertilizer 3
d1 <- tibble(response = g1, fertilizer ="g1")
d2 <- tibble(response = g2, fertilizer ="g2")
d3 <- tibble(response = g3, fertilizer ="g3")
dat <- d1 %>%
bind_rows(d2) %>%
bind_rows(d3) %>%
mutate(fertilizer = as.factor(fertilizer))
dat
## # A tibble: 18 × 2
## response fertilizer
## <dbl> <fct>
## 1 6 g1
## 2 8 g1
## 3 4 g1
## 4 5 g1
## 5 3 g1
## 6 4 g1
## 7 8 g2
## 8 12 g2
## 9 9 g2
## 10 11 g2
## 11 6 g2
## 12 8 g2
## 13 13 g3
## 14 9 g3
## 15 11 g3
## 16 8 g3
## 17 7 g3
## 18 12 g3
We plot the data
ggplot(dat, aes(x = fertilizer, y = response)) +
geom_point()
mean_all <- dat %>%
pull(response) %>%
mean()
SSb <- dat %>%
group_by(fertilizer) %>%
mutate(mean_g = mean(response)) %>%
mutate(sqr = (mean_g - mean_all)^2) %>%
pull(sqr) %>%
sum()
SSb
## [1] 84
SSw <- dat %>%
group_by(fertilizer) %>%
mutate(mean_g = mean(response)) %>%
mutate(sqr = (response - mean_g)^2) %>%
pull(sqr) %>%
sum()
SSw
## [1] 68
n_groups <- dat %>%
distinct(fertilizer) %>%
count() %>%
pull(n)
dfb <- n_groups - 1
MSb <- SSb / dfb
MSb
## [1] 42
n <- dat %>%
nrow()
dfw <- n - n_groups
MSw <- SSw / dfw
MSw
## [1] 4.533333
f <- MSb / MSw
f
## [1] 9.264706
Given that \(F\) is distributed according to a \(F\) distribution with \(df_b\) and \(df_w\) degrees of freedom
p_value <- pf(f, dfb, dfw, lower.tail = FALSE)
p_value
## [1] 0.002398777
aov
anova_oneway <- aov(response ~ fertilizer, data = dat)
anova_oneway %>% summary()
## Df Sum Sq Mean Sq F value Pr(>F)
## fertilizer 2 84 42.00 9.265 0.0024 **
## Residuals 15 68 4.53
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ezANOVA
library(ez)
ezANOVA(data = dat %>%
rownames_to_column() %>%
mutate(rowname = as.factor(rowname)),
dv = response,
wid = rowname,
between = fertilizer) %>%
pluck("ANOVA")
## Coefficient covariances computed by hccm()
## Effect DFn DFd F p p<.05 ges
## 1 fertilizer 2 15 9.264706 0.002398777 * 0.5526316
t.test
After computing the t-tests, you will need to correct the p.values using some correction (Bonferroni, Holm)
t.test(dat %>%
filter(fertilizer == "g1") %>%
pull(response),
dat %>%
filter(fertilizer == "g2") %>%
pull(response))
##
## Welch Two Sample t-test
##
## data: dat %>% filter(fertilizer == "g1") %>% pull(response) and dat %>% filter(fertilizer == "g2") %>% pull(response)
## t = -3.4641, df = 9.6154, p-value = 0.006444
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.586851 -1.413149
## sample estimates:
## mean of x mean of y
## 5 9
t.test(dat %>%
filter(fertilizer == "g1") %>%
pull(response),
dat %>%
filter(fertilizer == "g3") %>%
pull(response))
##
## Welch Two Sample t-test
##
## data: dat %>% filter(fertilizer == "g1") %>% pull(response) and dat %>% filter(fertilizer == "g3") %>% pull(response)
## t = -4.1286, df = 9.3077, p-value = 0.002387
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7.725865 -2.274135
## sample estimates:
## mean of x mean of y
## 5 10
t.test(dat %>%
filter(fertilizer == "g2") %>%
pull(response),
dat %>%
filter(fertilizer == "g3") %>%
pull(response))
##
## Welch Two Sample t-test
##
## data: dat %>% filter(fertilizer == "g2") %>% pull(response) and dat %>% filter(fertilizer == "g3") %>% pull(response)
## t = -0.75955, df = 9.9412, p-value = 0.4652
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.935836 1.935836
## sample estimates:
## mean of x mean of y
## 9 10
pairwise.t.test
pairwise.t.test(x = dat$response,
g = dat$fertilizer)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: dat$response and dat$fertilizer
##
## g1 g2
## g2 0.011 -
## g3 0.003 0.429
##
## P value adjustment method: holm
If the comparisons are planned, we don’t need corrections for multiple comparisons
pairwise.t.test(x = dat$response,
g = dat$fertilizer,
p.adjust.method = "none")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: dat$response and dat$fertilizer
##
## g1 g2
## g2 0.0053 -
## g3 0.0010 0.4287
##
## P value adjustment method: none
You will see that the p-values are not the same to those obtained
using t.test
. This is because pairwise.t.test
calculates a common SD for all groups. This can be prevented:
pairwise.t.test(x = dat$response,
g = dat$fertilizer,
p.adjust.method = "none",
pool.sd = FALSE)
##
## Pairwise comparisons using t tests with non-pooled SD
##
## data: dat$response and dat$fertilizer
##
## g1 g2
## g2 0.0064 -
## g3 0.0024 0.4652
##
## P value adjustment method: none
Now, the p-values are the same that those obtained using
t.test
.
emmeans
library(emmeans)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
emmeans_oneway <- emmeans(anova_oneway, "fertilizer")
pairs(emmeans_oneway, adjust = "holm")
## contrast estimate SE df t.ratio p.value
## g1 - g2 -4 1.23 15 -3.254 0.0107
## g1 - g3 -5 1.23 15 -4.067 0.0030
## g2 - g3 -1 1.23 15 -0.813 0.4287
##
## P value adjustment: holm method for 3 tests
If you want to run one-tailed tests, you need side
pairs(emmeans_oneway, adjust = "holm", side = "<")
## contrast estimate SE df t.ratio p.value
## g1 - g2 -4 1.23 15 -3.254 0.0053
## g1 - g3 -5 1.23 15 -4.067 0.0015
## g2 - g3 -1 1.23 15 -0.813 0.2143
##
## P value adjustment: holm method for 3 tests
## P values are left-tailed
anova_oneway_residuals <- residuals(anova_oneway)
hist(anova_oneway_residuals)
shapiro.test(anova_oneway_residuals)
##
## Shapiro-Wilk normality test
##
## data: anova_oneway_residuals
## W = 0.92223, p-value = 0.1416
If normalitiy does not hold, you can run a Kruskal-Wallis test
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following objects are masked from 'package:faraway':
##
## logit, vif
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
leveneTest(anova_oneway)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.5085 0.6114
## 15
It is actually running a Brown-Forsythe test as it is using the deviations from the median. It is more robust that the Levene test, which can be calculated like this:
leveneTest(anova_oneway, center = mean)
## Levene's Test for Homogeneity of Variance (center = mean)
## Df F value Pr(>F)
## group 2 0.6 0.5615
## 15
If homocedasticity is violates, the Welch one-way test can be calculated
oneway.test(response ~ fertilizer, data = dat)
##
## One-way analysis of means (not assuming equal variances)
##
## data: response and fertilizer
## F = 9.9391, num df = 2.0000, denom df = 9.8506, p-value = 0.004338
We can run the standard anova also using oneway.test
oneway.test(response ~ fertilizer, data = dat, var.equal = TRUE)
##
## One-way analysis of means
##
## data: response and fertilizer
## F = 9.2647, num df = 2, denom df = 15, p-value = 0.002399