-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path07-anova.R
186 lines (161 loc) · 7.26 KB
/
07-anova.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# The following code is taken from the fourth chapter of the online script, which provides more detailed explanations:
# https://imsmwu.github.io/MRDA2020/hypothesis-testing.html#comparing-several-means
#-------------------------------------------------------------------#
#---------------------Install missing packages----------------------#
#-------------------------------------------------------------------#
# At the top of each script this code snippet will make sure that all required packages are installed
## ------------------------------------------------------------------------
req_packages <- c("psych", "plyr", "ggplot2", "reshape2", "PMCMR", "car", "multcomp", "Hmisc", "ggstatsplot", "tidyr")
req_packages <- req_packages[!req_packages %in% installed.packages()]
lapply(req_packages, install.packages)
#-------------------------------------------------------------------#
#---------------------------One-way ANOVA---------------------------#
#-------------------------------------------------------------------#
# Load and inspect the data
## ------------------------------------------------------------------------
hours_abc <- read.table("https://raw.githubusercontent.com/IMSMWU/MRDA2018/master/data/hours_abc.dat",
sep = "\t",
header = TRUE) #read in data
hours_abc$group <- factor(hours_abc$group, levels = c("A","B","C"), labels = c("low", "medium","high")) #convert grouping variable to factor
str(hours_abc) #inspect data
head(hours_abc) #inspect data
# Descriptive statistics
## ------------------------------------------------------------------------
library(psych)
describeBy(hours_abc$hours,hours_abc$group) #inspect data
# Plot data
## ------------------------------------------------------------------------
# plot of means
library(Rmisc)
library(ggplot2)
mean_data <- summarySE(hours_abc, measurevar="hours", groupvars=c("group"))
ggplot(mean_data,aes(x = group, y = hours)) +
geom_bar(position=position_dodge(1), colour="black", fill = "#CCCCCC", stat="identity", width = 0.65) +
geom_errorbar(position=position_dodge(.9), width=.15, aes(ymin=hours-ci, ymax=hours+ci)) +
theme_bw() +
labs(x = "Group", y = "Average number of hours", title = "Average number of hours by group")+
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,color = "#666666"))
# boxplot
ggplot(hours_abc,aes(x = group, y = hours)) +
geom_boxplot() +
geom_jitter(colour="red", alpha = 0.1) +
theme_bw() +
labs(x = "Group", y = "Average number of hours", title = "Average number of hours by group")+
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,color = "#666666"))
# Inspect data in wide format
## ------------------------------------------------------------------------
library(reshape2)
dcast(hours_abc, index ~ group, value.var = "hours")
# Compute grand mean and category means
## ------------------------------------------------------------------------
mean(hours_abc$hours) #grand mean
by(hours_abc$hours,hours_abc$group,mean) #category mean
# Compute total sum of squares
## ------------------------------------------------------------------------
SST <- sum((hours_abc$hours - mean(hours_abc$hours))^2)
SST
# Compute model sum of squares
## ------------------------------------------------------------------------
SSM <- sum(100*(by(hours_abc$hours, hours_abc$group, mean) - mean(hours_abc$hours))^2)
SSM
# Compute residual sum of squares
## ------------------------------------------------------------------------
SSR <- sum((hours_abc$hours - rep(by(hours_abc$hours, hours_abc$group, mean), each = 100))^2)
SSR
# Compute effect strength (eta)
# http://imaging.mrc-cbu.cam.ac.uk/statswiki/FAQ/effectSize
## ------------------------------------------------------------------------
eta <- SSM/SST
eta
# Compute F-ratio
## ------------------------------------------------------------------------
f_ratio <- (SSM/2)/(SSR/297)
f_ratio
# Test if calculated test statistic is larger than critical value
## ------------------------------------------------------------------------
f_crit <- qf(.95, df1 = 2, df2 = 297) #critical value
f_crit
f_ratio > f_crit
# Testing assumptions
## ------------------------------------------------------------------------
# Normality test (optional here, since n > 30)
by(hours_abc$hours, hours_abc$group, shapiro.test)
# Q-Q plots
qqnorm(hours_abc[hours_abc$group=="low",]$hours)
qqline(hours_abc[hours_abc$group=="low",]$hours)
qqnorm(hours_abc[hours_abc$group=="medium",]$hours)
qqline(hours_abc[hours_abc$group=="medium",]$hours)
qqnorm(hours_abc[hours_abc$group=="high",]$hours)
qqline(hours_abc[hours_abc$group=="high",]$hours)
# Run the ANOVA
## ------------------------------------------------------------------------
aov <- aov(hours ~ group, data = hours_abc)
summary(aov)
# Compute eta from output
## ------------------------------------------------------------------------
summary(aov)[[1]]$'Sum Sq'[1]/(summary(aov)[[1]]$'Sum Sq'[1] + summary(aov)[[1]]$'Sum Sq'[2])
# Test residual variance
## ------------------------------------------------------------------------
# Homogeneity of variances
plot(aov,1)
# Normal distribution of residuals (plot)
plot(aov,2)
# Normal distribution of residuals (test)
shapiro.test(resid(aov))
# Homogeneity of variances
## ------------------------------------------------------------------------
library(car)
leveneTest(hours ~ group, data = hours_abc, center = mean)
# Robust ANOVA in case the variances between groups are not equal (which is the case in the example)
## ------------------------------------------------------------------------
oneway.test(hours ~ group, data = hours_abc)
# Post hoc tests
## ------------------------------------------------------------------------
# Bonferroni
pairwise.t.test(hours_abc$hours, hours_abc$group, data = hours_abc, p.adjust.method = "bonferroni")
# Tukey
## ------------------------------------------------------------------------
# derive q
q <- qtukey(0.95, nm = 3, df = 297)
q
# compute hsd
hsd <- q * sqrt(summary(aov)[[1]]$'Mean Sq'[2]/100)
hsd
# compute tukeys using the multcomp package
library(multcomp)
tukeys <- glht(aov, linfct = mcp(group = "Tukey"))
summary(tukeys)
# compute CIs
confint(tukeys)
# plot CIs
plot(tukeys)
# Using the ggstatsplot package
## ------------------------------------------------------------------------
library(ggstatsplot)
ggbetweenstats(
data = hours_abc,
x = group,
y = hours,
plot.type = "box",
pairwise.comparisons = TRUE,
pairwise.annotation = "p.value",
p.adjust.method = "bonferroni",
effsize.type = "partial_eta",
var.equal = FALSE,
mean.plotting = TRUE, # whether mean for each group is to be displayed
mean.ci = TRUE, # whether to display confidence interval for means
mean.label.size = 2.5, # size of the label for mean
type = "parametric", # which type of test is to be run
k = 3, # number of decimal places for statistical results
outlier.label.color = "darkgreen", # changing the color for the text label
title = "Comparison of listening times between groups",
xlab = "Experimental group", # label for the x-axis variable
ylab = "Listening time", # label for the y-axis variable
messages = FALSE,
bf.message = FALSE
)
#save plot (optional)
## ------------------------------------------------------------------------
ggsave("anova.jpg", height = 6, width = 7.5)