-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathAnalysisFunctions.R
220 lines (168 loc) · 9.44 KB
/
AnalysisFunctions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
##########################################################################################
###### Functions for paper performing comparative analysis of aDNA extraction methods
###### For sourcing into Rstudio
###### Written 5/19-27/2017 by M. Nieves-Colon
##########################################################################################
### Load necessary libraries
library(reshape2) # for melting data
library(ggplot2) # for plotting
library(gridExtra) # for plotting multiple images with ggsave
library(RColorBrewer) # for color palette
library(tidyr) # for histogram overlay functions
library(car) # for Levene's test
### FUN 1. Get boxplot grouped by extraction method. Arguments are:
### df = data frame, col = column of interest in string format, y lab text in string format
# Example: get_bp(df, "gc", Average GC content (%)")
get_bp = function(df, col, ylabtext) {
# Subset dataframe by column of interest and melt
sub = df[, c("Site", "Method", "Tissue", col)]
df.m = melt(sub)
print(head(df.m))
# Create ggplot boxplot
ggplot(df.m, aes(x=Method, y=value, fill=Method)) +
geom_boxplot() + scale_fill_brewer(palette="Set1") +
scale_color_brewer(palette="Set1") +
labs(x="Extraction Method", y=ylabtext) +
theme_bw()
}
### FUN 3. Get boxplot facet wrapped by Tissue. Arguments are:
### df = data frame, col = column of interest, y lab text in string format
# Example: get_bp_facet(gc.m, "Average GC content (%)")
get_bp_facet = function(df, col, ylabtext) {
# Subset dataframe by column of interest and melt
sub = df[, c("Site", "Method", "Tissue", col)]
df.m = melt(sub)
print(head(df.m))
# Create ggplot boxplot
ggplot(df.m, aes(x=Method, y=value, fill=Method)) +
geom_boxplot() + scale_fill_brewer(palette="Set1") +
facet_wrap( ~ Tissue) +
scale_color_brewer(palette="Set1") +
labs(x="Extraction Method", y=ylabtext) +
theme_bw()
}
### FUN 4. Check normality assumptions with histogram, qqplot and sw test.
### Arguments are: x = variable column name of interest
### Example: check_norm(test, test$gc_perc, "GC content", "GC content")
check_norm = function(df, x, text, xlabtext) {
diff= x[df$Method=="D"]-x[df$Method=="H"] # Calculate difference of paired values
#print(x[df$Method=="D"])
# Set EPS device to print qqplot and histogram directly to EPS file. Set A4 size.
# Will not print to screen.
setEPS()
postscript(paste0(text,"_normtest.eps"), width = 8.3, height = 11.7) # A4
par(mfrow=c(2,1)) # For multipanel plot
hist(diff,main = paste0("Histogram for difference in ", # Generate histogram
text, " (method D - method H)"),
cex.main = 0.90, col="gray", xlab=xlabtext)
#Generate QQplot of difference between the two methods
qq.diff = qqnorm(diff, xlab="Standardized normal values", ylab="Observed values",
main = paste0("Normal Quantile plot for difference in ",
text, " (method D - method H)"),
cex.main = 0.90) ; qqline(diff, col ="red")
dev.off() # close EPS device
shapiro.test(diff) # Shapiro wilk test for normality print to screen
}
### FUN 5. Run paired T tests for samples across method
### Arguments are: df = dataframe, dataframe$col, colname = name of variable of interest)
### Example: pairedt(test, test$gc_perc, "GC content")
### Function calculates difference of paired values for t-test, then extracts statistics results and prints
### them. It then makes a new dataframe that appends an existing dataframe in environment (called statdf)
### and appends the stat test results as a new row. This new dataframe is then reassigned with same name
### in global environment. This way the results of each stat test are recursively added to the dataframe.
pairedt = function(df, col, colname) {
t = t.test(col[df$Method=="D"], col[df$Method=="H"], paired=TRUE) # test takes difference of paired values
row = c(colname, "paired_t_test", t$statistic, t$parameter, t$p.value)
#print(row)
print(t)
extdf <- rbind(statdf, row, stringsAsFactors=FALSE) # fixes "invalid factor level, NA generated" error
assign("statdf", extdf, .GlobalEnv)
}
### FUN 6. Run paired wilcoxon tests for samples across method. Use same logic as in above paired t function
### Arguments are: df = dataframe, dataframe$col, colname
### Example: wilcox(test, test$gc_perc, "GC content")
wilcox = function(df, col, colname) {
w = wilcox.test(col[df$Method=="D"], col[df$Method=="H"], paired=TRUE, alternative = "two.sided")
row = c(colname, "wilcox_Test", w$statistic, w$parameter, "null", w$p.value)
#print(row)
print(w)
extdf <- rbind(statdf, row, stringsAsFactors=FALSE) # fixes "invalid factor level, NA generated" error
assign("statdf", extdf, .GlobalEnv)
}
#### FUN 9. Per individual barplots
## Arguments: df = data frame, colname = string column name of interest, mult = factor to
## multiply data by in case its percent, ylabtext = string for y axis, .. for matching subset column
## Example: get_barplot(data, "perc_end", 100, "Endogenous content (%)", Tissue=="Petrous portion")
get_barplot = function(df, colname, mult, ylabtext, ...) {
# Subset dataframe by tissue. Must use eval to pass text for subset function
ssubset <- deparse(substitute(...))
x = subset(df, eval(parse(text = ssubset)))
# Subset again so just column of interest is present, subset tissue column for sanity check
sub = x[, c("Sample", "Method", "Tissue", colname)]
#print(head(sub))
# Melt dataset and multiply by desired factor (if none then enter 1)
m = melt(sub)
m$value <- m$value*mult
print(head(m))
bg = ggplot(data=m, aes(x=Sample, y=value, fill=Method)) +
geom_bar(stat="identity", position=position_dodge()) + # no line...
scale_fill_brewer(palette="Set1", name="Extraction\nMethod") +
labs(x="Sample", y=ylabtext) +
theme_bw()
return(bg)
}
#### FUN 10. Read length histograms
## Arguments: samplesvec = vector of sample basenames, sufx = filename suffix,
## tissue = tooth/petrous portion, ylabtext = text for y axis as character.
## Example: get_readhist(samples, ".ds.uniqreads.", "teeth", "read bp")
get_readhist = function(samplesvec, sufx, tissue, ylabtext) {
##### 1. Import data table with read length counts. Numeric columns must not have
##### characters like n/a. Use for loop to import data for multiple samples
##### Import and modify data for all samples at once.
for (i in samplesvec) {
print(paste0("Importing data for sample ", i))
table = read.table(paste0("readlength_hist/",tissue,"/",i, sufx, "hist.txt"), header=F, sep="\t")
table$sample <- rep(i, nrow(table))
print(head(table))
print(paste0("Number of rows in sample ", i, " table: ", nrow(table)))
cat("\n")
if(i==samplesvec[1]){mastertable<-table}else{mastertable<-rbind(mastertable, table)}
}
### 2. Split sample name and extraction method into two columns. Since this also removes the sample
### name delimiter fix and re-add to mastertable.
temptable = mastertable %>% separate(sample, c("samplename1", "samplename2", "extmethod"), "-")
temptable$name <- paste(temptable$samplename1, temptable$samplename2, sep="-")
### 3. Replace original sample name with new name and add in ext method table
mastertable$sample <- temptable$name
mastertable$extmethod <-temptable$extmethod
#print(head(mastertable))
#### 4. Melt data into long format
mastertable_long <- melt(mastertable, id=c("sample", "extmethod"))
#print(head(mastertable_long))
#### 5. Plot overlay histograms with facet wrap
ggplot(mastertable_long, aes(x=value, fill=extmethod)) +
facet_wrap( ~ sample, scales = "free") +
geom_histogram(binwidth=3, alpha=.5, position="identity") +
scale_fill_brewer(palette="Set1", name="Extraction\nMethod") +
labs(x=ylabtext, y="Frequency") + theme_bw()
}
### References
## 1. http://stackoverflow.com/questions/5142842/export-a-graph-to-eps-file-with-r
## 2. http://stackoverflow.com/questions/14184258/putting-multiple-plots-in-a-a4-sheet-by-using-r-codes
## 3. http://christianlemp.com/blog/2014/02/05/How-I-Manage-Data-Projects-with-RStudio-and-Git.html
## 4. https://stackoverflow.com/questions/28370249/correct-way-to-specifiy-optional-arguments-in-r-functions
## 5. https://stackoverflow.com/questions/7310186/function-in-r-passing-a-dataframe-and-a-column-name
## 6. https://stackoverflow.com/questions/10689055/create-an-empty-data-frame
## 7. https://stackoverflow.com/questions/16819956/invalid-factor-level-na-generated
## 8. https://stackoverflow.com/questions/10904124/global-and-local-variables-in-r
## 9. https://stat.ethz.ch/pipermail/r-help/2004-February/046366.html
## 11. https://stackoverflow.com/questions/11880906/pass-subset-argument-through-a-function-to-subset
## 12. https://cran.r-project.org/web/packages/muStat/muStat.pdf
## 13. http://www.dummies.com/programming/r/how-to-split-strings-in-r/
## 14. http://astrostatistics.psu.edu/su07/R/html/base/html/list.files.html
## 15. https://stackoverflow.com/questions/3969852/update-data-frame-via-function-doesnt-work
## 16. https://stackoverflow.com/questions/18028225/r-list-files-with-multiple-conditions
## 17. https://stackoverflow.com/questions/35618260/removing-legend-ggplot-2-2
## 18. http://ggplot2.tidyverse.org/reference/geom_point.html
## 19. https://stackoverflow.com/questions/11846295/how-to-add-different-lines-for-facets
## 20. https://stackoverflow.com/questions/12188509/cleaning-inf-values-from-an-r-dataframe