-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRLE_n_batch_normalization_fpkm_data.R
68 lines (55 loc) · 2.75 KB
/
RLE_n_batch_normalization_fpkm_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
library(dplyr)
library(DESeq2)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(sva)
library(preprocessCore)
library(factoextra)
par(mfrow=c(3,1))
tumor <- sapply(fpkm_gse156451,sum)
cellline <- sapply(fpkm_gse185055,sum)
cetuximab <- sapply(fpkm_gse196576,sum)
hist(tumor,main="fpkm counts tumor",cex=1.2,cex.lab=1.8,xlab="fpkm counts")
hist(cellline,main="fpkm counts cell line",cex=1.2,cex.lab=1.8,xlab="fpkm counts")
hist(cetuximab,main="fpkm counts cetuximab",cex=1.2,cex.lab=1.8,xlab="fpkm counts")
dev.off()
# we need to obtain a common subset of fpkm reads
a <- rownames(fpkm_gse156451)
b <- rownames(fpkm_gse185055)
c <- rownames(fpkm_gse196576)
common_gene_subset <- intersect(intersect(a,b),c)
# subset all dataframes with the common subset of genes
fpkm_gse156451 <- fpkm_gse156451[rownames(fpkm_gse156451)%in%common_gene_subset,]
fpkm_gse185055 <- fpkm_gse185055[rownames(fpkm_gse185055)%in%common_gene_subset,]
fpkm_gse196576 <- fpkm_gse196576[rownames(fpkm_gse196576)%in%common_gene_subset,]
# create a common merged dataframe
merged_fpkm <- cbind(fpkm_gse156451,fpkm_gse185055,fpkm_gse196576)
# FPKM solves within sample variation. For between sample variation,
# we will do median of ratios/RLE in DESEq 2
sample_names <- names(merged_fpkm)
condition <- c(rep("tumor",50),rep("cell_line",50),rep("cetuximab",50))
colData <- as.data.frame(cbind(sample_names,condition))
colData$condition <- as.factor(colData$condition)
dds <- DESeqDataSetFromMatrix(countData = merged_fpkm,colData = colData,design = ~ 1)
plotPCA(merged_fpkm) # plot PCA won't work with ratios
# we cannot use estimatesizefactors from DESeq2 as it doesn't work with ratios, hence
# we will use the quantile function in R
#first calculate quantiles for the samples before normalization
before_quantile <- sapply(merged_fpkm, function(x) quantile(x, probs = seq(0, 1, 1/4)))
# now carry out quantile normalization
qntl_fpkm_merged <- as.data.frame(normalize.quantiles(as.matrix(merged_fpkm)))
# now calculate quantiles after normalization
after_quantile <- sapply(qntl_fpkm_merged, function(x) quantile(x, probs = seq(0, 1, 1/4)))
# apply rownames and columnnames to the quantile normalized dataframe
rownames(qntl_fpkm_merged) <- rownames(merged_fpkm)
colnames(qntl_fpkm_merged) <- colnames(merged_fpkm)
# using ComBat from sva package to correct for batch effects
# create a batch
batch <- c(rep(1,50),rep(2,50),rep(3,50))
# run the ComBat function
batch_corr_qntl_fpkm_merged_fpkm = ComBat(dat=qntl_fpkm_merged,
batch=batch,
mod=NULL,
par.prior=TRUE,
prior.plots=TRUE)
corrected_df <- as.data.frame(batch_corr_qntl_fpkm_merged_fpkm)
# final dataframe for analysis obtained!