diff --git a/R/pipeline_functions.R b/R/pipeline_functions.R new file mode 100755 index 0000000..c0b7c7c --- /dev/null +++ b/R/pipeline_functions.R @@ -0,0 +1,9065 @@ +#' @import Biobase limma tximport igraph biomaRt openxlsx msigdbr ConsensusClusterPlus kableExtra +#' @importFrom GEOquery getGEO +#' @importFrom RColorBrewer brewer.pal +#' @importFrom plot3D scatter3D +#' @importFrom plotrix draw.ellipse +#' @importFrom impute impute.knn +#' @importFrom umap umap +#' @importFrom rhdf5 H5Fopen H5Fclose +#' @importFrom DESeq2 DESeqDataSetFromTximport DESeq +#' @importFrom ComplexHeatmap Heatmap +#' @importFrom graphics plot +#' @importFrom aricode clustComp +#' @importFrom GSVA gsva +#' @importFrom MCMCglmm MCMCglmm +#' @importFrom arm bayesglm +#' @importFrom reshape melt +#' @importFrom ordinal clm clmm +#' @importFrom rmarkdown render +#' @importFrom Matrix rowSums +#' @importFrom SummarizedExperiment assay +#' @importFrom lme4 lmer +#' @importFrom grDevices col2rgb colorRampPalette dev.off pdf rgb +#' @importFrom graphics abline arrows axis barplot boxplot hist image layout legend lines mtext par points polygon rect segments strheight stripchart strwidth text +#' @importFrom stats IQR aggregate as.dendrogram as.dist cutree density dist fisher.test gaussian glm hclust kmeans ks.test lm median model.matrix na.omit order.dendrogram p.adjust pchisq pnorm prcomp pt qnorm quantile sd +#' @importFrom utils read.delim write.table + +################################ +library(Biobase) ## basic functions for bioconductor +library(GEOquery) ## for samples from GEO +library(limma) ## for data normalization for micro-array +library(DESeq2) ## for data normalization for RNASeq +library(tximport) ## for data import from Salmon/sailfish/kallisto/rsem/stringtie output + +library(RColorBrewer) ## for color scale +library(colorspace) ## for color scale +library(plot3D) ## for 3D plot +library(igraph) ## for network related functions +library(plotrix) ## for draw.ellipse + +library(biomaRt) ## for gene id conversion +library(openxlsx) ## for output into excel +library(impute) ## for impute + +library(msigdbr) ## for msigDB gene sets +library(ComplexHeatmap) ## for complex heatmap +library(umap) ## for umap visualization +library(rhdf5) ## for read in MICA results +library(GSVA) + +library(MCMCglmm) +library(arm) +library(reshape) +library(ordinal) +library(rmarkdown) + +library(aricode) +## +check_para <- function(para_name,envir){ + if(base::exists(para_name,envir=envir)==FALSE){message(sprintf('%s missing !',para_name));return(0)} + if(is.null(base::get(para_name,envir=envir))==TRUE){message(sprintf('%s is NULL !',para_name));return(0)} + return(1) +} +check_option <- function(para_name,option_list,envir){ + if(!base::get(para_name,envir=envir) %in% option_list){ + message(sprintf('Only accept %s set at: %s !',para_name,base::paste(option_list,collapse=';')));return(0) + } + return(1) +} +clean_charVector <- function(x){ + x1 <- names(x) + x <- as.character(x); + x[which(x=='')] <- 'NULL'; + x[which(is.null(x)==TRUE)] <- 'NULL' + x[which(is.na(x)==TRUE)] <- 'NA' + names(x) <- x1 + x +} +## +# +#' Preload database files into R workspace for NetBID2 +#' +#' \code{db.preload} is a pre-processing function for NetBID2. It preloads needed data into R workspace, +#' and saves it locally under db/ directory with specified species name and analysis level. +#' +#' Users need to set the species name (e.g. human, mouse) and +#' analysis level (transcript or gene level). TF list and SIG list are optional, if not specified, list from package data will be used as default. +#' +#' @param use_level character, users can choose "transcript" or "gene". Default is "gene". +#' @param use_spe character, the name of an interested species (e.g. "human", "mouse", "rat"). Default is "human". +#' @param update logical, if TRUE, previous loaded RData will be updated. Default is FALSE. +#' @param TF_list a character vector, the list of TF (Transcription Factor) names. If NULL, the pre-defined list in the package will be used. +#' Default is NULL. +#' @param SIG_list a character vector, the list of SIG (Signaling Factor) names. If NULL, the pre-defined list in the package will be use. +#' Default is NULL. +#' @param input_attr_type character, the type of the TF_list and SIG_list. +#' Details please check biomaRt, \url{https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/biomaRt.html}. +#' If TF_list and SIG_list are not specified, the list in the NetBID2 package will be used. +#' This only support "external_gene_name" and "ensembl_gene_id". +#' Default is "external_gene_name". +#' @param main.dir character, the main directory for NetBID2. +#' If NULL, will be \code{system.file(package = "NetBID2")}. Default is NULL. +#' @param db.dir character, a path for saving the RData. +#' Default is \code{db} directory under the \code{main.dir}, if \code{main.dir} is provided. +#' +#' @return Return TRUE if loading is successful, otherwise return FALSE. Two variables will be loaded into R workspace, \code{tf_sigs} and \code{db_info}. +#' @examples +#' db.preload(use_level='gene',use_spe='human',update=FALSE) +#' +#' \dontrun{ +#' db.preload(use_level='transcript',use_spe='human',update=FALSE) +#' db.preload(use_level='gene',use_spe='mouse',update=FALSE) +#' } +#' @export +db.preload <- function(use_level='transcript',use_spe='human',update = FALSE, + TF_list=NULL,SIG_list=NULL,input_attr_type='external_gene_name', + main.dir=NULL, + db.dir=sprintf("%s/db/",main.dir)){ + all_input_para <- c('use_level','use_spe','update') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('use_level',c('transcript','gene'),envir=environment()), + check_option('update',c(TRUE,FALSE),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + ## load annotation info, including: TF/Sig list, gene info + if(is.null(main.dir)==TRUE){ + main.dir <- system.file(package = "NetBID2") + message(sprintf('main.dir not set, will use package directory: %s',main.dir)) + } + if(is.null(db.dir)==TRUE){ + db.dir <- sprintf("%s/db/",main.dir) + } + message(sprintf('Will use directory %s as the db.dir',db.dir)) + message(sprintf('Your setting for species is %s, with level at %s',use_spe,use_level)) + use_spe <- toupper(use_spe) + output.db.dir <- sprintf('%s/%s',db.dir,use_spe) + RData.file <- sprintf('%s/%s_%s.RData', output.db.dir,use_spe,use_level) + if (!file.exists(RData.file) | update==TRUE) { ## not exist or need to update + ## get info from use_spe + ensembl <- biomaRt::useMart("ensembl") + all_ds <- biomaRt::listDatasets(ensembl) + w1 <- grep(sprintf("^%s GENES",use_spe),toupper(all_ds$description)) + if(base::length(w1)==0){ + tmp_use_spe <- unlist(strsplit(use_spe,' ')); tmp_use_spe <- tmp_use_spe[base::length(tmp_use_spe)] + w1 <- grep(sprintf(".*%s_GENE_ENSEMBL",toupper(tmp_use_spe)),toupper(all_ds$dataset)) + if(base::length(w1)==1){use_spe <- toupper(strsplit(all_ds[w1,2],' ')[[1]][1]); output.db.dir <- sprintf('%s/%s',db.dir,use_spe)} + } + if(base::length(w1)==1){ + w2 <- all_ds[w1,1] + mart <- biomaRt::useMart(biomart="ensembl", dataset=w2) ## get id for input spe + message(sprintf('Read in ensembl annotation file for %s and output all db files in %s/%s !',use_spe,db.dir,use_spe)) + } + if(base::length(w1)==0){ + message(sprintf('Check input use_spe parameter: %s, not included in the ensembl database',use_spe)) + return(FALSE) + } + if(base::length(w1)>1){ + w2 <- base::paste(all_ds[w1,2],collapse=';') + message(sprintf('Check input use_spe parameter: %s, more than one species match in ensembl database : %s, + please check and re-try',use_spe,w2)) + return(FALSE) + } + } + RData.file <- sprintf('%s/%s_%s.RData', output.db.dir,use_spe,use_level) + if (update == TRUE | !file.exists(RData.file)) { + if(!file.exists(output.db.dir)){ + dir.create(output.db.dir) + } + ## get attributes for mart + filters <- biomaRt::listFilters(mart) + attributes <- biomaRt::listAttributes(mart) + ensembl.attr.transcript <- c('ensembl_transcript_id','ensembl_gene_id', + 'external_transcript_name','external_gene_name', + 'gene_biotype','gene_biotype', + 'chromosome_name','strand','start_position','end_position','band','transcript_start','transcript_end', + 'description','phenotype_description','refseq_mrna') + ensembl.attr.gene <- c('ensembl_gene_id','external_gene_name', + 'gene_biotype', + 'chromosome_name','strand','start_position','end_position','band', + 'description','phenotype_description','refseq_mrna') + if(use_spe=='HUMAN'){ + ensembl.attr.transcript <- c(ensembl.attr.transcript,'hgnc_symbol','entrezgene') + ensembl.attr.gene <- c(ensembl.attr.gene,'hgnc_symbol','entrezgene') + } + ## do not output hgnc in non-human species + if(use_spe != 'HUMAN') + ensembl.attr.transcript <- base::setdiff(ensembl.attr.transcript,'hgnc_symbol') + if(use_spe != 'HUMAN') + ensembl.attr.gene <- base::setdiff(ensembl.attr.gene,'hgnc_symbol') + ## if too much: Query ERROR: caught BioMart::Exception::Usage: Too many attributes selected for External References + # judge input type if TF_list, Sig_list not equal to NULL + if(is.null(TF_list)==FALSE | is.null(SIG_list)==FALSE){ + if(!input_attr_type %in% filters$name){ + message(sprintf('%s not in the filter name, please retry !',input_attr_type));return(FALSE) + } + if(!input_attr_type %in% ensembl.attr.transcript) + ensembl.attr.transcript <- c(ensembl.attr.transcript,input_attr_type) + if(!input_attr_type %in% ensembl.attr.gene) + ensembl.attr.gene <- c(ensembl.attr.gene,input_attr_type) + } + ## get TF/SIG list and output to output.db.dir, if not defined by user, will use in db/ (human) + # for TF list + filter_attr <- input_attr_type + if(is.null(TF_list)){ ## use TF.txt in db/ + TF_f <- sprintf('%s/%s_TF_%s.txt',db.dir,use_spe,filter_attr) + message(sprintf('Will use %s file as the input TF_list!',TF_f)) + TF_list <- read.delim(TF_f,stringsAsFactors=FALSE,header=F)$V1 + if(use_spe != 'HUMAN' & use_spe != 'MOUSE'){ ## if spe not human/mouse + filter_attr <- 'external_gene_name' + tmp1 <- biomaRt::getBM(attributes=c('hsapiens_homolog_associated_gene_name','external_gene_name'),values=TRUE,mart=mart,filters='with_hsapiens_homolog') + TF_list <- base::unique(tmp1[which(tmp1[,1] %in% TF_list),2]) + } + } + if(use_level=='transcript'){ + message(sprintf('Begin read TF list information from ensembl for %s !',use_spe)) + TF_info <- biomaRt::getBM(attributes = ensembl.attr.transcript,values=TF_list, mart=mart, filters=filter_attr) + } + if(use_level=='gene'){ + message(sprintf('Begin read TF list information from ensembl for %s !',use_spe)) + TF_info <- biomaRt::getBM(attributes = ensembl.attr.gene,values=TF_list, mart=mart, filters=filter_attr) + } + # for SIG list + if(is.null(SIG_list)){ + SIG_f <- sprintf('%s/%s_SIG_%s.txt',db.dir,use_spe,filter_attr) + message(sprintf('Will use %s file as the input SIG_list!',SIG_f)) + SIG_list <- read.delim(SIG_f,stringsAsFactors=FALSE,header=F)$V1 + if(use_spe != 'HUMAN' & use_spe != 'MOUSE'){ ## if spe not human/mouse + filter_attr <- 'external_gene_name' + tmp1 <- biomaRt::getBM(attributes=c('hsapiens_homolog_associated_gene_name','external_gene_name'),values=TRUE,mart=mart,filters='with_hsapiens_homolog') + SIG_list <- base::unique(tmp1[which(tmp1[,1] %in% SIG_list),2]) + } + } + if(use_level=='transcript'){ + message(sprintf('Begin read SIG list information from ensembl for %s !',use_spe)) + SIG_info <- biomaRt::getBM(attributes = ensembl.attr.transcript,values=SIG_list, mart=mart, filters=filter_attr) + } + if(use_level=='gene'){ + message(sprintf('Begin read SIG list information from ensembl for %s !',use_spe)) + SIG_info <- biomaRt::getBM(attributes = ensembl.attr.gene,values=SIG_list, mart=mart, filters=filter_attr) + } + # check input not in the list + miss_TF <- base::unique(base::setdiff(TF_list,TF_info[[filter_attr]])) + miss_SIG <- base::unique(base::setdiff(SIG_list,SIG_info[[filter_attr]])) + if(base::length(miss_TF)>0){message(sprintf("%d TFs could not match,please check and choose to re-try : %s", + base::length(miss_TF),base::paste(sort(miss_TF),collapse=';')))} + if(base::length(miss_SIG)>0){message(sprintf("%d SIGs could not match,please check and choose to re-try : %s", + base::length(miss_SIG),base::paste(sort(miss_SIG),collapse=';')))} + ####### output full info + tf_sigs <- list();tf_sigs$tf <- list();tf_sigs$sig <- list(); + tf_sigs$tf$info <- TF_info; tf_sigs$sig$info <- SIG_info; + for(each_id_type in base::intersect(c('ensembl_transcript_id','ensembl_gene_id', + 'external_transcript_name','external_gene_name','hgnc_symbol', + 'entrezgene','refseq_mrna'),colnames(TF_info))){ + tf_sigs$tf[[each_id_type]] <- base::setdiff(base::unique(TF_info[[each_id_type]]),"") + tf_sigs$sig[[each_id_type]] <- base::setdiff(base::unique(SIG_info[[each_id_type]]),"") + } + db_info <- all_ds[w1,] + save(tf_sigs,db_info=db_info,file = RData.file) + } + load(RData.file,.GlobalEnv) + return(TRUE) + } + +#' Get Transcription Factor (TF) and Signaling Factor (SIG) List +#' +#' \code{get.TF_SIG.list} is a function converts gene ID into the corresponding TF/SIG list, +#' with selected gene/transcript type. +#' +#' @param use_genes a vector of characters, genes will be used in the network construction. +#' If NULL, no filter will be performed to the TF/SIG list. Default is NULL. +#' @param use_gene_type character, the attribute name inherited from the biomaRt package. +#' Some options are, "ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id", "ensembl_transcript_id_version" and "refseq_mrna". +#' All options can be accessed by calling \code{biomaRt::useMart} (e.g. mart <- biomaRt::useMart('ensembl',db_info[1]); biomaRt::listAttributes(mart)$name). +#' +#' The type must match the gene type from the input \code{use_genes}. Default is "external_gene_name". +#' @param ignore_version logical, if TRUE, the version "ensembl_gene_id_version" or "ensembl_transcript_id_version" will be ignored. +#' Default is FALSE. +#' @param dataset character, the dataset used for ID conversion (e.g. "hsapiens_gene_ensembl"). +#' If NULL, use \code{db_info[1]} from \code{db.preload}. Default is NULL. +#' +#' +#' @return Return a list containing two elements. \code{tf} is the TF list, \code{sig} is the SIG list. +#' +#' @examples +#' db.preload(use_level='transcript',use_spe='human',update=FALSE) +#' use_genes <- c("ENST00000210187","ENST00000216083","ENST00000216127", +#' "ENST00000216416","ENST00000217233","ENST00000221418", +#' "ENST00000504956","ENST00000507468") +#' res_list <- get.TF_SIG.list(use_gene_type = 'ensembl_transcript_id', +#' use_genes=use_genes, +#' dataset='hsapiens_gene_ensembl') +#' print(res_list) +#' +#' \dontrun{ +#' } +#' +#' @export +get.TF_SIG.list <- function(use_genes=NULL, + use_gene_type='external_gene_name',ignore_version=FALSE, + dataset=NULL){ + # + all_input_para <- c('use_genes') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('ignore_version',c(TRUE,FALSE),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(dataset)==TRUE){ + check_res <- check_para('db_info',envir=environment()) + if(base::min(check_res)==0){message('Please use db.preload() to get db_info or set dataset, check and re-try!');return(FALSE)} + dataset <- db_info[1] + } + if(is.null(tf_sigs)==TRUE){ + message('tf_sigs not loaded yet, please run db.preload() before processing !');return(FALSE); + } + n1 <- names(tf_sigs$tf)[-1] + if(use_gene_type %in% n1){ + if(is.null(use_genes)==TRUE){ + TF_list <- base::unique(tf_sigs$tf[[use_gene_type]]) + SIG_list <- base::unique(tf_sigs$sig[[use_gene_type]]) + }else{ + TF_list <- base::unique(base::intersect(use_genes,tf_sigs$tf[[use_gene_type]])) + SIG_list <- base::unique(base::intersect(use_genes,tf_sigs$sig[[use_gene_type]])) + } + }else{ + if(grepl('version$',use_gene_type)==TRUE & ignore_version==TRUE){ + use_genes_no_v <- gsub('(.*)\\..*','\\1',use_genes) + transfer_tab <- data.frame(to_type=use_genes,from_type=use_genes_no_v,stringsAsFactors = FALSE) + print(str(transfer_tab)) + TF_list <- transfer_tab[which(transfer_tab$from_type %in% tf_sigs$tf[[gsub('(.*)_version','\\1',use_gene_type)]]),'to_type'] + SIG_list <- transfer_tab[which(transfer_tab$from_type %in% tf_sigs$sig[[gsub('(.*)_version','\\1',use_gene_type)]]),'to_type'] + }else{ + mart <- biomaRt::useMart(biomart="ensembl", dataset=dataset) ## get mart for id conversion !!!! db_info is saved in db RData + #filters <- biomaRt::listFilters(mart) + attributes <- biomaRt::listAttributes(mart) + if(!use_gene_type %in% attributes$name){ + message(sprintf('%s not in the attributes for %s, please check and re-try !',use_gene_type,dataset));return(FALSE) + } + transfer_tab <- get_IDtransfer(from_type=use_gene_type,to_type=n1[1],ignore_version = ignore_version) + print(str(transfer_tab)) + TF_list <- get_name_transfertab(use_genes=tf_sigs$tf[n1[1]][[1]],transfer_tab = transfer_tab,ignore_version = ignore_version,ignore_order=TRUE,from_type=n1[1],to_type=use_gene_type) + SIG_list <- get_name_transfertab(use_genes=tf_sigs$sig[n1[1]][[1]],transfer_tab = transfer_tab,ignore_version = ignore_version,ignore_order=TRUE,from_type=n1[1],to_type=use_gene_type) + } + TF_list <- base::unique(base::intersect(use_genes,TF_list)) + SIG_list <- base::unique(base::intersect(use_genes,SIG_list)) + } + message(sprintf('%d TFs and %s SIGs are included in the expression matrix !',base::length(TF_list),base::length(SIG_list))) + return(list(tf=TF_list,sig=SIG_list)) +} + +#' Creates Data Frame for ID Conversion +#' +#' \code{get_IDtransfer} creates a data frame for ID conversion using biomaRt. For example, to convert Ensembl ID into gene symbol. +#' +#' @param from_type character, the attribute name match the current ID type (the type of \code{use_genes}). +#' Such as "ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id", "ensembl_transcript_id_version" or "refseq_mrna". +#' The "attribute" is inherited from the biomaRt package. For details, user can call \code{biomaRt::listAttributes()} to display all available attributes in the selected dataset. +#' @param to_type character, the attribute name to convert into. +#' @param add_type character, the additional attribute name to add into the conversion data frame. +#' @param use_genes a vector of characters, the genes for ID conversion. +#' If NULL, all genes will be selected. +#' @param dataset character, name of the dataset used for ID conversion. For example, "hsapiens_gene_ensembl". +#' If NULL, \code{db_info[1]} will be used. \code{db_info} requires the calling of \code{db.preload} in the previous steps. +#' Default is NULL. +#' @param ignore_version logical, if it is set to TRUE and \code{from_type} is "ensembl_gene_id_version" or "ensembl_transcript_id_version", +#' the version of the original ID will be ignored in ID mapping. +#' Default is FALSE. +#' +#' @return +#' Return a data frame for ID conversion. +#' +#' @examples +#' use_genes <- c("ENST00000210187","ENST00000216083","ENST00000216127", +#' "ENST00000216416","ENST00000217233","ENST00000221418") +#' transfer_tab <- get_IDtransfer(from_type = 'ensembl_transcript_id', +#' to_type='external_gene_name', +#' use_genes=use_genes, +#' dataset='hsapiens_gene_ensembl') +#' ## get transfer table !!! +#' res1 <- get_name_transfertab(use_genes,transfer_tab=transfer_tab) +#' transfer_tab_withtype <- get_IDtransfer2symbol2type(from_type = 'ensembl_transcript_id', +#' use_genes=use_genes, +#' dataset='hsapiens_gene_ensembl') +#' ## get transfer table !!! +#' \dontrun{ +#' } +#' @export +get_IDtransfer <- function(from_type=NULL,to_type=NULL,add_type=NULL,use_genes=NULL,dataset=NULL,ignore_version=FALSE){ + # + all_input_para <- c('from_type','to_type') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('ignore_version',c(TRUE,FALSE),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(dataset)==TRUE){ + check_res <- check_para('db_info',envir=environment()) + if(base::min(check_res)==0){message('Please use db.preload() to get db_info or set dataset, check and re-try!');return(FALSE)} + dataset <- db_info[1] + } + mart <- biomaRt::useMart(biomart="ensembl", dataset=dataset) ## get mart for id conversion !!!! db_info is saved in db RData + attributes <- biomaRt::listAttributes(mart) + + if(!from_type %in% attributes$name){ + message(sprintf('%s not in the attributes for %s, please check and re-try !',from_type,dataset));return(FALSE) + } + if(!to_type %in% attributes$name){ + message(sprintf('%s not in the attributes for %s, please check and re-try !',to_type,dataset));return(FALSE) + } + ori_from_type <- from_type + ori_use_genes <- use_genes + if(from_type %in% c('ensembl_gene_id_version','ensembl_transcript_id_version')){ + if(ignore_version==FALSE){ + message(sprintf('Attention: %s in %s will be updated with new version number, please check the output. + If lots of missing, try to set ignore_version=TRUE and try again !',from_type,dataset)); + }else{ + from_type <- gsub('(.*)_version','\\1',from_type) + if(is.null(use_genes)==FALSE) use_genes <- gsub('(.*)\\..*','\\1',use_genes) + } + } + if(is.null(use_genes)==TRUE | base::length(use_genes)>100){ + tmp1 <- biomaRt::getBM(attributes=c(from_type,to_type,add_type),values=1,mart=mart,filters='strand') + tmp2 <- biomaRt::getBM(attributes=c(from_type,to_type,add_type),values=-1,mart=mart,filters='strand') + tmp1 <- base::rbind(tmp1,tmp2) + if(is.null(use_genes)==FALSE){ + tmp1 <- tmp1[which(tmp1[,1] %in% use_genes),] + } + }else{ + tmp1 <- biomaRt::getBM(attributes=c(from_type,to_type,add_type),values=use_genes,mart=mart,filters=from_type) + } + if(ori_from_type %in% c('ensembl_gene_id_version','ensembl_transcript_id_version') & is.null(use_genes)==FALSE & ignore_version==TRUE){ + tmp2 <- data.frame(ori_from_type=ori_use_genes,from_type=use_genes,stringsAsFactors=FALSE) + names(tmp2) <- c(ori_from_type,from_type) + tmp1 <- base::merge(tmp2,tmp1,by.y=from_type,by.x=from_type)[c(from_type,to_type,add_type,ori_from_type)] + } + w1 <- apply(tmp1,1,function(x)base::length(which(is.na(x)==TRUE | x==""))) + transfer_tab <- tmp1[which(w1==0),] + for(i in 1:ncol(transfer_tab)){ + transfer_tab[,i] <- as.character(transfer_tab[,i]) + } + return(transfer_tab) +} + +#' Create Data Frame for ID Conversion Between Species +#' +#' \code{get_IDtransfer_betweenSpecies} creates a data frame to convert ID between species. +#' +#' @param from_spe character, name of the original species (e.g. "human", "mouse", "rat") that \code{use_genes} belongs to. Default is "human". +#' @param to_spe character, name of the target species (e.g. "human", "mouse", "rat"). Default is "mouse". +#' @param from_type character, the attribute name match the current ID type (the type of \code{use_genes}). +#' Such as "ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id", "ensembl_transcript_id_version" or "refseq_mrna". +#' The "attribute" is inherited from the \code{biomaRt} package. For details, user can call \code{biomaRt::listAttributes()} function to display all available attributes in the selected dataset. +#' @param to_type character, the attribute name match the target ID type. +#' @param use_genes a vector of characters, the genes for ID conversion. Must be the genes with ID type of \code{from_type}, and from species \code{from_spe}. +#' If NULL, all the possible genes will be shown in the conversion table. Default is NULL. +#' +#' @return Return a data frame for ID conversion, from one species to another. +#' +#' @examples +#' use_genes <- c("ENST00000210187","ENST00000216083","ENST00000216127", +#' "ENST00000216416","ENST00000217233","ENST00000221418") +#' transfer_tab <- get_IDtransfer_betweenSpecies(from_spe='human', +#' to_spe='mouse', +#' from_type = 'ensembl_transcript_id', +#' to_type='external_gene_name', +#' use_genes=use_genes) +#' ## get transfer table !!! +#' transfer_tab <- get_IDtransfer_betweenSpecies(from_spe='human', +#' to_spe='mouse', +#' from_type = 'ensembl_transcript_id', +#' to_type='ensembl_transcript_id_version', +#' use_genes=use_genes) +#' ## get transfer table !!! +#' \dontrun{ +#' transfer_tab <- get_IDtransfer_betweenSpecies(from_spe='human', +#' to_spe='mouse', +#' from_type='refseq_mrna', +#' to_type='refseq_mrna') +#' } +#' @export +get_IDtransfer_betweenSpecies <- function(from_spe='human',to_spe='mouse', + from_type=NULL,to_type=NULL, + use_genes=NULL){ + # + all_input_para <- c('from_spe','to_spe','from_type','to_type') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + from_spe <- toupper(from_spe) + to_spe <- toupper(to_spe) + ensembl <- biomaRt::useMart("ensembl") + all_ds <- biomaRt::listDatasets(ensembl) + w1 <- grep(sprintf("^%s GENES",from_spe),toupper(all_ds$description)) + if(base::length(w1)==1){ + from_spe_ds <- all_ds[w1,1] + mart1 <- biomaRt::useMart(biomart="ensembl", dataset=from_spe_ds) ## get id for input spe + } + if(base::length(w1)==0){ + message(sprintf('Check input from_spe parameter: %s, not included in the ensembl database',from_spe)) + return(FALSE) + } + if(base::length(w1)>1){ + w2 <- base::paste(all_ds[w1,2],collapse=';') + message(sprintf('Check input from_spe parameter: %s, more than one species match in ensembl database : %s, + please check and re-try',from_spe,w2)) + return(FALSE) + } + w1 <- grep(sprintf("^%s GENES",to_spe),toupper(all_ds$description)) + if(base::length(w1)==1){ + to_spe_ds <- all_ds[w1,1] + mart2 <- biomaRt::useMart(biomart="ensembl", dataset=to_spe_ds) ## get id for input spe + } + if(base::length(w1)==0){ + message(sprintf('Check input to_spe parameter: %s, not included in the ensembl database',to_spe)) + return(FALSE) + } + if(base::length(w1)>1){ + w2 <- base::paste(all_ds[w1,2],collapse=';') + message(sprintf('Check input to_spe parameter: %s, more than one species match in ensembl database : %s, + please check and re-try',to_spe,w2)) + return(FALSE) + } + #### mart1 mart2 + attributes <- biomaRt::listAttributes(mart1) + if(!from_type %in% attributes$name){ + message(sprintf('%s not in the attributes for %s, please check and re-try !',from_type,from_spe));return(FALSE) + } + attributes <- biomaRt::listAttributes(mart2) + if(!to_type %in% attributes$name){ + message(sprintf('%s not in the attributes for %s, please check and re-try !',to_type,to_spe));return(FALSE) + } + ## get homolog between from_spe to to_spe + cn1 <- gsub('(.*)_gene_ensembl','\\1',from_spe_ds) + cn2 <- attributes$name ## attribute names in mart2 + cn3 <- cn2[grep(sprintf('%s_homolog_associated_gene_name',cn1),cn2)] + if(base::length(cn3)!=1){ + message('No homolog info found in Biomart, sorry !');return(FALSE) + } + tmp1 <- get_IDtransfer(from_type=from_type,to_type='external_gene_name',use_genes=use_genes,dataset=from_spe_ds) + tmp2 <- biomaRt::getBM(attributes=c(cn3,'external_gene_name'),values=TRUE, + mart=mart2,filters=sprintf('with_%s_homolog',cn1)) + colnames(tmp1) <- sprintf('%s_%s',colnames(tmp1),from_spe) + colnames(tmp2) <- sprintf('%s_%s',colnames(tmp2),to_spe) + tmp3 <- base::merge(tmp1,tmp2,by.x=sprintf('external_gene_name_%s',from_spe),by.y=sprintf('%s_%s',cn3,to_spe)) + transfer_tab <- tmp3[,c(2,3,1)] + if(to_type != 'external_gene_name'){ + tmp4 <- get_IDtransfer(from_type='external_gene_name',to_type=to_type,use_genes=tmp3[,3],dataset=to_spe_ds) + colnames(tmp4) <- sprintf('%s_%s',colnames(tmp4),to_spe) + tmp5 <- base::merge(tmp3,tmp4) + transfer_tab <- tmp5[,c(3,4,2,1)] + } + return(transfer_tab) +} + + +#' Create Data Frame for ID Conversion With Biotype Information +#' +#' \code{get_IDtransfer2symbol2type} creates a data frame to convert original ID into gene symbol and gene biotype (gene level), +#' or into transcript symbol and transcript biotype (transcript level). +#' +#' @param from_type character, the attribute name matches the current ID type (the type of use_genes). +#' Such as "ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id", "ensembl_transcript_id_version" or "refseq_mrna". +#' The "attribute" is inherited from the biomaRt package. +#' For details, user can call \code{biomaRt::listAttributes()} function to display all available attributes in the selected dataset. +#' @param use_genes a vector of characters, the genes for ID conversion. +#' If NULL, all genes will be selected. +#' @param dataset character, name of the dataset used for ID conversion. +#' For example, "hsapiens_gene_ensembl". +#' If NULL, \code{db_info[1]} will be used. \code{db_info} requires the calling of \code{db.preload} in the previous steps. +#' Default is NULL. +#' @param use_level character, users can chose between "transcript" and "gene". Default is "gene". +#' @param ignore_version logical, if it is set to TRUE and \code{from_type} is "ensembl_gene_id_version" or "ensembl_transcript_id_version", +#' the version of the original ID will be ignored in ID mapping. +#' +#' @return +#' Return a data frame for ID conversion, from ID to gene symbol and gene biotype. +#' +#' @examples +#' use_genes <- c("ENST00000210187","ENST00000216083","ENST00000216127", +#' "ENST00000216416","ENST00000217233","ENST00000221418") +#' transfer_tab <- get_IDtransfer(from_type = 'ensembl_transcript_id', +#' to_type='external_gene_name',use_genes=use_genes, +#' dataset='hsapiens_gene_ensembl') +#' ## get transfer table !!! +#' res1 <- get_name_transfertab(use_genes,transfer_tab=transfer_tab) +#' transfer_tab_withtype <- get_IDtransfer2symbol2type(from_type = 'ensembl_transcript_id', +#' use_genes=use_genes, +#' dataset='hsapiens_gene_ensembl', +#' use_level='transcript') +#' ## get transfer table !!! +#' \dontrun{ +#' } +#' @export +get_IDtransfer2symbol2type <- function(from_type=NULL,use_genes=NULL,dataset=NULL,use_level='gene',ignore_version=FALSE){ + # + all_input_para <- c('from_type') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('ignore_version',c(TRUE,FALSE),envir=environment()), + check_option('use_level',c('gene','transcript'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(dataset)==TRUE){ + check_res <- check_para('db_info',envir=environment()) + if(base::min(check_res)==0){message('Please use db.preload() to get db_info or set dataset, check and re-try!');return(FALSE)} + dataset <- db_info[1] + } + message(sprintf('Your setting is at %s level',use_level)) + mart <- biomaRt::useMart(biomart="ensembl", dataset=dataset) ## get mart for id conversion !!!! db_info is saved in db RData + attributes <- biomaRt::listAttributes(mart) + if(!from_type %in% attributes$name){ + message(sprintf('%s not in the attributes for %s, please check and re-try !',from_type,dataset));return(FALSE) + } + if(use_level=='gene') tmp1 <- get_IDtransfer(from_type=from_type,to_type='external_gene_name',add_type='gene_biotype',use_genes=use_genes,dataset=dataset,ignore_version=ignore_version) + if(use_level=='transcript') tmp1 <- get_IDtransfer(from_type=from_type,to_type='external_transcript_name',add_type='transcript_biotype',use_genes=use_genes,dataset=dataset,ignore_version=ignore_version) + transfer_tab <- tmp1 + return(transfer_tab) +} + +#' Convert Original Gene ID into Target Gene ID +#' +#' \code{get_name_transfertab} converts the original gene IDs into target gene IDs, with conversion table provided. +#' +#' @param use_genes a vector of characters, the genes for ID conversion. +#' @param transfer_tab data.frame, the conversion table. Users can create it by calling \code{get_IDtransfer}. +#' @param from_type character, the attribute name match the current ID type (the type of use_genes). +#' Such as "ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id", "ensembl_transcript_id_version" or "refseq_mrna". +#' The "attribute" is inherited from the biomaRt package. For details, user can call \code{biomaRt::listAttributes()} to see all available attributes in the selected dataset. +#' If NULL, will use the first column of \code{transfer_tab}. +#' @param to_type character, the attribute name to convert into. If NULL, will use the second column of \code{transfer_tab}. +#' @param ignore_version logical, if TRUE and \code{from_type} is "ensembl_gene_id_version" or "ensembl_transcript_id_version", the version will be ignored. Default is FALSE. +#' @param ignore_order logical, whether need to ignore the output order to match the input list of \code{use_genes}. Default is FALSE. +#' @return Return a vector of converted gene IDs. +#' +#' @examples +#' use_genes <- c("ENST00000210187","ENST00000216083","ENST00000216127", +#' "ENST00000216416","ENST00000217233","ENST00000221418") +#' transfer_tab <- get_IDtransfer(from_type = 'ensembl_transcript_id', +#' to_type='external_gene_name',use_genes=use_genes, +#' dataset='hsapiens_gene_ensembl') +#' ## get transfer table !!! +#' res1 <- get_name_transfertab(use_genes=use_genes,transfer_tab=transfer_tab) +#' transfer_tab_withtype <- get_IDtransfer2symbol2type(from_type = 'ensembl_transcript_id', +#' use_genes=use_genes, +#' dataset='hsapiens_gene_ensembl') +#' ## get transfer table !!! +#' \dontrun{ +#' } +#' @export +get_name_transfertab <- function(use_genes=NULL,transfer_tab=NULL,from_type=NULL,to_type=NULL,ignore_version=FALSE,ignore_order=FALSE){ + # + all_input_para <- c('use_genes','transfer_tab') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('ignore_version',c(TRUE,FALSE),envir=environment()), + check_option('ignore_order',c(TRUE,FALSE),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(from_type)==TRUE){from_type=colnames(transfer_tab)[1];} + if(is.null(to_type)==TRUE){to_type=colnames(transfer_tab)[2];} + if(ignore_version==TRUE){ + w1 <- which(colnames(transfer_tab)==from_type) + transfer_tab[,w1] <- gsub('(.*)\\..*','\\1',transfer_tab[,w1]) + from_type <- gsub('(.*)_version','\\1',from_type) + colnames(transfer_tab)[w1] <- from_type + use_genes <- gsub('(.*)\\..*','\\1',use_genes) + } + transfer_tab <- base::unique(transfer_tab[,c(from_type,to_type)]) + x <- use_genes; + t1 <- base::unique(transfer_tab[which(transfer_tab[,from_type] %in% x),]) + c1 <- base::unique(t1[,from_type]) + if(base::length(c1) 1) + idx <- grep(GPL, attr(eset, "names")) + else + idx <- 1 + eset <- eset[[idx]] + if(GPL!=annotation(eset)) {GPL <- annotation(eset); message(sprintf('Real GPL:%s',GPL))} + expRData_dir <- sprintf('%s/%s_%s.RData', out.dir, GSE,GPL) + save(eset, file = expRData_dir) + message(sprintf('RData for the eset is saved in %s .',expRData_dir)) + } + return(eset) +} + +#' Load Gene Expression Set from Salmon Output (demo version) +#' +#' \code{load.exp.RNASeq.demoSalmon} is a function to read in Salmon results and convert it to eSet/DESeqDataSet class object. +#' +#' This function helps users to read in results created by Salmon. +#' Due to the complicated manipulations (e.g. reference sequence) in processing Salmon, this demo function may not be suitable for all scenarios. +#' +#' @param salmon_dir character, the directory to save the results created by Salmon. +#' @param tx2gene data.frame or NULL, this parameter will be passed to \code{tximport}. For details, please check \code{tximport}. +#' If NULL, will read in one of the transcript names from Salmon's results. Note, it works when using e.g. "gencode.v29.transcripts.fa" from GENCODE as reference. +#' @param use_phenotype_info data.frame, the data frame contains phenotype information. It must have the columns \code{use_sample_col} and \code{use_design_col}. +#' @param use_sample_col character, the column name, indicating which column in \code{use_phenotype_info} should be used as the sample name. +#' @param use_design_col character, the column name, indicating which column in \code{use_phenotype_info} should be used as the design feature for samples. +#' @param return_type character, the class of the return object. +#' "txi" is the output of tximport. It is a list containing three matrices, abundance, counts and length. +#' "counts" is the matrix of raw count. +#' "tpm" is the raw tpm. +#' "fpm", "cpm" is the fragments/counts per million mapped fragments (fpm/cpm). +#' "raw-dds" is the DESeqDataSet class object, which is the original one without processing. +#' "dds" is the DESeqDataSet class object, which is processed by DESeq. +#' "eset" is the ExpressionSet class object, which is processed by DESeq and vst. +#' Default is "tpm". +#' @param merge_level character, users can choose between "gene" and "transcript". +#' "gene", the original salmon results will be mapped to the transcriptome and the expression matrix will be merged to the gene level. +#' This only works when using e.g. "gencode.vXX.transcripts.fa" from GENCODE as the reference. +#' @export +load.exp.RNASeq.demoSalmon <- function(salmon_dir = NULL,tx2gene=NULL, + use_phenotype_info = NULL, + use_sample_col=NULL, + use_design_col=NULL, + return_type='tpm', + merge_level='gene') { + # + all_input_para <- c('salmon_dir','use_phenotype_info','use_sample_col','use_design_col','return_type','merge_level') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('return_type',c('txi','counts','tpm','fpm','cpm','raw-dds','dds','eset'),envir=environment()), + check_option('merge_level',c('gene','transcript'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + files <- file.path(salmon_dir, list.files(salmon_dir), "quant.sf") + if(!grepl('_salmon',use_phenotype_info[1,use_sample_col])) sample_name <- gsub('(.*)_salmon', '\\1', list.files(salmon_dir)) ## if no _salmon is also ok + names(files) <- sample_name + w1 <- base::length(files) + message(sprintf('%d %s/quant.sf found !',w1,salmon_dir)) + if(is.null(tx2gene)){ + gene_info <- read.delim(file = files[1], stringsAsFactors = FALSE)[, 1] + gen1 <- sapply(gene_info, function(x)unlist(strsplit(x, '\\|'))) + gen1 <- t(gen1) + if(merge_level=='gene'){ + tx2gene <- data.frame('transcript' = gene_info,'gene' = gen1[,2],stringsAsFactors = FALSE) + }else{ + tx2gene <- data.frame('transcript' = gene_info,'gene' = gen1[,1],stringsAsFactors = FALSE) + } + } + eset <- load.exp.RNASeq.demo(files,type='salmon', + tx2gene=tx2gene, + use_phenotype_info=use_phenotype_info, + use_sample_col=use_sample_col, + use_design_col=use_design_col, + return_type=return_type, + merge_level=merge_level) + return(eset) +} + +#' Load Gene Expression Set from RNA-Seq Results (demo version) +#' +#' \code{load.exp.RNASeq.demo} is a function to read in RNA-Seq results and convert it to \code{eSet/DESeqDataSet} class object. +#' +#' This function helps users to read in RNA-Seq results from various sources. +#' Due to the complicated manipulations (e.g. reference sequence) in processing RNA-Seq, this demo function may not be suitable for all scenarios. +#' +#' @param files a vector of characters, the filenames for the transcript-level abundances. It will be passed to \code{tximport}. +#' For details, please check \code{tximport}. +#' @param type character, the type of software used to generate the abundances. It will be passed to \code{tximport}. +#' For details, please check \code{tximport}. +#' @param tx2gene data.frame or NULL, this parameter will be passed to \code{tximport}. For details, please check \code{tximport}. +#' @param use_phenotype_info data.frame, the data frame contains phenotype information. It must have the columns \code{use_sample_col} and \code{use_design_col}. +#' @param use_sample_col character, the column name, indicating which column in \code{use_phenotype_info} should be used as the sample name. +#' @param use_design_col character, the column name, indicating which column in \code{use_phenotype_info} should be used as the design feature for samples. +#' @param return_type character, the class of the return object. +#' "txi" is the output of \code{tximport}. It is a list containing three matrices, abundance, counts and length. +#' "counts" is the matrix of raw count. +#' "tpm" is the raw tpm. +#' "fpm", "cpm" is the fragments/counts per million mapped fragments. +#' "raw-dds" is the DESeqDataSet class object, which is the original one without processing. +#' "dds" is the DESeqDataSet class object, which is processed by \code{DESeq}. +#' "eset" is the ExpressionSet class object, which is processed by \code{DESeq} and \code{vst}. +#' Default is "tpm". +#' @param merge_level character, users can choose between "gene" and "transcript". +#' "gene", the original salmon results will be mapped to the transcriptome and the expression matrix will be merged to the gene level. +#' This only works when using e.g. "gencode.vXX.transcripts.fa" from GENCODE as the reference. +#' @export +load.exp.RNASeq.demo <- function(files,type='salmon', + tx2gene=NULL, + use_phenotype_info = NULL, + use_sample_col=NULL, + use_design_col=NULL, + return_type='tpm', + merge_level='gene') { + # + all_input_para <- c('files','type','tx2gene','use_phenotype_info','use_sample_col','use_design_col','return_type','merge_level') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('return_type',c('txi','counts','tpm','fpm','cpm','raw-dds','dds','eset'),envir=environment()), + check_option('merge_level',c('gene','transcript'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + n1 <- colnames(use_phenotype_info) + if(!use_sample_col %in% n1){ + message(sprintf('%s not in the colnames of use_phenotype_info, + please check and re-try !',use_sample_col));return(FALSE) + } + if(!use_design_col %in% n1){ + message(sprintf('%s not in the colnames of use_phenotype_info, + please check and re-try !',use_design_col));return(FALSE) + } + # get intersected samples + rownames(use_phenotype_info) <- use_phenotype_info[,use_sample_col] + w1 <- base::intersect(names(files),rownames(use_phenotype_info)) + files <- files[w1]; use_phenotype_info <- use_phenotype_info[w1,] + if(base::length(w1)==0){ + message(sprintf('No sample could match the %s in the use_phenotype_info, please check and re-try !',use_sample_col)) + return(FALSE) + } + message(sprintf('%d samples could further processed !',base::length(w1))) + # import into txi + txi <- tximport::tximport(files, type = type, tx2gene = tx2gene) ## key step one, tximport + if(return_type=='counts'){ + return(txi$counts) + } + if(return_type=='tpm'){ + return(txi$abundance) + } + if(return_type=='txi'){ + return(txi) + } + use_phenotype_info <- use_phenotype_info[colnames(txi$abundance), ] + tmp_phe <- base::cbind(group=use_phenotype_info[,use_design_col],use_phenotype_info,stringsAsFactors=FALSE) + # import into deseq2 + dds <- DESeq2::DESeqDataSetFromTximport(txi, colData = tmp_phe, design = ~ group) ## key step two, DESeqDataSetFromTximport + if(return_type=='raw-dds'){ + return(dds) + } + if(return_type=='fpm' | return_type=='cpm'){ + return(DESeq2::fpm(dds)) + } + dds <- DESeq2::DESeq(dds) + if(return_type=='dds'){ + return(dds) + }else{ + vsd <- DESeq2::vst(dds) + mat <- SummarizedExperiment::assay(vsd) + eset <- generate.eset(exp_mat=mat, phenotype_info = use_phenotype_info, feature_info = NULL, annotation_info='Salmon') + if(return_type=='eset') return(eset) + if(return_type=='both') return(list(eset=eset,dds=dds)) + } + } + +#' Generate ExpressionSet Object +#' +#' \code{generate.eset} generates ExpressionSet class object to contain and describe the high-throughput assays. +#' Users need to define its slots, which are expression matrix (required), +#' phenotype information and feature information (optional). +#' It is very useful when only expression matrix is available. +#' +#' @param exp_mat matrix, the expression data matrix. Each row represents a gene/transcript/probe, each column represents a sample. +#' @param phenotype_info data.frame, the phenotype information for all the samples in \code{exp_mat}. +#' In the phenotype data frame, each row represents a sample, each column represents a phenotype feature. +#' The row names must match the column names of \code{exp_mat}. If NULL, it will generate a single-column data frame. +#' Default is NULL. +#' @param feature_info data.frame, the feature information for all the genes/transcripts/probes in \code{exp_mat}. +#' In the feature data frame, each row represents a gene/transcript/probe and each column represents an annotation of the feature. +#' The row names must match the row names of \code{exp_mat}. If NULL, it will generate a single-column data frame. +#' Default is NULL. +#' @param annotation_info character, the annotation set by users for easier reference. Default is "". +#' +#' @return Return an ExressionSet object. +#' +#' @examples +#' mat1 <- matrix(rnorm(10000),nrow=1000,ncol=10) +#' colnames(mat1) <- paste0('Sample',1:ncol(mat1)) +#' rownames(mat1) <- paste0('Gene',1:nrow(mat1)) +#' eset <- generate.eset(exp_mat=mat1) +#' @export +generate.eset <- function(exp_mat=NULL, phenotype_info=NULL, feature_info=NULL, annotation_info="") { + # + all_input_para <- c('exp_mat') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(dim(exp_mat))==TRUE){ + exp_mat <- t(as.matrix(exp_mat));rownames(exp_mat) <- 'g1' + } + if (is.null(phenotype_info)) { + phenotype_info <- data.frame(group = colnames(exp_mat), stringsAsFactors = FALSE) + rownames(phenotype_info) <- colnames(exp_mat) + } + if (is.null(feature_info)) { + feature_info <- data.frame(gene = rownames(exp_mat), stringsAsFactors = FALSE) + rownames(feature_info) <- rownames(exp_mat) + } + if((class(phenotype_info)=='character' | is.null(dim(phenotype_info))==TRUE) & is.null(names(phenotype_info))==TRUE){ + phenotype_info <- data.frame(group = phenotype_info, stringsAsFactors = FALSE) + rownames(phenotype_info) <- colnames(exp_mat) + } + if((class(feature_info)=='character' | is.null(dim(feature_info))==TRUE) & is.null(names(feature_info))==TRUE){ + feature_info <- data.frame(gene = feature_info, stringsAsFactors = FALSE) + rownames(feature_info) <- rownames(exp_mat) + } + # + eset <- + new( + "ExpressionSet", + phenoData = new("AnnotatedDataFrame", phenotype_info), + featureData = new("AnnotatedDataFrame", feature_info), + annotation = annotation_info, + exprs = as.matrix(exp_mat) + ) + return(eset) +} + +#' Merge Two ExpressionSet Class Objects into One +#' +#' \code{merge_eset} merges two ExpressionSet class objects and returns one ExpresssionSet object. +#' If genes in the two ExpressionSet objects are identical, the expression matrix will be combined directly. +#' Otherwise, Z-transformation is strongly suggested to be performed before combination (set std=TRUE). +#' +#' @param eset1 ExpressionSet class, the first ExpressionSet. +#' @param eset2 ExpressionSet class, the second ExpressionSet. +#' @param group1 character, name of the first ExpressionSet. +#' @param group2 character, name of the second ExpressionSet. +#' @param use_col a vector of characters, the column names in the phenotype information to be kept. +#' If NULL, shared column names of \code{eset1} and \code{eset2} will be used. Default is NULL. +#' @param group_col_name character, name of the column which contains the names defined in \code{group1} and \code{group2}. +#' This column is designed to show which original ExpressionSet each sample comes from before combination. +#' Default name of this column is "original_group". +#' @param remove_batch logical, if TRUE, remove the batch effects from these two expression datasets. Default is FALSE. +#' @param std logical, whether to perform std to the original expression matrix. Default is FALSE. +#' +#' @return Return an ExressionSet class object. +#' @examples +#' mat1 <- matrix(rnorm(10000),nrow=1000,ncol=10) +#' colnames(mat1) <- paste0('Sample1_',1:ncol(mat1)) +#' rownames(mat1) <- paste0('Gene',1:nrow(mat1)) +#' eset1 <- generate.eset(exp_mat=mat1) +#' mat2 <- matrix(rnorm(10000),nrow=1000,ncol=10) +#' colnames(mat2) <- paste0('Sample2_',1:ncol(mat1)) +#' rownames(mat2) <- paste0('Gene',1:nrow(mat1)) +#' eset2 <- generate.eset(exp_mat=mat2) +#' new_eset <- merge_eset(eset1,eset2) +#' @export +merge_eset <- function(eset1,eset2, + group1=NULL,group2=NULL, + group_col_name='original_group', + use_col = NULL, + remove_batch = FALSE,std=FALSE) { + # + all_input_para <- c('eset1','eset2','group_col_name') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('remove_batch',c(TRUE,FALSE),envir=environment()), + check_option('std',c(TRUE,FALSE),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + mat1 <- Biobase::exprs(eset1) + mat2 <- Biobase::exprs(eset2) + w1 <- base::intersect(rownames(mat1), rownames(mat2)) + if(base::length(w1)==0){ + message('No overlap genes between two eSet, please check and re-try!');return(FALSE); + } + if((base::length(w1)1) + rphe <- base::rbind(phe1[colnames(mat1), use_col], phe2[colnames(mat2), use_col]) + if(base::length(use_col)==1){ + rphe <- c(phe1[colnames(mat1), use_col], phe2[colnames(mat2), use_col]) + rphe <- data.frame(rphe,stringsAsFactors=FALSE); colnames(rphe) <- use_col; + rownames(rphe) <- colnames(rmat) + } + if(base::length(use_col)==0){message('Warning: no intersected phenotype column!');} + if(is.null(group1)==TRUE) group1 <- 'group1' + if(is.null(group2)==TRUE) group2 <- 'group2' + rphe[[group_col_name]]<- c(rep(group1, ncol(mat1)), rep(group2, ncol(mat2))) + if (remove_batch == TRUE) { + rmat <- limma::removeBatchEffect(rmat,batch=rphe[[group_col_name]]) + } + if(class(rphe)=='list'){rphe <- as.data.frame(rphe,stringsAsFactors=FALSE); rownames(rphe) <- colnames(rmat)} + reset <- generate.eset(rmat,phenotype_info = rphe, annotation_info = 'combine') + return(reset) +} + +#' Reassign featureData slot of ExpressionSet and Update feature information +#' +#' \code{update_eset.feature} reassigns the featureData slot of ExpressionSet object based on user's demand. It is mainly used for gene ID conversion. +#' +#' User can pass a conversion table to \code{use_eset} for the ID conversion. A conversion table can be obtained from the original featureDta slot +#' (if one called the \code{load.exp.GEO} function and set getGPL==TRUE) or by running the \code{get_IDtransfer} function. +#' The mapping between original ID and target ID can be summerised into 4 categories. +#' 1) One-to-one, simply replaces the original ID with target ID; +#' 2) Many-to-one, the expression value for the target ID will be merged from its original ID; +#' 3) One-to-many, the expression value for the original ID will be distributed to the matched target IDs; +#' 4) Many-to-many, apply part 3) first, then part 2). +#' +#' @param use_eset ExpressionSet class object. +#' @param use_feature_info data.frame, a data frame contains feature information, it can be obtained by calling \code{fData} function. +#' @param from_feature character, orginal ID. Must be one of the column names in \code{use_feature_info} and correctly characterize the \code{use_eset}'s row names. +#' @param to_feature character, target ID. Must be one of the column names in \code{use_feature_info}. +#' @param merge_method character, the agglomeration method to be used for merging gene expression value. +#' This should be one of, "median", "mean", "max" or "min". Default is "median". +#' @param distribute_method character, the agglomeration method to be used for distributing the gene expression value. +#' This should be one of, "mean" or "equal". Default is "equal". +#' +#' @return Return an ExressionSet object with updated feature information. +#' @examples +#' mat1 <- matrix(rnorm(10000),nrow=1000,ncol=10) +#' colnames(mat1) <- paste0('Sample',1:ncol(mat1)) +#' rownames(mat1) <- paste0('Gene',1:nrow(mat1)) +#' eset <- generate.eset(exp_mat=mat1) +#' test_transfer_table <- data.frame( +#' 'Gene'=c('Gene1','Gene1','Gene2','Gene3','Gene4'), +#' 'Transcript'=c('T11','T12','T2','T3','T3')) +#' new_eset <- update_eset.feature(use_eset=eset, +#' use_feature_info=test_transfer_table, +#' from_feature='Gene', +#' to_feature='Transcript', +#' merge_method='median', +#' distribute_method='equal' +#' ) +#' print(Biobase::exprs(eset)[test_transfer_table$Gene,]) +#' print(Biobase::exprs(new_eset)) +#' +#' @export update_eset.feature +update_eset.feature <- function(use_eset=NULL,use_feature_info=NULL,from_feature=NULL,to_feature=NULL, + merge_method='median',distribute_method='equal'){ + # + all_input_para <- c('use_eset','from_feature','to_feature') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('merge_method',c("median","mean","max","min"),envir=environment()), + check_option('distribute_method',c('mean','equal'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(use_feature_info)) use_feature_info <- Biobase::fData(use_eset) + n1 <- colnames(use_feature_info) + if(!from_feature %in% n1){ + message(sprintf('%s not in in the colnames of use_feature_info, please re-try!',from_feature));return(use_eset) + } + if(!to_feature %in% n1){ + message(sprintf('%s not in in the colnames of use_feature_info, please re-try!',to_feature));return(use_eset) + } + mat <- Biobase::exprs(use_eset) + use_feature_info <- base::unique(use_feature_info); + w1 <- which(use_feature_info[,1]!="" & use_feature_info[,2]!="" & is.na(use_feature_info[,1])==FALSE & is.na(use_feature_info[,2])==FALSE) + use_feature_info <- use_feature_info[w1,] + g1 <- rownames(mat) ## rownames for the expmat + f1 <- clean_charVector(use_feature_info[,from_feature]) ## from feature info + t1 <- clean_charVector(use_feature_info[,to_feature]) ## to feature info + w1 <- which(f1 %in% g1); f1 <- f1[w1]; t1 <- t1[w1]; ## only consider features in the rownames of expmat + if(base::length(w1)==0){ + message(sprintf('Rownames of the expression matrix was not included in the %s column, please check and re-try !',from_feature)) + return(use_eset) + } + message(sprintf('%d transfer pairs related with %d rows from original expression matrix will be keeped !',base::length(w1),base::length(g1))) + fc1 <- base::table(f1); tc1 <- base::table(t1); fw1 <- which(fc1>1); tw1 <- which(tc1>1); ## check duplicate records + if(base::length(fw1)>0){ + message(sprintf('Original feature %s has %d items with duplicate records, will distribute the original values equal to all related items ! + if do not want this, please check and retry !',from_feature,base::length(fw1))) + #return(use_eset) + w2 <- which(f1 %in% names(fw1)) ## need to distribute + w0 <- base::setdiff(1:base::length(f1),w2) ## do not need to distribute + if(distribute_method=='equal'){ + v1 <- mat[f1[w2],]; ## distribute equal + } + if(distribute_method=='mean'){ + v1 <- mat[f1[w2],]; ## distribute mean + tt <- as.numeric(base::table(f1[w2])[f1[w2]]) + v1 <- v1/tt; + } + rownames(v1) <- paste0(f1[w2],'-',t1[w2]); + f1[w2] <- paste0(f1[w2],'-',t1[w2]); # update transfer table + mat <- base::rbind(v1,mat[f1[w0],]) # update mat table + fc1 <- base::table(f1); tc1 <- base::table(t1); fw1 <- which(fc1>1); tw1 <- which(tc1>1); ## update f1, t1 and related values + } + if(base::length(tw1)>0){ + w2 <- which(t1 %in% names(tw1)) ## need to merge + w0 <- base::setdiff(1:base::length(t1),w2) ## do not need to merge + mat_new_0 <- mat[f1[w0],]; rownames(mat_new_0) <- t1[w0] ## mat do not need to merge + if(merge_method=='mean') tmp1 <- stats::aggregate(mat[f1[w2],,drop=FALSE],list(t1[w2]),function(x){base::mean(x,na.rm=TRUE)}) + if(merge_method=='median') tmp1 <- stats::aggregate(mat[f1[w2],,drop=FALSE],list(t1[w2]),function(x){stats::median(x,na.rm=TRUE)}) + if(merge_method=='max') tmp1 <- stats::aggregate(mat[f1[w2],,drop=FALSE],list(t1[w2]),function(x){base::max(x,na.rm=TRUE)}) + if(merge_method=='min') tmp1 <- stats::aggregate(mat[f1[w2],,drop=FALSE],list(t1[w2]),function(x){base::min(x,na.rm=TRUE)}) + mat_new_1 <- tmp1[,-1]; rownames(mat_new_1) <- tmp1[,1] ## mat merged + mat_new <- base::rbind(mat_new_0,mat_new_1) + }else{ + mat_new <- mat[f1,] + rownames(mat_new) <- t1 + } + new_eset <- generate.eset(exp_mat=mat_new, phenotype_info=Biobase::pData(use_eset), feature_info=NULL, annotation_info=annotation(use_eset)) + return(new_eset) +} + +#' Reassign the phenoData slot of ExpressionSet and Update phenotype information +#' +#' \code{update_eset.phenotype} reassigns the phenoData slot of ExpressionSet based on user's demand. +#' It is mainly used to modify sample names and extract interested phenotype information for further sample clustering. +#' +#' @param use_eset ExpressionSet class object. +#' @param use_phenotype_info data.frame, a dataframe contains phenotype information, can be obtained by calling \code{pData} function. +#' @param use_sample_col character, must be one of the column names in \code{use_phenotype_info}. +#' @param use_col character, the columns will be kept from \code{use_phenotype_info}. +#' 'auto', only extracting 'cluster-meaningful' sample features (e.g. it is meaningless to use 'gender' as clustering feature, if all samples are female). +#' 'GEO-auto' means it will extract the following selected columns, +#' "geo_accession", "title", "source_name_ch1", and columns ended with ":ch1". Default is "auto". +#' @return Return an ExressionSet object with updated phenotype information. +#' @examples +#' \dontrun{ +#' net_eset <- load.exp.GEO(out.dir='./test', +#' GSE='GSE116028', +#' GPL='GPL6480', +#' getGPL=TRUE, +#' update=FALSE) +#' net_eset <- update_eset.phenotype(use_eset=net_eset, +#' use_phenotype_info=Biobase::pData(net_eset), +#' use_sample_col='geo_accession', +#' use_col='GEO-auto') +#' } +#' @export update_eset.phenotype +update_eset.phenotype <- function(use_eset=NULL,use_phenotype_info=NULL,use_sample_col=NULL,use_col='auto'){ + # + all_input_para <- c('use_eset') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(use_eset)){ + message('use_eset required, please re-try !'); + return(use_eset) + } + if(is.null(use_phenotype_info)) use_phenotype_info <- Biobase::pData(use_eset) + if(is.null(use_sample_col)==FALSE){ + if(!use_sample_col %in% colnames(use_phenotype_info)){ + stop(sprintf('%s not in the colnames of use_phenotype_info, please re-try!',use_sample_col));#return(use_eset) + } + } + if(is.null(use_col)) use_col <- colnames(use_phenotype_info) + mat <- Biobase::exprs(use_eset) + s1 <- colnames(mat) ## all samples + if(is.null(use_sample_col)==TRUE){ + p1 <- rownames(use_phenotype_info) + }else{ + p1 <- use_phenotype_info[,use_sample_col] ## sample in the phenotype info + } + w1 <- which(p1 %in% s1); p1 <- p1[w1]; ## only consider samples in the colnames of expmat + if(base::length(w1)==0){ + if(is.null(use_sample_col)==TRUE){ + message('Colnames of the expression matrix was not included in the rownames of use_phenotype_info, please check and re-try !') + }else{ + message(sprintf('Colnames of the expression matrix was not included in the %s column, please check and re-try !',use_sample_col)) + } + return(use_eset) + } + message(sprintf('%d out of %d samples from the expression matrix will be keeped !',base::length(w1),base::length(s1))) + mat_new <- mat[,p1] + use_phenotype_info <- use_phenotype_info[w1,] + n1 <- colnames(use_phenotype_info) + if(use_col[1] == 'GEO-auto'){ + w1 <- c('geo_accession','title','source_name_ch1',n1[grep(':ch1',n1)]) + p1 <- use_phenotype_info[,w1] + colnames(p1)[4:ncol(p1)] <- gsub('(.*):ch1','\\1',colnames(p1)[4:ncol(p1)]) + colnames(p1)[3] <- gsub('(.*)_ch1','\\1',colnames(p1)[3]) + if(is.null(use_sample_col)==FALSE) rownames(p1) <- use_phenotype_info[,use_sample_col] + if(base::length(w1)>1) p1 <- as.data.frame(apply(p1,2,clean_charVector),stringsAsFactors=FALSE) + if(base::length(w1)==1) p1 <- as.data.frame(clean_charVector(p1),stringsAsFactors=FALSE) + new_phenotype_info <- p1; + }else{ + if(use_col[1] == 'auto'){ + u1 <- apply(use_phenotype_info,2,function(x)base::length(base::unique(x))) + w1 <- which(u1>=2 & u1<=nrow(use_phenotype_info)-1) + if(base::length(w1)==0){ + message('No column could match the auto criteria, please check and re-try!');return(FALSE) + } + p1 <- use_phenotype_info[,w1] + if(base::length(w1)>1) p1 <- as.data.frame(apply(p1,2,clean_charVector),stringsAsFactors=FALSE) + if(base::length(w1)==1) p1 <- as.data.frame(clean_charVector(p1),stringsAsFactors=FALSE) + new_phenotype_info <- use_phenotype_info[,w1];names(new_phenotype_info) <- names(use_phenotype_info)[w1]; + }else{ + if(base::length(base::setdiff(use_col,n1))>0){ + message(sprintf('%s not in use_phenotype_info, please re-try!',base::paste(base::setdiff(use_col,n1),collapse=';')));return(FALSE) + } + p1 <- use_phenotype_info[,use_col] + if(base::length(use_col)>1) p1 <- as.data.frame(apply(p1,2,clean_charVector),stringsAsFactors=FALSE) + if(base::length(use_col)==1) p1 <- as.data.frame(clean_charVector(p1),stringsAsFactors=FALSE) + new_phenotype_info <- p1; names(new_phenotype_info) <- use_col; + } + } + rownames(new_phenotype_info) <- rownames(use_phenotype_info) + #print(new_phenotype_info) + message(sprintf('%d out of %d sample features will be keeped !',ncol(new_phenotype_info),ncol(use_phenotype_info))) + new_eset <- generate.eset(exp_mat=mat_new, phenotype_info=new_phenotype_info, feature_info=Biobase::fData(use_eset), annotation_info=annotation(use_eset)) + return(new_eset) +} + +#' IQR (interquartile range) filter to extract genes from expression matrix +#' +#' \code{IQR.filter} is a function to extract genes from the expression matrix by setting threshold to their IQR value. +#' IQR (interquartile range) is a measure of statistical dispersion. It is calculated for each gene across all the samples. +#' By setting threshold value, genes with certain statistical dispersion across samples will be filtered out. +#' This step is mainly used to perform sample cluster and to prepare the input for SJAracne. +#' +#' @param exp_mat matrix, the gene expression matrix. Each row represents a gene/transcript/probe, each column represents a sample. +#' @param use_genes a vector of characters, the gene list needed to be filtered. Default is the row names of \code{exp_mat}. +#' @param thre numeric, the threshold for IQR of the genes in \code{use_genes}. Default is 0.5. +#' @param loose_gene a vector of characters, the gene list that only need to pass the \code{loose_thre}. +#' This parameter is designed for the input of possible drivers used in SJAracne. Default is NULL. +#' @param loose_thre numeric, the threshold for IQR of the genes in \code{loose_gene}. Default is 0.1. +#' @return Return a vector with logical values indicate which genes should be kept. +#' @examples +#' mat1 <- matrix(rnorm(15000),nrow=1500,ncol=10) +#' colnames(mat1) <- paste0('Sample',1:ncol(mat1)) +#' rownames(mat1) <- paste0('Gene',1:nrow(mat1)) +#' choose1 <- IQR.filter(mat1,thre=0.5, +#' loose_gene=paste0('Gene',1:100)) +#' @export +IQR.filter <- function(exp_mat,use_genes=rownames(exp_mat),thre = 0.5,loose_gene=NULL,loose_thre=0.1) { + # + all_input_para <- c('exp_mat','use_genes','thre') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + use_genes <- base::intersect(use_genes,rownames(exp_mat)) + use_genes <- base::setdiff(use_genes,"") + exp_mat <- exp_mat[use_genes,] + iqr <- apply(exp_mat, 1, stats::IQR) ## calculate IQR for each gene + choose0 <- use_genes[iqr > quantile(iqr, loose_thre)] ## for loose_gene + choose1 <- use_genes[iqr > quantile(iqr, thre)] ## for all genes + choose2 <- base::unique(c(base::intersect(loose_gene, choose0), choose1)) ## union set + use_vec <- rep(FALSE,length.out=base::length(use_genes));names(use_vec) <- use_genes + use_vec[choose2] <- TRUE + print(base::table(use_vec)) + return(use_vec) +} + +#' Normalization of RNA-Seq Reads Count +#' +#' \code{RNASeqCount.normalize.scale} is a simple version to normalize the RNASeq reads count data. +#' +#' Users can also load \code{load.exp.RNASeq.demo}, and follow the \code{DESeq2} pipeline for RNASeq data processing. +#' Warning, \code{load.exp.RNASeq.demo} and \code{load.exp.RNASeq.demoSalmon} in NetBID2 may not cover all the possible scenarios. +#' +#' @param mat matrix, matrix of RNA-Seq reads data. Each row is a gene/transcript, each column is a sample. +#' @param total integer, total RNA-Seq reads count. If NULL, will use the mean of each column's summation. Default is NULL. +#' @param pseudoCount integer, the integer added to avoid "-Inf" showing up during log transformation. Default is 1. +#' +#' @return Return a numeric matrix, containing the normalized RNA-Seq reads count. +#' +#' @examples +#' mat1 <- matrix(rnbinom(10000, mu = 10, size = 1),nrow=1000,ncol=10) +#' colnames(mat1) <- paste0('Sample1',1:ncol(mat1)) +#' rownames(mat1) <- paste0('Gene',1:nrow(mat1)) +#' norm_mat1 <- RNASeqCount.normalize.scale(mat1) +#' @export +RNASeqCount.normalize.scale <- function(mat, + total = NULL, + pseudoCount = 1) { + # + all_input_para <- c('mat','pseudoCount') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + d <- mat + if (!is.data.frame(d)) + d <- data.frame(d) + if (!all(d > 0)) + d <- d + pseudoCount + s <- apply(d, 2, sum) + m <- + ifelse(is.null(total), as.integer(base::mean(s)), as.integer(total)) ## total or mean sum + options(digits = 2 + nchar(m)) + fac <- m / s + for (i in 1:base::length(s)) { + d[, i] <- round(d[, i] * fac[i], 0) + } + if (!all(d > 0)) + d <- d + pseudoCount + d +} + +## inner function for dist2 +dist2.mod <- function (x, fun = function(a, b) base::mean(abs(a - b), na.rm = TRUE), + diagonal = 0) +{ + if (!(is.numeric(diagonal) && (base::length(diagonal) == 1))) + stop("'diagonal' must be a numeric scalar.") + if (missing(fun)) { + res = apply(x, 2, function(w) base::colMeans(abs(x - w), na.rm = TRUE)) + } + else { + res = matrix(diagonal, ncol = ncol(x), nrow = ncol(x)) + if (ncol(x) >= 2) { + for (j in 2:ncol(x)) for (i in 1:(j - 1)) res[i, + j] = res[j, i] = fun(x[, i], x[, j]) + } + } + colnames(res) = rownames(res) = colnames(x) + return(res) +} +########################### activity-related functions +## functions for activity score calculation, mean, absmean, maxmean, weighted mean ? +es <- function(z, es.method = "mean") { + if (es.method == "maxmean") { + n <- base::length(z) + m1 <- ifelse(sum(z > 0) > 0, sum(z[z > 0]) / n, 0) + m2 <- ifelse(sum(z < 0) > 0, sum(z[z < 0]) / n, 0) + if (m1 > -m2) + es <- m1 + else + es <- m2 + } + else if (es.method == 'absmean') { + es <- base::mean(abs(z),na.rm=TRUE) + } + else if (es.method == 'mean') { + es <- base::mean(z,na.rm=TRUE) + } + else if (es.method == 'median') { + es <- stats::median(z,na.rm=TRUE) + } + else if (es.method == 'max') { + es <- base::max(z,na.rm=TRUE) + } + else if (es.method == 'min') { + es <- base::min(z,na.rm=TRUE) + } + return(es) +} +do.std <- function(x) { + x <- x[!is.na(x)] + (x - base::mean(x,na.rm=TRUE)) / sd(x,na.rm=TRUE) +} + +#' Calculate Activity Value for Each Driver +#' +#' \code{cal.Activity} calculates the activity value for each driver. +#' This function requires two inputs, the driver-to-target list object \code{target_list} and the expression matrix. +#' +#' @param target_list list, the driver-to-target list object. Either igraph_obj or target_list is necessary for this function. +#' The names of the list elements are drivers. +#' Each element is a data frame, usually contains at least three columns. +#' "target", target gene names; +#' "MI", mutual information; +#' "spearman", spearman correlation coefficient. +#' "MI" and "spearman" is necessary if es.method="weightedmean". +#' Users can call \code{get.SJAracne.network} to get this list from the network file generated by SJAracne (the second element of the return list) +#' or prepare the list object by hand but should match the data format described above. +#' @param igraph_obj igraph object, optional. Either igraph_obj or target_list is necessary for this function. +#' Users can call \code{get.SJAracne.network} to get this object from the network file generated by SJAracne (the third element of the return list), +#' or prepare the igraph network object by hand (Directed network and the edge attributes should include "weight" and "sign" if es.method="weightedmean"). +#' @param cal_mat numeric matrix, the expression matrix of genes/transcripts. +#' @param es.method character, method applied to calculate the activity value. User can choose from "mean", "weightedmean", "maxmean" and "absmean". +#' Default is "weightedmean". +#' @param std logical, if TRUE, the expression matrix will be normalized by column. Default is TRUE. +#' @param memory_constrain logical, if TRUE, the calculation strategy will not use Matrix Cross Products, which is memory consuming. +#' Default is FALSE. +#' @return Return a matrix of activity values. Rows are drivers, columns are samples. +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ac_mat <- cal.Activity(target_list=analysis.par$merge.network$target_list, +#' cal_mat=Biobase::exprs(analysis.par$cal.eset), +#' es.method='weightedmean') +#' ac_mat <- cal.Activity(igraph_obj=analysis.par$merge.network$igraph_obj, +#' cal_mat=Biobase::exprs(analysis.par$cal.eset), +#' es.method='maxmean') +#' @export +cal.Activity <- function(target_list=NULL, igraph_obj = NULL, cal_mat=NULL, es.method = 'weightedmean',std=TRUE,memory_constrain=FALSE) { + # + all_input_para <- c('cal_mat','es.method','std','memory_constrain') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('memory_constrain',c(TRUE,FALSE),envir=environment()), + check_option('std',c(TRUE,FALSE),envir=environment()), + check_option('es.method',c('mean','weightedmean','maxmean','absmean'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(target_list)==TRUE & is.null(igraph_obj)==TRUE){ + message('Either target_list or igraph_obj is required, please check and re-try!');return(FALSE); + } + if(is.null(target_list)==FALSE & memory_constrain==TRUE){ + ac.mat <- cal.Activity.old(target_list=target_list, cal_mat=cal_mat, es.method = es.method,std=std) + return(ac.mat) + } + if(is.null(target_list)==TRUE & memory_constrain==TRUE){ + message('Only accepts target_list input when memory_constrain=TRUE, please check and re-try!');return(FALSE); + } + if(nrow(cal_mat)==0){ + message('No genes in the cal_mat, please check and re-try!');return(FALSE); + } + if(std==TRUE) cal_mat <- apply(cal_mat, 2, do.std) + if(is.null(igraph_obj)==FALSE){ + gr <- igraph_obj + mat1 <- get_igraph2matrix(gr,es.method=es.method) + mat2 <- get_igraph2matrix(gr,es.method='mean') + all_source <- get_gr2driver(gr) + }else{ + if(is.null(target_list)==FALSE){ + mat1 <- get_target_list2matrix(target_list,es.method=es.method) + mat2 <- get_target_list2matrix(target_list,es.method='mean') + all_source <- names(target_list) + } + } + ## + mat1_source <- mat1[all_source,,drop=FALSE] + w1 <- base::intersect(rownames(cal_mat),colnames(mat1_source)) + if(base::length(w1)==0){ + message('No intersected genes found for the cal_mat and target in the network, please check and re-try!'); + return(FALSE) + } + use_mat1_source <- mat1_source[,w1,drop=FALSE] ## network info + use_mat2_source <- mat2[all_source,w1,drop=FALSE] ## network binary info + + ## weighted mean + mean + if(es.method %in% c('weightedmean','mean')){ + use_cal_mat <- cal_mat[w1,,drop=FALSE] ## expression info + out_mat <- use_mat1_source %*% use_cal_mat + out_mat <- out_mat/Matrix::rowSums(use_mat2_source) ## get mean + } + ## absmean + if(es.method == 'absmean'){ + use_cal_mat <- cal_mat[w1,,drop=FALSE] ## expression info + out_mat <- use_mat1_source %*% abs(use_cal_mat) + out_mat <- out_mat/Matrix::rowSums(use_mat2_source) ## get mean + } + ## maxmean + if(es.method == 'maxmean'){ + use_cal_mat <- cal_mat[w1,,drop=FALSE] ## expression info + use_cal_mat_pos <- use_cal_mat;use_cal_mat_pos[which(use_cal_mat_pos<0)] <- 0; + use_cal_mat_neg <- use_cal_mat;use_cal_mat_neg[which(use_cal_mat_neg>0)] <- 0; + out_mat_pos <- use_mat1_source %*% use_cal_mat_pos + out_mat_pos <- out_mat_pos/Matrix::rowSums(use_mat2_source) ## get mean + out_mat_neg <- use_mat1_source %*% use_cal_mat_neg + out_mat_neg <- out_mat_neg/Matrix::rowSums(use_mat2_source) ## get mean + out_mat_sign <- sign(abs(out_mat_pos)-abs(out_mat_neg)) + out_mat_sign_pos <- out_mat_sign; out_mat_sign_pos[out_mat_sign_pos!=1] <-0; + out_mat_sign_neg <- out_mat_sign; out_mat_sign_neg[out_mat_sign_neg!= -1] <-0; + out_mat <- out_mat_pos*out_mat_sign_pos-out_mat_neg*out_mat_sign_neg + } + ## median, min, max , not supported + # output + ac.mat <- as.matrix(out_mat) + w1 <- which(is.na(ac.mat[,1])==FALSE) + if(base::length(w1)==0){ + message('Fail in calculating activity, please check the ID type in cal_mat and target_list and try again !') + } + ac.mat <- ac.mat[w1,,drop=FALSE] + return(ac.mat) +} + +## inner functions +cal.Activity.old <- function(target_list=NULL, cal_mat=NULL, es.method = 'weightedmean',std=TRUE) { + ## mean, absmean, maxmean, weightedmean + use_genes <- row.names(cal_mat) + if(base::length(use_genes)==0){ + message('No genes in the cal_mat, please check and re-try!');return(FALSE); + } + all_target <- target_list + #all_target <- all_target[base::intersect(use_genes, names(all_target))] ## if the driver is not included in cal_mat but its target genes are included, will also calculate activity + ac.mat <- + matrix(NA, ncol = ncol(cal_mat), nrow = base::length(all_target)) ## generate activity matrix, each col for sample, each row for source target + #z-normalize each sample + if(std==TRUE) cal_mat <- apply(cal_mat, 2, do.std) + for (i in 1:base::length(all_target)) { + x <- names(all_target)[i] + x1 <- all_target[[x]] + x2 <- base::unique(base::intersect(rownames(x1), use_genes)) ## filter target by cal genes + x1 <- x1[x2, ] ## target info + target_num <- base::length(x2) + if (target_num == 0) + next + if (target_num == 1){ + if (es.method == 'weightedmean') ac.mat[i, ] <- cal_mat[x2,] + if (es.method != 'weightedmean') ac.mat[i, ] <- cal_mat[x2,]*x1$MI * sign(x1$spearman) + next + } + if (es.method != 'weightedmean') + ac.mat[i, ] <- apply(cal_mat[x2,,drop=FALSE], 2, es, es.method) + if (es.method == 'weightedmean') { + weight <- x1$MI * sign(x1$spearman) + ac.mat[i, ] <- apply(cal_mat[x2,,drop=FALSE] * weight, 2, es, 'mean') + } + } + rownames(ac.mat) <- names(all_target) + colnames(ac.mat) <- colnames(cal_mat) + w1 <- which(is.na(ac.mat[,1])==FALSE) + if(base::length(w1)==0){ + message('Fail in calculating activity, please check the ID type in cal_mat and target_list and try again !') + } + ac.mat <- ac.mat[w1,] + return(ac.mat) +} +get_target_list2matrix <- function(target_list=NULL,es.method = 'weightedmean') { + all_source <- names(target_list) + all_target <- base::unique(unlist(lapply(target_list,function(x)rownames(x)))) + mat1 <- matrix(0,nrow=base::length(all_source),ncol=base::length(all_target)) + rownames(mat1) <- all_source; + colnames(mat1) <- all_target; + for(i in all_source){ + if(es.method!='weightedmean') mat1[i,rownames(target_list[[i]])] <- 1; + if(es.method=='weightedmean') mat1[i,rownames(target_list[[i]])] <- target_list[[i]]$MI*sign(target_list[[i]]$spearman); + } + return(mat1) +} +get_igraph2matrix <- function(gr=NULL,es.method = 'weightedmean'){ + if(es.method=='weightedmean'){ + if('weight' %in% igraph::edge_attr_names(gr) & 'sign' %in% igraph::edge_attr_names(gr)){ + mat1 <- igraph::as_adjacency_matrix(gr,type='both',attr = 'weight') + mat2 <- igraph::as_adjacency_matrix(gr,type='both',attr = 'sign') + mat1 <- mat1*mat2 + }else{ + message('weight, sign attributes are not included in the input igraph object, weightedmean can not be used !') + return(FALSE) + } + }else{ + mat1 <- igraph::as_adjacency_matrix(gr,type='both') + } + return(mat1) +} +get_gr2driver <- function(gr,mode='out'){ + d1 <- igraph::degree(gr,mode=mode) + names(d1[which(d1>0)]) +} + +#' Calculate Activity Value for Gene Sets +#' +#' \code{cal.Activity.GS} calculates activity value for each gene set, and return a numeric matrix with rows of gene sets and columns of samples. +#' +#' @param use_gs2gene list, contains elements of gene sets. Element name is gene set name, each element contains a vector of genes belong to that gene set. +#' Default is using \code{all_gs2gene[c('H','CP:BIOCARTA','CP:REACTOME','CP:KEGG')]}, which is loaded from \code{gs.preload}. +#' @param cal_mat numeric matrix, gene/transcript expression matrix. +#' @param es.method character, method to calculate the activity value. Users can choose from "mean", "absmean", "maxmean", "gsva", "ssgsea", "zscore" and "plage". +#' The details for using the last four options, users can check \code{gsva}. Default is "mean". +#' @param std logical, if TRUE, the expression matrix will be normalized by column. Default is TRUE. +#' @return Return an activity matrix with rows of gene sets and columns of samples. +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' gs.preload(use_spe='Homo sapiens',update=FALSE) +#' use_gs2gene <- merge_gs(all_gs2gene=all_gs2gene, +#' use_gs=c('H','CP:BIOCARTA','CP:REACTOME','CP:KEGG','C5')) +#' exp_mat_gene <- Biobase::exprs(analysis.par$cal.eset) +#' ## each row is a gene symbol, if not, must convert ID first +#' ac_gs <- cal.Activity.GS(use_gs2gene = use_gs2gene, +#' cal_mat = exp_mat_gene) +#' @export +cal.Activity.GS <- function(use_gs2gene=all_gs2gene[c('H','CP:BIOCARTA','CP:REACTOME','CP:KEGG')], cal_mat=NULL, es.method = 'mean',std=TRUE) { + # + all_input_para <- c('use_gs2gene','cal_mat','es.method','std') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('std',c(TRUE,FALSE),envir=environment()), + check_option('es.method',c("mean","absmean","maxmean","gsva","ssgsea","zscore","plage"),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + while(class(use_gs2gene[[1]])=='list'){ + nn <- unlist(lapply(use_gs2gene,names)) + use_gs2gene <- unlist(use_gs2gene,recursive = FALSE) + names(use_gs2gene)<-nn + } + use_genes <- row.names(cal_mat) + if(es.method %in% c('gsva','ssgsea','zscore','plage')){ + ac.mat <- GSVA::gsva(generate.eset(cal_mat),use_gs2gene,method=es.method) + ac.mat <- Biobase::exprs(ac.mat) + return(ac.mat) + } + ac.mat <- + matrix(NA, ncol = ncol(cal_mat), nrow = base::length(use_gs2gene)) ## generate activity matrix, each col for sample, each row for source target + #z-normalize each sample + if(std==TRUE) cal_mat <- apply(cal_mat, 2, do.std) + for (i in 1:base::length(use_gs2gene)) { + x <- names(use_gs2gene)[i] + x1 <- use_gs2gene[[x]] + x2 <- base::unique(base::intersect(x1, use_genes)) ## filter target by cal genes + target_num <- base::length(x2) + if (target_num == 0) + next + if (target_num == 1){ + ac.mat[i, ] <- cal_mat[x2,] + next + } + ac.mat[i, ] <- apply(cal_mat[x2,,drop=FALSE], 2, es, es.method) + } + rownames(ac.mat) <- names(use_gs2gene) + colnames(ac.mat) <- colnames(cal_mat) + w1 <- apply(ac.mat,1,function(x)base::length(which(is.na(x)==TRUE))) + ac.mat <- ac.mat[which(w1==0),,drop=FALSE] + return(ac.mat) +} + +#' Differential Expression Analysis and Differential Activity Analysis Between 2 Sample Groups Using Bayesian Inference +#' +#' \code{getDE.BID.2G} is a function performs differential gene expression analysis and differential driver activity analysis between +#' control group (parameter G0) and experimental group (parameter G1). +#' +#' @param eset ExpressionSet class object, contains gene expression data or driver activity data. +#' @param output_id_column character, the column names of Biobase::fData(eset). +#' This option is useful when the \code{eset} expression matrix is at transcript-level, and user is expecting to see the gene-level comparison statistics. +#' If NULL, rownames of the Biobase::fData(eset) will be used. +#' Default is NULL. +#' @param G1 a vector of characters, the sample names of experimental group. +#' @param G0 a vecotr of characters, the sample names of control group. +#' @param G1_name character, the name of experimental group (e.g. "Male"). Default is "G1". +#' @param G0_name character, the name of control group (e.g. "Female"). Default is "G0". +#' @param logTransformed logical, if TRUE, log tranformation of the expression value will be performed. +#' @param method character, users can choose between "MLE" and "Bayesian". +#' "MLE", the maximum likelihood estimation, will call generalized linear model(glm/glmer) to perform data regression. +#' "Bayesian", will call Bayesian generalized linear model (bayesglm) or multivariate generalized linear mixed model (MCMCglmm) to perform data regression. +#' Default is "Bayesian". +#' @param family character or family function or the result of a call to a family function. +#' This parameter is used to define the model's error distribution. See \code{?family} for details. +#' Currently, options are gaussian, poisson, binomial(for two-group sample classes)/category(for multi-group sample classes)/ordinal(for multi-group sample classes with class_ordered=TRUE). +#' If set with gaussian or poission, the response variable in the regression model will be the expression level, and the independent variable will be the sample's phenotype. +#' If set with binomial, the response variable in the regression model will be the sample phenotype, and the independent variable will be the expression level. +#' For binomial, category and ordinal input, the family will be automatically reset, based on the sample's class level and the setting of \code{class_ordered}. +#' Default is gaussian. +#' @param pooling character, users can choose from "full","no" and "partial". +#' "full", use probes as independent observations. +#' "no", use probes as independent variables in the regression model. +#' "partial", use probes as random effect in the regression model. +#' Default is "full". +#' @param verbose logical, if TRUE, sample names of both groups will be printed. Default is TRUE. +#' +#' @return +#' Return a data frame. Rows are genes/drivers, columns are "ID", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "Z-statistics", "Ave.G1" and "Ave.G0". +#' Names of the columns may vary from different group names. Sorted by P.Value. +#' +#' @examples +#' mat <- matrix(c(0.50099,-1.2108,-1.0524, +#' 0.34881,-0.13441,0.87112, +#' 1.84579,-2.0356,-2.6025, +#' 1.62954,1.88281,1.29604),nrow=2,byrow=TRUE) +#' rownames(mat) <- c('A1','A2') +#' colnames(mat) <- c('Case-rep1','Case-rep2','Case-rep3', +#' 'Control-rep1','Control-rep2','Control-rep3') +#' tmp_eset <- generate.eset(mat,feature_info=data.frame(row.names=rownames(mat), +#' probe=rownames(mat),gene=rep('GeneX',2), +#' stringsAsFactors = FALSE)) +#' res1 <- getDE.BID.2G(tmp_eset,output_id_column='probe', +#' G1=c('Case-rep1','Case-rep2','Case-rep3'), +#' G0=c('Control-rep1','Control-rep2','Control-rep3')) +#' res2 <- getDE.BID.2G(tmp_eset,output_id_column='gene', +#' G1=c('Case-rep1','Case-rep2','Case-rep3'), +#' G0=c('Control-rep1','Control-rep2','Control-rep3')) +#' res3 <- getDE.BID.2G(tmp_eset,output_id_column='gene', +#' G1=c('Case-rep1','Case-rep2','Case-rep3'), +#' G0=c('Control-rep1','Control-rep2','Control-rep3'), +#' pooling='partial') +#' \dontrun{ +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' phe_info <- Biobase::pData(analysis.par$cal.eset) +#' each_subtype <- 'G4' +#' G0 <- rownames(phe_info)[which(phe_info$`subgroup`!=each_subtype)] # get sample list for G0 +#' G1 <- rownames(phe_info)[which(phe_info$`subgroup`==each_subtype)] # get sample list for G1 +#' DE_gene_BID <- getDE.BID.2G(eset=analysis.par$cal.eset, +#' G1=G1,G0=G0, +#' G1_name=each_subtype, +#' G0_name='other') +#' DA_driver_BID <- getDE.BID.2G(eset=analysis.par$merge.ac.eset, +#' G1=G1,G0=G0, +#' G1_name=each_subtype, +#' G0_name='other') +#' } +#' @export +getDE.BID.2G <-function(eset,output_id_column=NULL,G1=NULL, G0=NULL,G1_name=NULL,G0_name=NULL,method='Bayesian',family=gaussian,pooling='full',logTransformed=TRUE,verbose=TRUE){ + # + all_input_para <- c('eset','G1','G0','method','family','pooling','logTransformed','verbose') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('logTransformed',c(TRUE,FALSE),envir=environment()), + check_option('verbose',c(TRUE,FALSE),envir=environment()), + check_option('method',c('Bayesian','MLE'),envir=environment()), + check_option('pooling',c('full','no','partial'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + exp_mat <- Biobase::exprs(eset) + G1 <- base::intersect(G1,colnames(exp_mat)) + G0 <- base::intersect(G0,colnames(exp_mat)) + if(verbose==TRUE){ + print(sprintf('G1:%s', base::paste(G1, collapse = ';'))) + print(sprintf('G0:%s', base::paste(G0, collapse = ';'))) + } + if(base::length(G1)==0 | base::length(G0)==0){ + message('Too few samples, please check the sample name of G1, G0 and samples in eset !');return(FALSE); + } + # + rn <- rownames(exp_mat) + exp_mat <- exp_mat[,c(G1,G0)] + if(is.null(dim(exp_mat))==TRUE){exp_mat <- t(as.matrix(exp_mat));rownames(exp_mat)<-rn} + if(is.null(output_id_column)==TRUE) use_id <- rownames(Biobase::fData(eset)) else use_id <- Biobase::fData(eset)[,output_id_column] + comp <- c(rep(1,length.out=base::length(G1)),rep(0,length.out=base::length(G0))) + all_id <- base::unique(use_id) + de <- lapply(all_id,function(x){ + w1 <- which(use_id==x) + x1 <- exp_mat[w1,,drop=FALSE] + bid(mat=x1,use_obs_class=comp,class_order=c(0,1),family=family,method=method, + nitt=13000,burnin=3000,thin=1,pooling=pooling,class_ordered=FALSE, + logTransformed=logTransformed,std=FALSE,average.method=c('geometric'),verbose=FALSE) + }) + de <- as.data.frame(do.call(base::rbind,de)) + de$adj.P.Val<-p.adjust(de$P.Value,'fdr') + de$logFC<-sign(de$FC)*log2(abs(de$FC)) + de$ID <- all_id + de<-de[,c('ID','logFC','AveExpr','t','P.Value','adj.P.Val','Z-statistics')] + rownames(de) <- de$ID + de <- de[order(de$P.Value),] + tT <- de + new_mat <- stats::aggregate(exp_mat,list(use_id),mean)[,-1] + tT <- tT[all_id,,drop=FALSE] + exp_G1 <- base::rowMeans(new_mat[,G1,drop=FALSE]); + exp_G0 <- base::rowMeans(new_mat[,G0,drop=FALSE]); + tT <- base::cbind(tT,'Ave.G0'=exp_G0,'Ave.G1'=exp_G1) + if(is.null(G0_name)==FALSE) colnames(tT) <- gsub('Ave.G0',paste0('Ave.',G0_name),colnames(tT)) + if(is.null(G1_name)==FALSE) colnames(tT) <- gsub('Ave.G1',paste0('Ave.',G1_name),colnames(tT)) + tT <- tT[order(tT$P.Value),] + return(tT) +} + +#' Combine Multiple Comparison Results from Differential Expression (DE) or Differential Activity (DA) Analysis +#' +#' \code{combineDE} combines multiple comparisons of DE or DA analysis. +#' Can combine DE with DE, DA with DA and also DE with DA if proper transfer table prepared. +#' +#' For example, there are 4 subgroups in the phenotype, G1, G2, G3 and G4. One DE analysis was performed on G1 vs. G2, and another DE was performed on G1 vs. G3. +#' If user is interested in the DE analysis between G1 vs. (G2 and G3), he can call this function to combine the two comparison results above toghether. +#' The combined P values will be taken care by \code{combinePvalVector}. +#' +#' @param DE_list list, each element in the list is one DE/DA comparison need to be combined. +#' @param DE_name a vector of characters, the DE/DA comparison names. +#' If not NULL, it must match the names of DE_list in correct order. +#' If NULL, names of the DE_list will be used. +#' Default is NULL. +#' @param transfer_tab data.frame, the ID conversion table. Users can call \code{get_IDtransfer} to get this table. +#' The purpose is to correctly mapping ID for \code{DE_list}. The column names must match \code{DE_name}. +#' If NULL, ID column of each DE comparison will be considered as the same type. +#' Default is NULL. +#' @param main_id character, a name of the element in \code{DE_list}. The ID column of that comparison will be used as the ID of the final combination. +#' If NULL, the first element name from \code{DE_list} will be used. Default is NULL. +#' @param method character, users can choose between "Stouffer" and "Fisher". Default is "Stouffer". +#' @param twosided logical, if TRUE, a two-tailed test will be performed. +#' If FALSE, a one-tailed test will be performed, and P value falls within the range of 0 to 0.5. Default is TRUE. +#' @param signed logical, if TRUE, give a sign to the P value, which indicating the direction of testing. +#' Default is TRUE. +#' @return Return a list contains the combined DE/DA analysis. Each single comparison result before combination is wrapped inside +#' (may have with some IDs filtered out, due to the combination). A data frame named "combine" inside the list is the combined analysis. +#' Rows are genes/drivers, columns are combined statistics (e.g. "logFC", "AveExpr", "t", "P.Value" etc.). +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' phe_info <- Biobase::pData(analysis.par$cal.eset) +#' each_subtype <- 'G4' +#' G0 <- rownames(phe_info)[which(phe_info$`subgroup`!=each_subtype)] # get sample list for G0 +#' G1 <- rownames(phe_info)[which(phe_info$`subgroup`==each_subtype)] # get sample list for G1 +#' DE_gene_limma <- getDE.limma.2G(eset=analysis.par$cal.eset, +#' G1=G1,G0=G0, +#' G1_name=each_subtype, +#' G0_name='other') +#' DA_driver_limma <- getDE.limma.2G(eset=analysis.par$merge.ac.eset, +#' G1=G1,G0=G0, +#' G1_name=each_subtype, +#' G0_name='other') +#' DE_list <- list(DE=DE_gene_limma,DA=DA_driver_limma) +#' g1 <- gsub('(.*)_.*','\\1',DE_list$DA$ID) +#' transfer_tab <- data.frame(DE=g1,DA=DE_list$DA$ID,stringsAsFactors = FALSE) +#' res1 <- combineDE(DE_list,transfer_tab=transfer_tab,main_id='DA') +#' +#' \dontrun{ +#' each_subtype <- 'G4' +#' G0 <- rownames(phe_info)[which(phe_info$`subgroup`!=each_subtype)] # get sample list for G0 +#' G1 <- rownames(phe_info)[which(phe_info$`subgroup`==each_subtype)] # get sample list for G1 +#' DE_gene_limma_G4 <- getDE.limma.2G(eset=analysis.par$cal.eset, +#' G1=G1,G0=G0, +#' G1_name=each_subtype, +#' G0_name='other') +#' each_subtype <- 'SHH' +#' G0 <- rownames(phe_info)[which(phe_info$`subgroup`!=each_subtype)] # get sample list for G0 +#' G1 <- rownames(phe_info)[which(phe_info$`subgroup`==each_subtype)] # get sample list for G1 +#' DE_gene_limma_SHH <- getDE.limma.2G(eset=analysis.par$cal.eset, +#' G1=G1,G0=G0, +#' G1_name=each_subtype, +#' G0_name='other') +#' DE_list <- list(G4=DE_gene_limma_G4,SHH=DE_gene_limma_SHH) +#' res2 <- combineDE(DE_list,transfer_tab=NULL) +#' } +#' @export +combineDE<-function(DE_list,DE_name=NULL,transfer_tab=NULL,main_id=NULL,method='Stouffer',twosided=TRUE,signed=TRUE){ + # + all_input_para <- c('DE_list','method','twosided','signed') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('twosided',c(TRUE,FALSE),envir=environment()), + check_option('signed',c(TRUE,FALSE),envir=environment()), + check_option('method',c('Stouffer','Fisher'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + nDE<-base::length(DE_list) + if(nDE<2) stop('At least two DE outputs are required for combineDE analysis!\n') + if(is.null(DE_name)==TRUE){ + DE_name <- names(DE_list) + } + if(is.null(transfer_tab)==TRUE){ + transfer_tab <- lapply(DE_list,function(x)x$ID) + names(transfer_tab) <- DE_name + w1 <- names(which(base::table(unlist(transfer_tab))==nDE)) + if(base::length(w1)==0){message('No intersected IDs found between DE list, please check and re-try!');return(FALSE);} + transfer_tab <- do.call(base::cbind,lapply(transfer_tab,function(x)w1)) + transfer_tab <- as.data.frame(transfer_tab,stringsAsFactors=FALSE) + } + w1 <- lapply(DE_name,function(x){ + which(transfer_tab[[x]] %in% DE_list[[x]]$ID) + }) + w11 <- which(base::table(unlist(w1))==nDE) + if(base::length(w11)==0){ + message('No intersected IDs found between DE list, please check and re-try!');return(FALSE); + } + w2 <- as.numeric(names(w11)) + transfer_tab <- transfer_tab[w2,] + combine_info <- lapply(DE_name,function(x1){ + DE_list[[x1]][transfer_tab[[x1]],] + }) + names(combine_info) <- DE_name + dd <- do.call(base::cbind,lapply(combine_info,function(x1){ + x1$P.Value*sign(x1$logFC) + })) + res1 <- t(apply(dd,1,function(x){ + combinePvalVector(x,method=method,signed=signed,twosided=twosided) + })) + res1 <- as.data.frame(res1) + if(is.null(main_id)==TRUE) main_id <- DE_name[1] + res1$adj.P.Val <- p.adjust(res1$P.Value,'fdr') + res1$logFC <- base::rowMeans(do.call(base::cbind,lapply(combine_info,function(x)x$logFC))) + res1$AveExpr <- base::rowMeans(do.call(base::cbind,lapply(combine_info,function(x)x$AveExpr))) + res1 <- base::cbind(ID=transfer_tab[,main_id],transfer_tab,res1,stringsAsFactors=FALSE) + rownames(res1) <- res1$ID + combine_info$combine <- res1 + return(combine_info) +} + + +# inner function: class_label can be obtained by get_class +get_class2design <- function(class_label){ + design <- model.matrix(~0+class_label);colnames(design) <- base::unique(class_label); + rownames(design) <- names(class_label) + return(design) + #design.mat <-as.data.frame(matrix(0, nrow = base::length(class_label), ncol = base::length(base::unique(class_label)))) + #rownames(design.mat) <- names(class_label) ## sample + #colnames(design.mat) <- base::unique(class_label) + #for(i in 1:base::length(class_label)){design.mat[names(class_label)[i],class_label[i]]<-1;} + #return(design.mat) +} + +#' Differential Expression Analysis and Differential Activity Analysis Between 2 Sample Groups Using Limma +#' +#' \code{getDE.limma.2G} is a function performs differential gene expression analysis and differential driver activity analysis +#' between control group (parameter G0) and experimental group (parameter G1), using limma related functions. +#' +#' @param eset ExpressionSet class object, contains gene expression data or driver activity data. +#' @param G1 a vector of characters, the sample names of experimental group. +#' @param G0 a vecotr of characters, the sample names of control group. +#' @param G1_name character, the name of experimental group (e.g. "Male"). Default is "G1". +#' @param G0_name character, the name of control group (e.g. "Female"). Default is "G0". +#' @param verbose logical, if TRUE, sample names of both groups will be printed. Default is TRUE. +#' @param random_effect a vector of characters, vector or factor specifying a blocking variable. +#' Default is NULL, no random effect will be considered. +#' +#' @return +#' Return a data frame. Rows are genes/drivers, columns are "ID", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B", "Z-statistics", "Ave.G1" and "Ave.G0". +#' Names of the columns may vary from different group names. Sorted by P-values. +#' +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' phe_info <- Biobase::pData(analysis.par$cal.eset) +#' each_subtype <- 'G4' +#' G0 <- rownames(phe_info)[which(phe_info$`subgroup`!=each_subtype)] # get sample list for G0 +#' G1 <- rownames(phe_info)[which(phe_info$`subgroup`==each_subtype)] # get sample list for G1 +#' DE_gene_limma <- getDE.limma.2G(eset=analysis.par$cal.eset, +#' G1=G1,G0=G0, +#' G1_name=each_subtype, +#' G0_name='other') +#' DA_driver_limma <- getDE.limma.2G(eset=analysis.par$merge.ac.eset, +#' G1=G1,G0=G0, +#' G1_name=each_subtype, +#' G0_name='other') +#' @export +getDE.limma.2G <- function(eset=NULL, G1=NULL, G0=NULL,G1_name=NULL,G0_name=NULL,verbose=TRUE,random_effect=NULL) { + # + all_input_para <- c('eset','G1','G0','verbose') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('verbose',c(TRUE,FALSE),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + exp_mat <- Biobase::exprs(eset) + G1 <- base::intersect(G1,colnames(exp_mat)) + G0 <- base::intersect(G0,colnames(exp_mat)) + if(verbose==TRUE){ + print(sprintf('G1:%s', base::paste(G1, collapse = ';'))) + print(sprintf('G0:%s', base::paste(G0, collapse = ';'))) + } + if(base::length(G1)==0 | base::length(G0)==0){ + message('Too few samples, please check the sample name of G1, G0 and samples in eset !');return(FALSE); + } + # + all_samples <- colnames(Biobase::exprs(eset)) + use_samples <- c(G0, G1) + phe <- as.data.frame(Biobase::pData(eset)[use_samples, ,drop=FALSE]); + rownames(phe) <- use_samples + new_eset <- generate.eset(exp_mat=Biobase::exprs(eset)[, use_samples,drop=F],phenotype_info=phe, feature_info=Biobase::fData(eset)) + new_mat <- Biobase::exprs(new_eset) + ## + design.mat <-as.data.frame(matrix(NA, nrow = base::length(use_samples), ncol = 1)) + rownames(design.mat) <-use_samples + colnames(design.mat) <- 'group' + design.mat[base::intersect(G0, use_samples), 'group'] <- 'G0' + design.mat[base::intersect(G1, use_samples), 'group'] <- 'G1' + # design <- model.matrix( ~ group + 0, design.mat) + group <- factor(design.mat$group) + design <- model.matrix(~0+group); + colnames(design) <- levels(group); rownames(design) <- colnames(new_mat) + + if(is.null(random_effect)==TRUE){ + fit <- limma::lmFit(new_mat,design) + }else{ + random_effect <- random_effect[colnames(Biobase::exprs(new_eset))] + corfit <- limma::duplicateCorrelation(new_eset,design,block=random_effect) + fit <- limma::lmFit(new_mat,design,block=random_effect,correlation=corfit$consensus) + } + contrasts <- limma::makeContrasts(G1-G0,levels=design) + fit2 <- limma::contrasts.fit(fit,contrasts=contrasts) + fit2 <- limma::eBayes(fit2,trend=TRUE) + #summary(decideTests(fit2, method="global")) + ## + tT <- limma::topTable(fit2, adjust.method = "fdr", number = Inf,coef=1) + if(nrow(tT)==1){ + rownames(tT) <- rownames(new_mat) + } + tT <- base::cbind(ID=rownames(tT),tT,stringsAsFactors=FALSE) + tT <- tT[rownames(new_mat),,drop=FALSE] + exp_G1 <- base::rowMeans(new_mat[,G1,drop=FALSE]); + exp_G0 <- base::rowMeans(new_mat[,G0,drop=FALSE]); + w1 <- which(tT$P.Value<=0); + if(base::length(w1)>0) tT$P.Value[w1] <- .Machine$double.xmin; + z_val <- sapply(tT$P.Value*sign(tT$logFC),function(x)combinePvalVector(x,twosided = TRUE)[1]) + if(is.null(random_effect)==TRUE){ + tT <- base::cbind(tT,'Z-statistics'=z_val,'Ave.G0'=exp_G0,'Ave.G1'=exp_G1) + }else{ + tT <- base::cbind(tT,'Z-statistics'=z_val,'Ave.G0'=exp_G0,'Ave.G1'=exp_G1, + 'Ave.G0_RemoveRandomEffect'=fit@.Data[[1]][rownames(tT),'G0'], + 'Ave.G1_RemoveRandomEffect'=fit@.Data[[1]][rownames(tT),'G1']) + } + if(is.null(G0_name)==FALSE) colnames(tT) <- gsub('Ave.G0',paste0('Ave.',G0_name),colnames(tT)) + if(is.null(G1_name)==FALSE) colnames(tT) <- gsub('Ave.G1',paste0('Ave.',G1_name),colnames(tT)) + tT <- tT[order(tT$P.Value, decreasing = FALSE), ] + return(tT) +} + +#' Combine P Values Using Fisher's Method or Stouffer's Method +#' +#' \code{combinePvalVector} is a function to combine multiple comparison's P values using Fisher's method or Stouffer's method. +#' +#' @param pvals a vector of numerics, the P values from multiple comparison need to be combined. +#' @param method character, users can choose between "Stouffer" and "Fisher". Default is "Stouffer". +#' @param signed logical, if TRUE, will give a sign to the P value to indicate the direction of testing. +#' Default is TRUE. +#' @param twosided logical, if TRUE, P value is calculated in a one-tailed test. +#' If FALSE, P value is calculated in a two-tailed test, and it falls within the range 0 to 0.5. +#' Default is TRUE. +#' @return Return a vector contains the "Z-statistics" and "P.Value". +#' @examples +#' combinePvalVector(c(0.1,1e-3,1e-5)) +#' combinePvalVector(c(0.1,1e-3,-1e-5)) +#' @export +combinePvalVector <- + function(pvals, + method = 'Stouffer', + signed = TRUE, + twosided = TRUE) { + # + all_input_para <- c('pvals','method','signed','twosided') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('signed',c(TRUE,FALSE),envir=environment()), + check_option('twosided',c(TRUE,FALSE),envir=environment()), + check_option('method',c('Stouffer','Fisher'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + #remove NA pvalues + pvals <- pvals[!is.na(pvals) & !is.null(pvals)] + pvals[which(abs(pvals)<=0)] <- .Machine$double.xmin + if (sum(is.na(pvals)) >= 1) { + stat <- NA + pval <- NA + } else{ + if (twosided & (sum(pvals > 1 | pvals < -1) >= 1)) + stop('pvalues must between 0 and 1!\n') + if (!twosided & (sum(pvals > 0.5 | pvals < -0.5) >= 1)) + stop('One-sided pvalues must between 0 and 0.5!\n') + + if (!signed) { + pvals <- abs(pvals) + } + + signs <- sign(pvals) + signs[signs == 0] <- 1 + + if (grepl('Fisher', method, ignore.case = TRUE)) { + if (twosided & signed) { + neg.pvals <- pos.pvals <- abs(pvals) / 2 + pos.pvals[signs < 0] <- 1 - pos.pvals[signs < 0] + neg.pvals[signs > 0] <- 1 - neg.pvals[signs > 0] + } else{ + neg.pvals <- pos.pvals <- abs(pvals) + } + pvals <- + c(1, -1) * c( + pchisq( + -2 * sum(log(as.numeric(pos.pvals))), + df = 2 * base::length(pvals), + lower.tail = FALSE + ) / 2, + pchisq( + -2 * sum(log(as.numeric(neg.pvals))), + df = 2 * base::length(pvals), + lower.tail = FALSE + ) / 2 + ) + pval <- base::min(abs(pvals))[1] + #if two pvals are equal, pick up the first one + stat <- + sign(pvals[abs(pvals) == pval])[1] * qnorm(pval, lower.tail = F)[1] + pval <- 2 * pval + } + else if (grepl('Stou', method, ignore.case = TRUE)) { + if (twosided) { + zs <- signs * qnorm(abs(pvals) / 2, lower.tail = FALSE) + stat <- sum(zs) / sqrt(base::length(zs)) + pval <- 2 * pnorm(abs(stat), lower.tail = FALSE) + } + else{ + zs <- signs * qnorm(abs(pvals), lower.tail = FALSE) + stat <- sum(zs) / sqrt(base::length(zs)) + pval <- pnorm(abs(stat), lower.tail = FALSE) + } + } + else{ + stop('Only \"Fisher\" or \"Stouffer\" method is supported!!!\n') + } + } + return(c(`Z-statistics` = stat, `P.Value` = pval)) + } + + +#' Merge Activity Values from TF (transcription factors) ExpressionSet Object and Sig (signaling factors) ExpressionSet Object +#' +#' \code{merge_TF_SIG.AC} combines the activity value from TF (transcription factors) and Sig (signaling factors) ExpressionSet objects together, +#' and adds "_TF" or "_SIG" suffix to drivers for easier distinction. +#' +#' +#' @param TF_AC ExpressionSet object, containing the activity values for all TFs. +#' @param SIG_AC ExpressionSet object, containing the activity values for all SIGs. +#' +#' @return Return an ExpressionSet object. +#' @examples +#' if(exists('analysis.par')==TRUE) rm(analysis.par) +#' network.dir <- sprintf('%s/demo1/network/',system.file(package = "NetBID2")) # use demo +#' network.project.name <- 'project_2019-02-14' # demo project name +#' project_main_dir <- 'test/' +#' project_name <- 'test_driver' +#' analysis.par <- NetBID.analysis.dir.create(project_main_dir=project_main_dir, +#' project_name=project_name, +#' network_dir=network.dir, +#' network_project_name=network.project.name) +#' analysis.par$tf.network <- get.SJAracne.network(network_file=analysis.par$tf.network.file) +#' analysis.par$sig.network <- get.SJAracne.network(network_file=analysis.par$sig.network.file) +#' ## get eset (here for demo, use network.par$net.eset) +#' network.par <- list() +#' network.par$out.dir.DATA <- system.file('demo1','network/DATA/',package = "NetBID2") +#' NetBID.loadRData(network.par=network.par,step='exp-QC') +#' analysis.par$cal.eset <- network.par$net.eset +#' ac_mat_TF <- cal.Activity(target_list=analysis.par$tf.network$target_list, +#' cal_mat=Biobase::exprs(analysis.par$cal.eset), +#' es.method='weightedmean') +#' ac_mat_SIG <- cal.Activity(target_list=analysis.par$tf.network$target_list, +#' cal_mat=Biobase::exprs(analysis.par$cal.eset), +#' es.method='weightedmean') +#' analysis.par$ac.tf.eset <- generate.eset(exp_mat=ac_mat_TF, +#' phenotype_info=Biobase::pData(analysis.par$cal.eset)) +#' analysis.par$ac.sig.eset <- generate.eset(exp_mat=ac_mat_SIG, +#' phenotype_info=Biobase::pData(analysis.par$cal.eset)) +#' analysis.par$merge.ac.eset <- merge_TF_SIG.AC(TF_AC=analysis.par$ac.tf.eset, +#' SIG_AC=analysis.par$ac.sig.eset) +#' @export +merge_TF_SIG.AC <- function(TF_AC=NULL,SIG_AC=NULL){ + # + all_input_para <- c('TF_AC','SIG_AC') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + mat_TF <- Biobase::exprs(TF_AC) + mat_SIG <- Biobase::exprs(SIG_AC) + funcType <- c(rep('TF',nrow(mat_TF)),rep('SIG',nrow(mat_SIG))) + rn <- c(rownames(mat_TF),rownames(mat_SIG)) + rn_label <- base::paste(rn,funcType,sep='_') + mat_combine <- base::rbind(mat_TF,mat_SIG[,colnames(mat_TF)]) + rownames(mat_combine) <- rn_label + eset_combine <- generate.eset(exp_mat=mat_combine,phenotype_info=Biobase::pData(TF_AC)[colnames(mat_combine),], + feature_info=NULL,annotation_info='activity in dataset') + return(eset_combine) +} + +#' Merge TF (transcription factor) Network and Sig (signaling factor) Network +#' +#' \code{merge_TF_SIG.network} takes TF network and Sig network and combine them together. +#' The merged list object contains three elements, a data.frame contains all the combined network information \code{network_dat}, +#' a driver-to-target list object \code{target_list}, and an igraph object of the network \code{igraph_obj}. +#' +#' @param TF_network list, the TF network created by \code{get.SJAracne.network} function. +#' @param SIG_network list, the SIG network created by \code{get.SJAracne.network} function. +#' @return +#' Return the a list containing three elements, \code{network_dat}, \code{target_list} and \code{igraph_obj}. +#' @examples +#' if(exists('analysis.par')==TRUE) rm(analysis.par) +#' network.dir <- sprintf('%s/demo1/network/',system.file(package = "NetBID2")) # use demo +#' network.project.name <- 'project_2019-02-14' # demo project name +#' project_main_dir <- 'test/' +#' project_name <- 'test_driver' +#' analysis.par <- NetBID.analysis.dir.create(project_main_dir=project_main_dir, +#' project_name=project_name, +#' network_dir=network.dir, +#' network_project_name=network.project.name) +#' analysis.par$tf.network <- get.SJAracne.network(network_file=analysis.par$tf.network.file) +#' analysis.par$sig.network <- get.SJAracne.network(network_file=analysis.par$sig.network.file) +#' analysis.par$merge.network <- merge_TF_SIG.network(TF_network=analysis.par$tf.network, +#' SIG_network=analysis.par$sig.network) +#' @export +merge_TF_SIG.network <- function(TF_network=NULL,SIG_network=NULL){ + # + all_input_para <- c('TF_network','SIG_network') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + s_TF <- names(TF_network$target_list) + s_SIG <- names(SIG_network$target_list) + funcType <- c(rep('TF',base::length(s_TF)),rep('SIG',base::length(s_SIG))) + rn <- c(s_TF,s_SIG) + rn_label <- base::paste(rn,funcType,sep='_') + target_list_combine <- c(TF_network$target_list,SIG_network$target_list) + names(target_list_combine) <- rn_label + n_TF <- TF_network$network_dat + if(nrow(n_TF)>0) n_TF$source <- base::paste(n_TF$source,'TF',sep='_') + n_SIG <- SIG_network$network_dat + if(nrow(n_SIG)>0) n_SIG$source <- base::paste(n_SIG$source,'SIG',sep='_') + net_dat <- base::rbind(n_TF,n_SIG) + igraph_obj <- graph_from_data_frame(net_dat[,c('source','target')],directed=TRUE) + if('MI' %in% colnames(net_dat)) igraph_obj <- set_edge_attr(igraph_obj,'weight',index=E(igraph_obj),value=net_dat[,'MI']) + if('spearman' %in% colnames(net_dat)) igraph_obj <- set_edge_attr(igraph_obj,'sign',index=E(igraph_obj),value=sign(net_dat[,'spearman'])) + return(list(network_dat=net_dat,target_list=target_list_combine,igraph_obj=igraph_obj)) +} + +#' Generate the Master Table for Drivers +#' +#' \code{generate.masterTable} generates a master table to show the mega information of all tested drivers. +#' +#' The master table gathers TF (transcription factor) information, Sig (signaling factor) information, all the DE (differential expression analysis) +#' and DA (differential activity analysis) from multiple comparisons. It also shows each driver's target gene size and other additional information +#' (e.g. gene biotype, chromosome name, position etc.). +#' +#' @param use_comp a vector of characters, the name of multiple comparisons. It will be used to name the columns of master table. +#' @param DE list, a list of DE comparisons, each comparison is a data.frame. The element name in the list must contain the name in \code{use_comp}. +#' @param DA list, a list of DA comparisons, each comparison is a data.frame. The element name in the list must contain the name in \code{use_comp}. +#' @param target_list list, a driver-to-target list. The names of the list elements are drivers. Each element is a data frame, usually contains three columns. +#' "target", target gene names; "MI", mutual information; "spearman", spearman correlation coefficient. +#' It is highly suggested to follow the NetBID2 pipeline, and the \code{TF_network} could be generated by \code{get_net2target_list} and \code{get.SJAracne.network}. +#' @param main_id_type character, the type of driver's ID. It comes from the attribute name in biomaRt package. +#' Such as "ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id", "ensembl_transcript_id_version" or "refseq_mrna". +#' For details, user can call \code{biomaRt::listAttributes()} to display all available attributes in the selected dataset. +#' @param transfer_tab data.frame, the data frame for ID conversion. This can be obtained by calling \code{get_IDtransfer}. +#' If NULL and \code{main_id_type} is not in the column names of \code{tf_sigs}, it will use the conversion table within the function. +#' Default is NULL. +#' @param tf_sigs list, contains all the detailed information of TF and Sig. Users can call \code{db.preload} for access. +#' @param z_col character, name of the column in \code{DE} and \code{DA} contains the Z statistics. Default is "Z-statistics". +#' @param display_col character, name of the column in \code{DE} and \code{DA} need to be kept in the master table. Default is c("logFC","P.Value"). +#' @param column_order_stratey character, users can choose between "type" and "comp". Default is "type". +#' If set as type, the columns will be ordered by column type; If set as comp, the columns will be ordered by comparison. +#' @return Return a data frame contains the mega information of all tested drivers. +#' The column "originalID" and "originalID_label" is the same ID as from the original dataset. +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' #analysis.par$final_ms_tab ## this is master table generated before +#' ac_mat <- cal.Activity(target_list=analysis.par$merge.network$target_list, +#' cal_mat=Biobase::exprs(analysis.par$cal.eset),es.method='weightedmean') +#' analysis.par$ac.merge.eset <- generate.eset(exp_mat=ac_mat, +#' phenotype_info=Biobase::pData(analysis.par$cal.eset)) +#' phe_info <- Biobase::pData(analysis.par$cal.eset) +#' all_subgroup <- base::unique(phe_info$subgroup) ## +#' for(each_subtype in all_subgroup){ +#' comp_name <- sprintf('%s.Vs.others',each_subtype) ## each comparison must give a name !!! +#' G0 <- rownames(phe_info)[which(phe_info$`subgroup`!=each_subtype)] # get sample list for G0 +#' G1 <- rownames(phe_info)[which(phe_info$`subgroup`==each_subtype)] # get sample list for G1 +#' DE_gene_limma <- getDE.limma.2G(eset=analysis.par$cal.eset,G1=G1,G0=G0, +#' G1_name=each_subtype,G0_name='other') +#' analysis.par$DE[[comp_name]] <- DE_gene_limma +#' DA_driver_limma <- getDE.limma.2G(eset=analysis.par$ac.merge.eset,G1=G1,G0=G0, +#' G1_name=each_subtype,G0_name='other') +#' analysis.par$DA[[comp_name]] <- DA_driver_limma +#' } +#' all_comp <- names(analysis.par$DE) ## get all comparison name for output +#' db.preload(use_level='gene',use_spe='human',update=FALSE); +#' test_ms_tab <- generate.masterTable(use_comp=all_comp, +#' DE=analysis.par$DE, +#' DA=analysis.par$DA, +#' target_list=analysis.par$merge.network$target_list, +#' tf_sigs=tf_sigs, +#' z_col='Z-statistics', +#' display_col=c('logFC','P.Value'), +#' main_id_type='external_gene_name') +#' @export +generate.masterTable <- function(use_comp=NULL,DE=NULL,DA=NULL, + target_list=NULL,main_id_type=NULL,transfer_tab=NULL, + tf_sigs=tf_sigs, + z_col='Z-statistics',display_col=c('logFC','P.Value'),column_order_stratey='type'){ + # + all_input_para <- c('use_comp','DE','DA','target_list','main_id_type','tf_sigs','z_col','display_col','column_order_stratey') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('column_order_stratey',c('type','comp'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(base::length(base::setdiff(use_comp,names(DE)))>0){message(sprintf('%s not included in DE, please check and re-try!',base::setdiff(use_comp,names(DE))));return(FALSE)} + if(base::length(base::setdiff(use_comp,names(DA)))>0){message(sprintf('%s not included in DA, please check and re-try!',base::setdiff(use_comp,names(DA))));return(FALSE)} + # get original ID + ori_rn <- rownames(DA[[1]]) ## with label + w1 <- grep('(.*)_TF',ori_rn); w2 <- grep('(.*)_SIG',ori_rn) + funcType <- rep(NA,length.out=base::length(ori_rn));rn <- funcType + funcType[w1] <- 'TF'; funcType[w2] <- 'SIG'; + rn[w1] <- gsub('(.*)_TF',"\\1",ori_rn[w1]);rn[w2] <- gsub('(.*)_SIG',"\\1",ori_rn[w2]); + rn_label <- ori_rn + use_size <- unlist(lapply(target_list[rownames(DA[[1]])],nrow)) + # id issue + current_id <- names(tf_sigs$tf)[-1] + use_info <- base::unique(base::rbind(tf_sigs$tf$info,tf_sigs$sig$info)) + if(main_id_type %in% current_id){ + use_info <- use_info[which(use_info[,main_id_type] %in% rn),] + }else{ + if(is.null(transfer_tab)==TRUE){ + transfer_tab <- get_IDtransfer(from_type=main_id_type,to_type=current_id[1],use_genes=rn,ignore_version = TRUE) + use_info <- base::merge(use_info,transfer_tab,by.x=current_id[1],by.y=current_id[1]) + }else{ + transfer_tab <- transfer_tab[which(transfer_tab[,main_id_type] %in% rn),] + uid <- base::intersect(colnames(transfer_tab),colnames(use_info)) + if(base::length(uid)==0){message('No ID type in the transfer_tab could match ID type in tf_sigs, please check and re-try!');return(FALSE);} + uid <- uid[1] + use_info <- base::merge(use_info,transfer_tab,by.x=uid,by.y=uid) + } + use_info <- use_info[which(use_info[,main_id_type] %in% rn),] + } + use_info <- base::unique(use_info) + if(nrow(use_info)==0){message('ID issue error, please check main_id_type setting!');return(FALSE);} + # merge info + tmp1 <- stats::aggregate(use_info,list(use_info[,main_id_type]),function(x){ + x1 <- x[which(x!="")] + x1 <- x1[which(is.na(x1)==FALSE)] + base::paste(sort(base::unique(x1)),collapse=';') + }) + tmp1 <- tmp1[,-1]; rownames(tmp1) <- tmp1[,main_id_type] + geneSymbol <- tmp1[rn,'external_gene_name'] ## this column for function enrichment + if('external_transcript_name' %in% colnames(tmp1)){ ## this column for display + gene_label <- base::paste(tmp1[rn,'external_transcript_name'],funcType,sep = '_') + }else{ + gene_label <-base::paste(tmp1[rn,'external_gene_name'],funcType,sep = '_') + } + # + #label_info <- data.frame('gene_label'=gene_label,'geneSymbol'=geneSymbol, + # 'originalID'=rn,'originalID_label'=rn_label,'funcType'=funcType,'Size'=use_size,stringsAsFactors=FALSE) + label_info <- data.frame('originalID_label'=rn_label,'originalID'=rn,'gene_label'=gene_label,'geneSymbol'=geneSymbol, + 'funcType'=funcType,'Size'=use_size,stringsAsFactors=FALSE) + w1 <- which(is.na(geneSymbol)==TRUE) + label_info[w1,'geneSymbol'] <- label_info[w1,'originalID'] + label_info[w1,'gene_label'] <- label_info[w1,'originalID_label'] + add_info <- tmp1[rn,] + # + combine_info <- lapply(use_comp,function(x){ + DA[[x]] <- DA[[x]][rn_label,] + DE[[x]] <- as.data.frame(DE[[x]])[rn,] + avg_col <- colnames(DA[[x]])[grep('^Ave',colnames(DA[[x]]))] + uc <- c(z_col,avg_col,base::setdiff(display_col,c(z_col,avg_col))); uc <- base::intersect(uc,colnames(DA[[x]])) + DA_info <- DA[[x]][rn_label,uc] + avg_col <- colnames(DE[[x]])[grep('^Ave',colnames(DE[[x]]))] + uc <- c(z_col,avg_col,base::setdiff(display_col,c(z_col,avg_col))); uc <- base::intersect(uc,colnames(DE[[x]])) + DE_info <- as.data.frame(DE[[x]])[rn,uc] + colnames(DA_info) <- paste0(colnames(DA_info),'.',x,'_DA') + colnames(DE_info) <- paste0(colnames(DE_info),'.',x,'_DE') + colnames(DA_info)[1] <- paste0('Z.',x,'_DA') + colnames(DE_info)[1] <- paste0('Z.',x,'_DE') + out <- base::cbind(DA_info,DE_info,stringsAsFactors=FALSE) + rownames(out) <- rn_label + out + }) + combine_info_DA <- do.call(base::cbind,lapply(combine_info,function(x)x[rn_label,grep('_DA$',colnames(x))])) + combine_info_DE <- do.call(base::cbind,lapply(combine_info,function(x)x[rn_label,grep('_DE$',colnames(x))])) + # re-organize the columns for combine info + if(column_order_stratey=='type' & length(use_comp)>1){ + col_ord <- c('Z','AveExpr',display_col) + tmp1 <- lapply(col_ord,function(x){ + combine_info_DA[,grep(sprintf('^%s\\.',x),colnames(combine_info_DA))] + }) + combine_info_DA <- do.call(base::cbind,tmp1) + tmp1 <- lapply(col_ord,function(x){ + combine_info_DE[,grep(sprintf('^%s\\.',x),colnames(combine_info_DA))] + }) + combine_info_DE <- do.call(base::cbind,tmp1) + } + # put them together + ms_tab <- base::cbind(label_info,combine_info_DA,combine_info_DE,add_info) + rownames(ms_tab) <- ms_tab$originalID_label + return(ms_tab) +} + +#' Save the Master Table into Excel File +#' +#' \code{out2excel} is a function can save data frame as Excel File. This is mainly for the output of master table generated by \code{generate.masterTable}. +#' +#' @param all_ms_tab list or data.frame, if data.frame, it is generated by \code{generate.masterTable}. +#' If list, each list element is data.frame/master table. +#' The name of the list element will be the sheet name in the excel file. +#' @param out.xlsx character, path and file name of the output Excel file. +#' @param mark_gene list, list of marker genes. The name of the list element is the marked group name. Each element is a vector of marker genes. +#' This is optional, just to add additional information to the file. +#' @param mark_col character, the color to mark the marker genes. If NULL, will use \code{get.class.color} to get the colors. +#' @param mark_strategy character, users can choose between "color" and "add_column". +#' "Color" means the mark_gene will be marked by filling its background color; +#' "add_column" means the mark_gene will be displayed in a separate column with TRUE/FALSE, indicating whether the gene belongs to a mark group or not. +#' @param workbook_name character, name of the workbook for the output Excel. Default is "ms_tab". +#' @param only_z_sheet logical, if TRUE, will create a separate sheet only contains Z-statistics related columns from DA/DE analysis. +#' Default is FALSE. +#' @param z_column character, name of the columns contain Z-statistics. If NULL, find column names start with "Z.". +#' Default is NULL. +#' @param sig_thre numeric, threshold for the Z-statistics. Z values passed the threshold will be colored. The color scale is defined by \code{z2col}. +#' Default is 1.64. +#' @return Return a logical value. If TRUE, the Excel file has been generated successfully. +#' @examples +#' \dontrun{ +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab ## this is master table generated before +#' mark_gene <- list(WNT=c('WIF1','TNC','GAD1','DKK2','EMX2'), +#' SHH=c('PDLIM3','EYA1','HHIP','ATOH1','SFRP1'), +#' Group3=c('IMPG2','GABRA5','EGFL11','NRL','MAB21L2','NPR3','MYC'), +#' Group4=c('KCNA1','EOMES','KHDRBS2','RBM24','UNC5D')) +#' mark_col <- get.class.color(names(mark_gene), +#' pre_define=c('WNT'='blue','SHH'='red', +#' 'Group3'='yellow','Group4'='green')) +#' outfile <- 'test_out.xlsx' +#' out2excel(ms_tab,out.xlsx = outfile,mark_gene,mark_col) +#' } +#' @export +out2excel <- function(all_ms_tab,out.xlsx, + mark_gene=NULL, + mark_col=NULL, + mark_strategy='color', + workbook_name='ms_tab', + only_z_sheet=FALSE, + z_column=NULL,sig_thre=1.64){ + # + all_input_para <- c('all_ms_tab','out.xlsx','mark_strategy','workbook_name','only_z_sheet','sig_thre') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('only_z_sheet',c(TRUE,FALSE),envir=environment()), + check_option('mark_strategy',c('color','add_column'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + wb <- openxlsx::createWorkbook(workbook_name) + if(!'list' %in% class(all_ms_tab)){ + all_ms_tab <- list('Sheet1'=as.data.frame(all_ms_tab)) + } + if(only_z_sheet==TRUE){ + nn <- names(all_ms_tab) + all_ms_tab <- lapply(all_ms_tab,function(x){ + w1 <- grep('.*_[DE|DA]',colnames(x)) + w2 <- base::setdiff(colnames(x)[w1],colnames(x)[w1][grep('^Z.*',colnames(x)[w1])]) + w3 <- base::setdiff(colnames(x),w2) + list(x[,w3],x) + }) + all_ms_tab <- unlist(all_ms_tab,recursive = FALSE) + nn1 <- lapply(nn,function(x){ + if(x!='Sheet1'){ + c(sprintf('Only_Z_%s',x),sprintf('Full_info_%s',x)) + }else{ + c('Only_Z','Full_info') + } + }) + nn1 <- unlist(nn1) + names(all_ms_tab) <- nn1 + } + if(is.null(mark_gene)==FALSE){ + if(!'list' %in% class(mark_gene)){ + mark_gene <- list('mark_gene'=mark_gene) + } + if(is.null(names(mark_gene))==TRUE){ + message('Must give name to the mark_gene list');return(FALSE) + } + if(mark_strategy=='color' & is.null(mark_col)==TRUE){ + mark_col <- get.class.color(names(mark_gene)) + } + if(mark_strategy=='add_column'){ + new_col_name <- paste0('is',names(mark_gene)) + all_ms_tab <- lapply(all_ms_tab,function(x){ + g1 <- x$geneSymbol + r1 <- do.call(base::cbind,lapply(mark_gene,function(x1){ + ifelse(g1 %in% x1,'TRUE','FALSE') + })) + new_x <- base::cbind(x,r1) + colnames(new_x) <- c(colnames(x),new_col_name) + new_x + }) + } + } + z_column_index <- 'defined' + if(is.null(z_column)==TRUE) z_column_index <- 'auto' + i <- 0 + headerStyle <- openxlsx::createStyle(fontSize = 14, fontColour = "#FFFFFF", halign = "center",fgFill = "#4F81BD", + border="TopBottom", borderColour = "#4F81BD",wrapText=TRUE) ## style for header line + for(sheetname in names(all_ms_tab)){ ## list, each item create one sheet + i <- i +1 + d <- as.data.frame(all_ms_tab[[sheetname]]) + if(z_column_index=='auto') use_z_column <- colnames(d)[grep('^Z\\.',colnames(d),ignore.case = TRUE)] else use_z_column <- base::intersect(colnames(d),z_column) + openxlsx::addWorksheet(wb,sheetName=sheetname) + openxlsx::writeData(wb,sheet = i,d) + openxlsx::addStyle(wb, sheet = i, headerStyle, rows = 1, cols = 1:ncol(d), gridExpand = TRUE) ## add header style + all_c <- list() + for(z_col in use_z_column){ ## find colnames with z. (only applied to pipeline excel) + j <- which(colnames(d)==z_col) + z1 <- d[,j]; c1 <- z2col(z1,sig_thre=sig_thre) + all_c[[as.character(j)]] <- c1 + } + mat_c <- do.call(base::cbind,all_c) + uni_c <- base::unique(unlist(all_c)) + for(r in uni_c){ + w1 <- which(mat_c==r) + nn <- nrow(mat_c) + rr <- w1%%nn+1 + cc <- w1%/%nn+1 + cc[which(rr==1)] <- cc[which(rr==1)]-1 + rr[which(rr==1)] <- nn+1 + openxlsx::addStyle(wb, sheet = i, createStyle(fgFill=r), rows =rr, cols = as.numeric(names(all_c)[cc])) + } + if(is.null(mark_gene)==FALSE){ + for(k in names(mark_gene)){ + openxlsx::addStyle(wb, sheet = i, openxlsx::createStyle(fgFill=mark_col[k]), + rows =which(toupper(d[,which(colnames(d)=='geneSymbol')]) %in% toupper(mark_gene[[k]]))+1, + cols = which(colnames(d)=='geneSymbol')) ## find column with geneSymbol and mark color + } + } + w1 <- which(gsub("\\s","",as.matrix(d))=='TRUE') + nn <- nrow(d) + rr <- w1%%nn+1 + cc <- w1%/%nn+1 + cc[which(rr==1)] <- cc[which(rr==1)]-1 + rr[which(rr==1)] <- nn+1 + openxlsx::addStyle(wb, sheet = i, openxlsx::createStyle(fontColour='#FF0000'), rows =rr, cols = as.numeric(cc)) ## find column with TRUE/FALSE and mark with color + } + openxlsx::saveWorkbook(wb, out.xlsx, overwrite = TRUE) + return(TRUE) + ## +} + +#' Load MSigDB Database into R Workspace +#' +#' \code{gs.preload} downloads data from MSigDB and stores it into two variables in R workspace, \code{all_gs2gene} and \code{all_gs2gene_info}. +#' \code{all_gs2gene} is a list object with elements of gene sets collections. +#' \code{all_gs2gene_info} is a data.frame contains the description of each gene sets. +#' +#' This is a pre-processing function for NetBID2 advanced analysis. User only need to input the species name (e.g. "Homo sapiens", "Mus musculus"). +#' It will call \code{msigdbr} to download data from MSigDB and save it as RData under the \code{db/} directory with species name. +#' +#' @param use_spe character, name of interested species (e.g. "Homo sapiens", "Mus musculus"). +#' Users can call \code{msigdbr_show_species()} to access the full list of available species names. +#' Default is "Homo sapiens". +#' @param update logical, if TRUE, the previous loaded RData will be updated. Default is FALSE. +#' @param main.dir character, the main file path of user's NetBID2 project. +#' If NULL, will be set to \code{system.file(package = "NetBID2")}. Default is NULL. +#' @param db.dir character, the file path to save the RData. Default is \code{db} directory under the \code{main.dir}, if one has a \code{main.dir}. +#' +#' @return Reture a logical value. If TRUE, MsigDB database is loaded successfully, with \code{all_gs2gene} and \code{all_gs2gene_info} created +#' in the workspace. +#' +#' @examples +#' gs.preload(use_spe='Homo sapiens',update=FALSE) +#' gs.preload(use_spe='Mus musculus',update=FALSE) +#' print(all_gs2gene_info) +#' # contain the information for all gene set category, category info, category size, +#' ## sub category,sub category info, sub category size +#' print(names(all_gs2gene)) # the first level of the list is the category and sub-category IDs +#' print(str(all_gs2gene$`CP:KEGG`)) +#' +#' \dontrun{ +#' gs.preload(use_spe='Homo sapiens',update=TRUE) +#' } +#' @export +gs.preload <- function(use_spe='Homo sapiens',update=FALSE, + main.dir=NULL, + db.dir=sprintf("%s/db/",main.dir)){ + # + all_input_para <- c('use_spe','update') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + all_spe <- msigdbr::msigdbr_show_species() + check_res <- c(check_option('update',c(TRUE,FALSE),envir=environment()), + check_option('use_spe',all_spe,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + ## only support geneSymbol (because pipeline-generated master table will contain geneSymbol column) + if(is.null(main.dir)==TRUE){ + main.dir <- system.file(package = "NetBID2") + message(sprintf('main.dir not set, will use package directory: %s',main.dir)) + } + if(is.null(db.dir)==TRUE){ + db.dir <- sprintf("%s/db/",main.dir) + } + message(sprintf('Will use directory %s as the db.dir',db.dir)) + use_spe1 <- gsub(' ','_',use_spe) + out_file <- sprintf('%s/%s_gs2gene.RData',db.dir,use_spe1) + if(file.exists(out_file)==FALSE | update==TRUE){ + message('Begin generating all_gs2gene !') + all_gs_info <- msigdbr::msigdbr(species = use_spe) ## use msigdbr_show_species() to check possible available species + # for gs_cat + all_gs_cat <- base::unique(all_gs_info$gs_cat) + all_gs2gene_1 <- lapply(all_gs_cat,function(x){ + x1 <- all_gs_info[which(all_gs_info$gs_cat==x),] + all_gs <- base::unique(x1$gs_name) + x2 <- lapply(all_gs, function(y){ + base::unique(x1$gene_symbol[which(x1$gs_name==y)]) + }) + names(x2) <- all_gs;x2 + }) + names(all_gs2gene_1) <- all_gs_cat + all_gs_subcat <- base::setdiff(base::unique(all_gs_info$gs_subcat),"") + all_gs2gene_2 <- lapply(all_gs_subcat,function(x){ + x1 <- all_gs_info[which(all_gs_info$gs_subcat==x),] + all_gs <- base::unique(x1$gs_name) + x2 <- lapply(all_gs, function(y){ + base::unique(x1$gene_symbol[which(x1$gs_name==y)]) + }) + names(x2) <- all_gs;x2 + }) + names(all_gs2gene_2) <- all_gs_subcat + all_gs2gene <- c(all_gs2gene_1,all_gs2gene_2) + # + all_gs2gene <- all_gs2gene[sort(names(all_gs2gene))] + gs_size <- unlist(lapply(all_gs2gene,length)) + # info for cat + info_cat <- c('C1'='positional gene sets','C2'='curated gene set','C3'='motif','C4'='computational','C5'='GO','C6'='oncogenic','C7'='immune','H'='hallmark genesets') + info_subcat <- c('CGP'='chemical and genetic perturbations','CP'='Canonical pathways','CP:BIOCARTA'='BioCarta gene sets','CP:KEGG'='KEGG gene sets', + 'CP:REACTOME'='Reactome gene sets','MIR'='microRNA targets','TFT'='transcription factor targets','CGN'='cancer gene neighborhoods','CM'='cancer modules', + 'BP'='Biological Process','MF'='Molecular Function','CC'='Cellular Component') + cat_rel <- base::unique(as.data.frame(all_gs_info[,c('gs_cat','gs_subcat')])) + all_gs2gene_info <- data.frame(cat_rel[,1],info_cat[cat_rel[,1]],gs_size[cat_rel[,1]],cat_rel[,2],info_subcat[cat_rel[,2]],gs_size[cat_rel[,2]],stringsAsFactors = FALSE) + colnames(all_gs2gene_info) <- c('Category','Category_Info','Category_Size','Sub-Category','Sub-Category_Info','Sub-Category_Size') + all_gs2gene_info <- all_gs2gene_info[order(all_gs2gene_info[,1]),] + all_gs2gene_info[,c(1,2,4,5)] <- as.data.frame(apply(all_gs2gene_info[,c(1,2,4,5)],2,function(x){x[which(is.na(x)==TRUE)] <- "";x}),stringsAsFactors=FALSE) + save(all_gs2gene,all_gs2gene_info,file=out_file) + } + load(out_file,.GlobalEnv) + message('all_gs2gene loaded, you could see all_gs2gene_info to check the details !') + return(TRUE) +} + +######################################################### visualization functions +## simple functions to get info +#' Create a vector of each sample's selected phenotye descriptive information. +#' +#' \code{get_obs_label} creates a vector of each sample's selected phenotype descriptive information. +#' This is a helper function for data visualization. +#' +#' @param phe_info data.frame, the phenotype data of the samples. +#' It is a data frame that can store any number of descriptive columns (covariates) for each sample row. +#' To get the phenotype data, using the accessor function \code{pData}. +#' @param use_col a vector of numerics or characters. +#' Users can select the interested descriptive column(s) by calling index or name of the column(s). +#' @param collapse character, an optional character string to separate the results when the length +#' of \code{use_col} is more than 1. Not NA_character. Default is "|". +#' +#' @return +#' Return a vector of selected phenotype descriptive information (covariates) for each sample. +#' Vector name is the sample name. + +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' phe_info <- Biobase::pData(analysis.par$cal.eset) +#' use_obs_class <- get_obs_label(phe_info = phe_info,'subgroup') +#' print(use_obs_class) +#' \dontrun{ +#'} +#' @export +get_obs_label <- function(phe_info,use_col,collapse='|'){ + # + all_input_para <- c('phe_info','use_col','collapse') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + w1 <- base::setdiff(use_col,colnames(phe_info)) + if(length(w1)>0){ + message(sprintf('%s not in the colnames(phe_info),please check and re-try!',paste(w1,collapse=';')));return(FALSE); + } + obs_label<-phe_info[,use_col]; + if(base::length(use_col)>1){ + obs_label<-apply(obs_label,1,function(x)base::paste(x,collapse=collapse)) + } + names(obs_label) <- rownames(phe_info); + obs_label +} + +#' Get interested phenotype groups from pData slot of the ExpressionSet object. +#' +#' \code{get_int_group} is a function to extract interested phenotype groups from the ExpressionSet object +#' with 'cluster-meaningful' sample features. +#' +#' @param eset an ExpressionSet object. +#' @return Return a vector of phenotype groups which could be used for sample cluster analysis. +#' +#' @examples +#' network.par <- list() +#' network.par$out.dir.DATA <- system.file('demo1','network/DATA/',package = "NetBID2") +#' NetBID.loadRData(network.par=network.par,step='exp-QC') +#' intgroups <- get_int_group(network.par$net.eset) +#' @export +get_int_group <- function(eset){ + all_input_para <- c('eset') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + phe <- Biobase::pData(eset) + feature_len <- apply(phe,2,function(x)base::length(base::unique(x))) + intgroup <- colnames(phe)[which(feature_len>1 & feature_lenlow_K){ + graphics::text(xxx[i],yyy[j],v1,cex=clust_cex) + }else{ + v2 <- names(obs_label)[which(pred_label==rownames(t1)[i] & obs_label==colnames(t1)[j])] + v2 <- base::paste(v2,collapse='\n') + graphics::text(xxx[i],yyy[j],v2,cex=outlier_cex) + } + } + } + return(t1) +} + + +#' Set Color Scale for Z Statistics Value +#' +#' \code{z2col} is a helper function in \code{out2excel}. It defines the color scale of the Z statistics value. +#' +#' @param x a vector of numerics, a vector of Z statistics. +#' @param n_len integer, number of unique colors. Default is 60. +#' @param sig_thre numeric, the threshold for significance (absolute value of Z statistics). Z values failed to pass the threshold will be colored "white". +#' @param col_min_thre numeric, the lower threshold for the color bar value. Default is 0.01. +#' @param col_max_thre numeric, the upper threshold for the color bar value. Default is 3. +#' @param blue_col a vector of characters, the blue colors used to show the negative Z values. Default is brewer.pal(9,'Set1')[2]. +#' @param red_col a vector of characters, the red colors used to show positive Z values. Default is brewer.pal(9,'Set1')[1]. +#' @return Return a vector of color codes. +#' @examples +#' t1 <- sort(rnorm(mean=0,sd=2,n=100)) +#' graphics::image(as.matrix(t1),col=z2col(t1)) +#' @export +z2col <- function(x,n_len=60,sig_thre=0.01,col_min_thre=0.01,col_max_thre=3, + blue_col=brewer.pal(9,'Set1')[2], + red_col=brewer.pal(9,'Set1')[1]){ + # + tmp_x <- setdiff(x,c(Inf,-Inf)) + if(length(tmp_x)==0) return(ifelse(x>0,'red','blue')) + all_input_para <- c('x','n_len','sig_thre','col_min_thre','col_max_thre','blue_col','red_col') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + ## create vector for z-score, can change sig threshold + x[which(is.na(x)==TRUE)] <- 0 + x[which(x==Inf)]<- base::max(x[which(x!=Inf)])+1 + x[which(x==-Inf)]<- base::min(x[which(x!=-Inf)])-1 + if(col_min_thre<0) col_min_thre<-0.01 + if(col_max_thre<0) col_max_thre<-3 + c2 <- grDevices::colorRampPalette(c(blue_col,'white',red_col))(n_len) + r1 <- 1.05*base::max(abs(x)) ## -r1~r1 + if(r1 < col_max_thre){ + r1 <- col_max_thre + } + if(col_min_thre>r1){ + r2 <- seq(-r1,r1,length.out=n_len+1) + }else{ + r21 <- seq(-r1,-col_min_thre,length.out=n_len/2) + r22 <- base::seq(col_min_thre,r1,length.out=n_len/2) + r2 <- c(r21,r22) + } + x1 <- cut(x,r2) + names(c2) <- levels(x1) + x2 <- c2[x1] + x2[which(abs(x) 0) { + if(w1 < length(use_color)){ + cc1 <- use_color[1:w1] + }else{ + cc1 <- grDevices::colorRampPalette(use_color)(w1) + } + names(cc1) <- x2 + } + cc2 <- c(pre_define, cc1) + cc2 <- cc2[x] + } + return(cc2) +} + +## get color box text,inner function ## refer from web +# https://stackoverflow.com/questions/45366243/text-labels-with-background-colour-in-r +boxtext <- function(x, y, labels = NA, col.text = NULL, col.bg = NA, + border.bg = NA, adj = NULL, pos = NULL, offset = 0.5, + padding = c(0.5, 0.5), cex = 1, font = graphics::par('font')){ + + ## The Character expansion factro to be used: + theCex <- graphics::par('cex')*cex + ## Is y provided: + if (missing(y)) y <- x + ## Recycle coords if necessary: + if (base::length(x) != base::length(y)){ + lx <- base::length(x) + ly <- base::length(y) + if (lx > ly){ + y <- rep(y, ceiling(lx/ly))[1:lx] + } else { + x <- rep(x, ceiling(ly/lx))[1:ly] + } + } + ## Width and height of text + textHeight <- graphics::strheight(labels, cex = theCex, font = font) + textWidth <- graphics::strwidth(labels, cex = theCex, font = font) + ## Width of one character: + charWidth <- graphics::strwidth("e", cex = theCex, font = font) + ## Is 'adj' of length 1 or 2? + if (!is.null(adj)){ + if (base::length(adj == 1)){ + adj <- c(adj[1], 0.5) + } + } else { + adj <- c(0.5, 0.5) + } + + ## Is 'pos' specified? + if (!is.null(pos)){ + if (pos == 1){ + adj <- c(0.5, 1) + offsetVec <- c(0, -offset*charWidth) + } else if (pos == 2){ + adj <- c(1, 0.5) + offsetVec <- c(-offset*charWidth, 0) + } else if (pos == 3){ + adj <- c(0.5, 0) + offsetVec <- c(0, offset*charWidth) + } else if (pos == 4){ + adj <- c(0, 0.5) + offsetVec <- c(offset*charWidth, 0) + } else { + stop('Invalid argument pos') + } + } else { + offsetVec <- c(0, 0) + } + ## Padding for boxes: + if (base::length(padding) == 1){ + padding <- c(padding[1], padding[1]) + } + ## Midpoints for text: + xMid <- x + (-adj[1] + 1/2)*textWidth + offsetVec[1] + yMid <- y + (-adj[2] + 1/2)*textHeight + offsetVec[2] + + ## Draw rectangles: + rectWidth <- textWidth + 2*padding[1]*charWidth + rectHeight <- textHeight + 2*padding[2]*charWidth + graphics::rect(xleft = xMid - rectWidth/2,ybottom = yMid - rectHeight/2, + xright = xMid + rectWidth/2,ytop = yMid + rectHeight/2, + col = col.bg, border = border.bg,xpd=TRUE) + ## Place the text: + graphics::text(xMid, yMid, labels, col = col.text, cex = theCex, font = font,adj = c(0.5, 0.5),xpd=TRUE) + ## Return value: + if (base::length(xMid) == 1){ + invisible(c(xMid - rectWidth/2, xMid + rectWidth/2, yMid - rectHeight/2,yMid + rectHeight/2)) + } else { + invisible(base::cbind(xMid - rectWidth/2, xMid + rectWidth/2, yMid - rectHeight/2,yMid + rectHeight/2)) + } +} + +#' Visualize Sample Clustering Result in 2D Plot +#' +#' \code{draw.2D} creats a 2D plot to visualize the sample clustering result. +#' +#' @param X a vector of numerics, the x coordinates of points in the plot. If user would like to create a PCA biplot, this parameter should be the first component. +#' @param Y a vector of numerics, the y coordinates of points in the plot. If user would like to create a PCA biplot, this parameter should be the second component. +#' @param class_label a vector of characters, labels or categories of samples. The vector name should be sample names. +#' @param xlab character, the label for x-axis. Default is "PC1". +#' @param ylab character, the label for y-axis. Default is "PC2". +#' @param legend_cex numeric, giving the amount by which the text of legend should be magnified relative to the default. Default is 0.8. +#' @param main character, an overall title for the plot. Default is "". +#' @param point_cex numeric, giving the amount by which the size of the data points should be magnified relative to the default. Default is 1. +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' +#' @return Return a logical value. If TRUE, the plot has been created successfully. +#' @examples +#' mat1 <- matrix(rnorm(2000,mean=0,sd=1),nrow=100,ncol=20) +#' rownames(mat1) <- paste0('Gene',1:nrow(mat1)) +#' colnames(mat1) <- paste0('Sample',1:ncol(mat1)) +#' pc <- stats::prcomp(t(mat1))$x +#' pred_label <- kmeans(pc,centers=4)$cluster ## this can use other cluster results +#' draw.2D(X=pc[,1],Y=pc[,2],class_label=pred_label) +#' @export +draw.2D <- function(X,Y,class_label,xlab='PC1',ylab='PC2',legend_cex=0.8,main="",point_cex=1,use_color=NULL,pre_define=NULL){ + # + all_input_para <- c('X','Y','class_label','xlab','ylab','legend_cex','main','point_cex') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + class_label <- clean_charVector(class_label) + # + if(base::length(X)!=base::length(Y)){ + message('Input two dimension vector with different length, please check and re-try !');return(FALSE); + } + if(base::length(X)!=base::length(class_label)){ + message('Input dimension vector has different length with the class_label, please check and re-try !');return(FALSE); + } + par(mai = c(1, 1, 1, 0.5+base::max(strwidthMod(class_label,units='inch',cex=legend_cex,ori=FALSE,mod=FALSE)))) + cls_cc <- get.class.color(class_label,use_color=use_color,pre_define=pre_define) ## get color for each label + graphics::plot(Y ~ X,pch = 16,cex = point_cex,col = cls_cc,main=main,xlab=xlab,ylab=ylab) + graphics::legend(par()$usr[2],par()$usr[4],sort(base::unique(class_label)),fill = cls_cc[sort(base::unique(class_label))], + horiz = FALSE,xpd = TRUE,border = NA,bty = 'n',cex=legend_cex) + return(TRUE) +} + +#' Visualize Sample Clustering Result in 2D Plot with interactive mode +#' +#' \code{draw.2D.interactive} creats a 2D plot to visualize the sample clustering result with interactive mode realized by plotly. +#' +#' @param X a vector of numerics, the x coordinates of points in the plot. If user would like to create a PCA biplot, this parameter should be the first component. +#' @param Y a vector of numerics, the y coordinates of points in the plot. If user would like to create a PCA biplot, this parameter should be the second component. +#' @param sample_label a vector of characters, name of samples to be displayed on the figure. +#' @param color_label a vector of characters, labels used to define the point color. +#' @param shape_label a vector of characters, labels used to define the point shape. +#' @param xlab character, the label for x-axis. Default is "PC1". +#' @param ylab character, the label for y-axis. Default is "PC2". +#' @param main character, an overall title for the plot. Default is "". +#' @param point_cex numeric, giving the amount by which the size of the data points should be magnified relative to the default. Default is 1. +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' +#' @return Return the plotly class object for interactive visualization. +#' @examples +#' mat1 <- matrix(rnorm(2000,mean=0,sd=1),nrow=100,ncol=20) +#' rownames(mat1) <- paste0('Gene',1:nrow(mat1)) +#' colnames(mat1) <- paste0('Sample',1:ncol(mat1)) +#' pc <- stats::prcomp(t(mat1))$x +#' pred_label <- kmeans(pc,centers=4)$cluster ## this can use other cluster results +#' draw.2D.interactive(X=pc[,1],Y=pc[,2], +#' sample_label=rownames(pc), +#' color_label=pred_label, +#' pre_define = c('1'='blue','2'='red','3'='yellow','4'='green')) +#' draw.2D.interactive(X=pc[,1],Y=pc[,2], +#' sample_label=rownames(pc), +#' shape_label=pred_label) +#' @export +draw.2D.interactive <- function(X,Y,sample_label=NULL,color_label=NULL,shape_label=NULL, + xlab='PC1',ylab='PC2',main="",point_cex=1, + use_color=NULL,pre_define=NULL){ + if(!'plotly' %in% rownames(installed.packages())){ + message('plotly not installed!');return(FALSE); + } + # + all_input_para <- c('X','Y','sample_label','xlab','ylab','main','point_cex') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + sample_label <- clean_charVector(sample_label) + if(is.null(color_label)==FALSE){ + color_label <- clean_charVector(color_label) + } + if(is.null(shape_label)==FALSE){ + shape_label <- clean_charVector(shape_label) + } + # + if(base::length(X)!=base::length(Y)){ + message('Input two dimension vector with different length, please check and re-try !');return(FALSE); + } + if(base::length(X)!=base::length(sample_label)){ + message('Input dimension vector has different length with the sample_label, please check and re-try !');return(FALSE); + } + if(is.null(shape_label)==TRUE & is.null(color_label)==TRUE){ + message('Either color_label or shape_label is required!');return(FALSE); + } + if(is.null(shape_label)==FALSE & is.null(color_label)==FALSE){ + + if(base::length(X)!=base::length(color_label)){ + message('Input dimension vector has different length with the color_label, please check and re-try !');return(FALSE); + } + color_label.factor <- as.factor(color_label) + cls_cc <- get.class.color(levels(color_label.factor),use_color=use_color,pre_define=pre_define) ## get color for each label + if(base::length(X)!=base::length(shape_label)){ + message('Input dimension vector has different length with the shape_label, please check and re-try !');return(FALSE); + } + shape_label.factor <- as.factor(shape_label) + data <- data.frame(X=X,Y=Y,color_label=color_label.factor,shape_label=shape_label.factor); + display_text <- paste0(sample_label,':',color_label,':',shape_label) + p <- plotly::plot_ly(data = data, x = ~X, y = ~Y, + marker = list(size = point_cex*12),type='scatter',color=~color_label,colors=cls_cc, + hoverinfo='text',text=display_text, + mode='markers',symbol=~shape_label) %>% + plotly::layout(title = main, + yaxis = list(zeroline = FALSE,title=list(text=xlab)), + xaxis = list(zeroline = FALSE,title=list(text=ylab),showlegend=TRUE) + ) + } + if(is.null(shape_label)==TRUE & is.null(color_label)==FALSE){ + if(base::length(X)!=base::length(color_label)){ + message('Input dimension vector has different length with the color_label, please check and re-try !');return(FALSE); + } + color_label.factor <- as.factor(color_label) + cls_cc <- get.class.color(levels(color_label.factor),use_color=use_color,pre_define=pre_define) ## get color for each label + data <- data.frame(X=X,Y=Y,color_label=color_label.factor); + display_text <- paste0(sample_label,':',color_label) + p <- plotly::plot_ly(data = data, x = ~X, y = ~Y, + marker = list(size = point_cex*12),type='scatter',color=~color_label,colors=cls_cc, + hoverinfo='text',text=display_text, + mode='markers') %>% + plotly::layout(title = main, + yaxis = list(zeroline = FALSE,title=list(text=xlab)), + xaxis = list(zeroline = FALSE,title=list(text=ylab),showlegend=TRUE) + ) + } + if(is.null(shape_label)==FALSE & is.null(color_label)==TRUE){ + if(base::length(X)!=base::length(shape_label)){ + message('Input dimension vector has different length with the shape_label, please check and re-try !');return(FALSE); + } + shape_label.factor <- as.factor(shape_label) + data <- data.frame(X=X,Y=Y,shape_label=shape_label.factor); + display_text <- paste0(sample_label,':',shape_label) + p <- plotly::plot_ly(data = data, x = ~X, y = ~Y, + marker = list(size = point_cex*12),type='scatter',color = I('black'), + hoverinfo='text',text=display_text, + mode='markers',symbol=~shape_label) %>% + plotly::layout(title = main, + yaxis = list(zeroline = FALSE,title=list(text=xlab)), + xaxis = list(zeroline = FALSE,title=list(text=ylab)) + ) + } + return(p) +} + + +#' Visualize Sample Clustering Result in 2D Plot with Sample Names +#' +#' \code{draw.2D.text} creates a 2D plot with sample names labeled, to visualize the sample clustering result. +#' +#' @param X a vector of numerics, the x coordinates of points in the plot. If user would like to create a PCA biplot, this parameter should be the first component. +#' @param Y a vector of numerics, the y coordinates of points in the plot. If user would like to create a PCA biplot, this parameter should be the second component. +#' @param class_label a vector of characters, labels or categories of samples. The vector name should be sample names. +#' @param xlab character, the label for x-axis. Default is "PC1". +#' @param ylab character, the label for y-axis. Default is "PC2". +#' @param legend_cex numeric, giving the amount by which the text of legend should be magnified relative to the default. Default is 0.8. +#' @param main character, an overall title for the plot. Default is "". +#' @param point_cex numeric, giving the amount by which the size of the data points should be magnified relative to the default. Default is 1. +#' @param class_text a vector of characters, the user-defined sample names to label each data points in the plot. +#' If NULL, will use the names of \code{class_label}. Default is NULL. +#' @param text_cex numeric, giving the amount by which the text of \code{class_text} should be magnified relative to the default. Default is NULL. +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' +#' @return Return a logical value. If TRUE, the plot has been created successfully. +#' @examples +#' mat1 <- matrix(rnorm(2000,mean=0,sd=1),nrow=100,ncol=20) +#' rownames(mat1) <- paste0('Gene',1:nrow(mat1)) +#' colnames(mat1) <- paste0('Sample',1:ncol(mat1)) +#' pc <- stats::prcomp(t(mat1))$x +#' pred_label <- kmeans(pc,centers=4)$cluster ## this can use other cluster results +#' draw.2D.text(X=pc[,1],Y=pc[,2],class_label=pred_label, +#' point_cex=5,text_cex=0.5) +#' @export +draw.2D.text <- function(X,Y,class_label,class_text=NULL,xlab='PC1',ylab='PC2',legend_cex=0.8,main="", + point_cex=1,text_cex=NULL,use_color=NULL,pre_define=NULL){ + # + all_input_para <- c('X','Y','class_label','xlab','ylab','legend_cex','main','point_cex') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + class_label <- clean_charVector(class_label) + # + if(base::length(X)!=base::length(Y)){ + message('Input two dimension vector with different length, please check and re-try !');return(FALSE); + } + if(base::length(X)!=base::length(class_label)){ + message('Input dimension vector has different length with the class_label, please check and re-try !');return(FALSE); + } + par(mai = c(1, 1, 1, 0.5+base::max(strwidthMod(class_label,units='inch',cex=legend_cex,ori=FALSE,mod=FALSE)))) + if(is.null(class_text)==TRUE){ + class_text <- names(class_label) + } + cc <- 10/base::length(class_label) + if(cc<0.05) cc<-0.05 + if(cc>1) cc<-1 + if(is.null(text_cex)==FALSE) cc <- text_cex + cls_cc <- get.class.color(class_label,use_color=use_color,pre_define=pre_define) ## get color for each label + graphics::plot(Y ~ X,pch = 16,cex = point_cex,col = cls_cc,main=main,xlab=xlab,ylab=ylab) + graphics::text(x=X,y=Y,labels=class_text,cex=cc,xpd=TRUE,adj=0.5) + #print(cc);print(str(nn)) + graphics::legend(par()$usr[2],par()$usr[4],sort(base::unique(class_label)),fill = cls_cc[sort(base::unique(class_label))], + horiz = FALSE,xpd = TRUE,border = NA,bty = 'n',cex=legend_cex) + return(TRUE) +} + +#' Visualize Sample Clustering Result in 3D Plot +#' +#' \code{draw.3D} creates a 3D plot to visualize the sample clustering result. + +#' @param X a vector of numerics, the x coordinates of points in the plot. If user would like to create a PCA biplot, this parameter should be the first component. +#' @param Y a vector of numerics, the y coordinates of points in the plot. If user would like to create a PCA biplot, this parameter should be the second component. +#' @param Z a vector of numerics, the z coordinates of points in the plot. If user would like to create a PCA biplot, this parameter should be the third component. +#' @param class_label a vector of characters, labels or categories of samples. The vector name should be sample names. +#' @param xlab character, the label for x-axis. Default is "PC1". +#' @param ylab character, the label for y-axis. Default is "PC2". +#' @param zlab character, the label for z-axis. Default is "PC3". +#' @param legend_cex numeric, giving the amount by which the text of legend should be magnified relative to the default. Default is 0.8. +#' @param main character, an overall title for the plot. Default is "". +#' @param point_cex numeric, giving the amount by which the size of the data points should be magnified relative to the default. Default is 1. +#' @param legend_pos character, the position of legend. Default is "topright". +#' @param legend_ncol integer, number of columns of legend. Default is 1. +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' @param ... other paramters used in \code{scatter3D}. +#' +#' @return Return a logical value. If TRUE, the plot has been created successfully. +#' @examples +#' mat1 <- matrix(rnorm(2000,mean=0,sd=1),nrow=100,ncol=20) +#' rownames(mat1) <- paste0('Gene',1:nrow(mat1)) +#' colnames(mat1) <- paste0('Sample',1:ncol(mat1)) +#' pc <- stats::prcomp(t(mat1))$x +#' pred_label <- kmeans(pc,centers=4)$cluster ## this can use other cluster results +#' draw.3D(X=pc[,1],Y=pc[,2],Z=pc[,3],class_label=pred_label) +#' @export +draw.3D <- function(X,Y,Z,class_label,xlab='PC1',ylab='PC2',zlab='PC3', + legend_cex=0.8,main="",point_cex=1,legend_pos='topright',legend_ncol=1,use_color=NULL,pre_define=NULL,...){ + + # + all_input_para <- c('X','Y','Z','class_label','xlab','ylab','zlab','legend_cex','main','point_cex','legend_pos','legend_ncol') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + class_label <- clean_charVector(class_label) + # + if(base::length(X)!=base::length(Y) | base::length(X)!=base::length(Z) | base::length(Z)!=base::length(Y)){ + message('Input three dimension vectors with different length, please check and re-try !');return(FALSE); + } + if(base::length(X)!=base::length(class_label)){ + message('Input dimension vector has different length with the class_label, please check and re-try !');return(FALSE); + } + c1 <- factor(class_label,levels=sort(base::unique(class_label))) + cls_cc <- get.class.color(class_label,use_color=use_color,pre_define=pre_define) ## get color for each label + #print(str(class_label)) + #print(str(sort(base::unique(class_label)))) + #print(str(cls_cc)) + #print(cls_cc[sort(base::unique(class_label))]) + plot3D::scatter3D(X,Y,Z,pch = 16,xlab = xlab,ylab = ylab,zlab=zlab,bty='g', + colvar=1:base::length(c1), + col=cls_cc,colkey = FALSE,cex=point_cex,main=main,...) + graphics::legend(legend_pos,sort(base::unique(class_label)),fill = cls_cc[sort(base::unique(class_label))], + border = NA,bty = 'n',ncol = legend_ncol,cex = legend_cex) + return(TRUE) +} + +#' Visualize Sample Clustering Result in 2D Plot with Ellipse +#' +#' \code{draw.2D.ellipse} creates a 2D plot with an ellipse drawn around each cluster to visualize the sample clustering result. +#' +#' @param X a vector of numerics, the x coordinates of points in the plot. If user would like to creat a PCA biplot, this parameter should be the first component. +#' @param Y a vector of numerics, the y coordinates of points in the plot. If user would like to creat a PCA biplot, this parameter should be the second component. +#' @param class_label a vector of characters, labels or categories of samples. The vector name should be sample names. +#' @param xlab character, the label for x-axis. Default is "PC1". +#' @param ylab character, the label for y-axis. Default is "PC2". +#' @param legend_cex numeric, giving the amount by which the text of legend should be magnified relative to the default. Default is 0.8. +#' @param main character, an overall title for the plot. Default is "". +#' @param point_cex numeric, giving the amount by which the size of the data points should be magnified relative to the default. Default is 1. +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' +#' @return Return a logical value. If TRUE, the plot has been created successfully. +#' @examples +#' mat1 <- matrix(rnorm(2000,mean=0,sd=1),nrow=100,ncol=20) +#' rownames(mat1) <- paste0('Gene',1:nrow(mat1)) +#' colnames(mat1) <- paste0('Sample',1:ncol(mat1)) +#' pc <- stats::prcomp(t(mat1))$x +#' pred_label <- stats::kmeans(pc,centers=4)$cluster ## this can use other cluster results +#' draw.2D.ellipse(X=pc[,1],Y=pc[,2],class_label=pred_label) +#' @export +draw.2D.ellipse <- function(X,Y,class_label,xlab='PC1',ylab='PC2',legend_cex=0.8,main="",point_cex=1,use_color=NULL,pre_define=NULL){ + # + all_input_para <- c('X','Y','class_label','xlab','ylab','legend_cex','main','point_cex') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + class_label <- clean_charVector(class_label) + # + if(base::length(X)!=base::length(Y)){ + message('Input two dimension vector with different length, please check and re-try !');return(FALSE); + } + if(base::length(X)!=base::length(class_label)){ + message('Input dimension vector has different length with the class_label, please check and re-try !');return(FALSE); + } + par(mar=c(5,5,5,5)) + get_transparent <- function(x,alpha=0.1){ + grDevices::rgb(t(grDevices::col2rgb(x)/255),alpha=alpha) + } + if(is.null(names(class_label))==TRUE) names(class_label) <- paste0('S_',1:base::length(class_label)) + if(is.null(names(X))==TRUE) names(X) <- names(class_label) + if(is.null(names(Y))==TRUE) names(Y) <- names(class_label) + X <- X[names(class_label)] + Y <- Y[names(class_label)] + cls_cc <- get.class.color(class_label,use_color=use_color,pre_define=pre_define) ## get color for each label + par(mar=c(5,8,5,8)) + graphics::plot(Y~X,pch = 19,cex = point_cex,xlim=c(base::min(X)-IQR(X)/3,IQR(X)/3+base::max(X)), + col = cls_cc,bty='n',bg='white',xlab=xlab,ylab=ylab,main=main) + S_dist <- stats::dist(base::cbind(X,Y)); S_dist<-as.matrix(S_dist);#print(str(S_dist)) + ## add ellipse + for(i in base::unique(class_label)){ + w0 <- which(class_label==i);w00 <- which(class_label!=i); + w1 <- base::cbind(X[w0],Y[w0]) + if(is.null(dim(w1))){ + w1 <- t(as.matrix(w1)) + } + c1 <- get_transparent(cls_cc[which(class_label==i)][1]) + m1 <- base::colMeans(w1) + d1 <- unlist(lapply(1:nrow(w1),function(x)sqrt(sum((w1[x,]-m1)^2)))) + if(base::length(d1)==1){ + plotrix::draw.ellipse(m1[1],m1[2],a=(par()$usr[2]-par()$usr[1])/30,b=(par()$usr[2]-par()$usr[1])/30,col=c1,border=NA,xpd=TRUE) + graphics::text(m1[1],m1[2],i,xpd=TRUE,adj=0,cex=legend_cex) + } + if(base::length(d1)==2){ + plotrix::draw.ellipse(m1[1],m1[2],a=d1[1],b=d1[1],col=c1,border=NA,xpd=TRUE) + graphics::text(m1[1],m1[2],i,xpd=TRUE,adj=0,cex=legend_cex) + } + if(base::length(d1)>=3){ + w2 <- order(d1,decreasing=TRUE) + d1 <- d1[w2] + w1 <- w1[w2,] + d3 <- do.call(rbind,lapply(1:nrow(w1),function(x)w1[x,]-m1)) + a <- sqrt(d3[1,1]^2+d3[1,2]^2) + angle <- 360/(2*pi)*atan(d3[1,2]/d3[1,1]) + beta <- 360/(2*pi)*atan(d3[,2]/d3[,1]) + dx <- d1*cos(2*pi*(beta-angle)/360) + dy <- d1*sin(2*pi*(beta-angle)/360) + # + b <- sqrt((dy^2)/(1-dx^2/a^2)) + b <- base::max(b[-1]) + plotrix::draw.ellipse(m1[1],m1[2],a=a*1.05,b=b*1.05,col=c1,border=NA,angle=360/(2*pi)*atan(d3[1,2]/d3[1,1]),xpd=TRUE) + # + #s1 <- sample(1:nrow(w1),1) ## random select one to mark label + s1 <- S_dist[names(class_label)[w0],names(class_label)[w00]] ## select one point with largest distance to nodes outside the cluster + s1 <- names(class_label)[w0][base::which.max(apply(s1,1,min))]; + d1 <- (par()$usr[2]-par()$usr[1])/30 + m2 <- w1[s1,] + if(m2[1]>m1[1]){ + boxtext(m2[1]+d1,m2[2],labels=i,adj=0,cex=legend_cex,col.bg=c1) + graphics::segments(x0=w1[s1,1],y0=w1[s1,2],x1=m2[1]+d1,y1=m2[2],col='dark grey') + } else{ + boxtext(m2[1]-d1,m2[2],labels=i,adj=1,cex=legend_cex,col.bg=c1) + graphics::segments(x0=w1[s1,1],y0=w1[s1,2],x1=m2[1]-d1,y1=m2[2],col='dark grey') + } + } + } + return(TRUE) +} + +#' QC plots for ExpressionSet class object. +#' +#' \code{draw.eset.QC} is a function to draw a set of plots for quality control analysis. +#' 4 types of plots will be created, including heatmap, pca, density and meansd. +#' +#' @param eset ExpressionSet class, quality control analysis target. +#' @param outdir character, the directory path for saving output files. +#' @param do.logtransform logical, if TRUE, the log transformation will be performed on the gene expression value. Default is FALSE. +#' @param intgroup a vector of characters, the interested phenotype groups from the ExpressionSet. +#' If NULL, it will automatcially extract all possible groups by \code{get_int_group}. +#' Default is NULL. +#' @param prefix character, the prefix for the QC figures' name. Default is "". +#' @param choose_plot a vector of characters, +#' choose one or many from 'heatmap', 'pca', 'density','correlation' and 'meansd' plots. +#' Only useful when generate_html=FALSE. +#' Default is 'heatmap', 'pca', 'density' and 'correlation' +#' @param generate_html logical, if TRUE, it will generate a html file by R Markdown. Otherwise, it will generate separate PDF files. +#' Default is TRUE. +#' @param correlation_strategy character, the strategy to calculate the sample correlation, +#' choose from 'pearson' and 'spearman'. Default is 'pearson'. +#' @param plot_all_point logical, if TRUE, the scatterplot will plot all points in the correlation, +#' otherwise will plot the main trend for reducing the figure size. Default is FALSE. +#' @param pca_plot_type character, plot type for pca. Users can choose from "2D","2D.interactive", "2D.ellipse", "2D.text" and "3D". Default is "2D.ellipse". +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' @examples +#' \dontrun{ +#' network.par <- list() +#' network.par$out.dir.DATA <- system.file('demo1','network/DATA/',package = "NetBID2") +#' NetBID.loadRData(network.par=network.par,step='exp-QC') +#' intgroups <- get_int_group(network.par$net.eset) +#' network.par$out.dir.QC <- getwd() ## set the output directory +#' draw.eset.QC(network.par$net.eset,outdir=network.par$out.dir.QC,intgroup=intgroups, +#' pre_define=c('WNT'='blue','SHH'='red','G4'='green')) +#' draw.eset.QC(network.par$net.eset,outdir=network.par$out.dir.QC,intgroup=intgroups, +#' pre_define=c('WNT'='blue','SHH'='red','G4'='green'), +#' pca_plot_type = '2D.interactive',choose_plot = 'pca') +#' } +#' @export +draw.eset.QC <- function(eset,outdir = '.',do.logtransform = FALSE,intgroup=NULL,prefix = '', + choose_plot=c('heatmap','pca','density','correlation'),generate_html=TRUE, + correlation_strategy='pearson',plot_all_point=FALSE, + pca_plot_type='2D.ellipse', + use_color=NULL,pre_define=NULL) { + # + all_input_para <- c('eset','outdir','do.logtransform','prefix','choose_plot','generate_html','correlation_strategy','plot_all_point') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('do.logtransform',c(TRUE,FALSE),envir=environment()), + check_option('plot_all_point',c(TRUE,FALSE),envir=environment()), + check_option('correlation_strategy',c('pearson','spearman'),envir=environment()), + check_option('pca_plot_type',c('2D','2D.ellipse','3D','2D.text','2D.interactive'),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + w1 <- base::setdiff(choose_plot,c('heatmap','pca','density','correlation','meansd')) + if(base::length(w1)>0){ + message(sprintf('Wrong input for choose_plot, %s not included (Only accept "heatmap","pca","density","correlation","meansd"). + Please check and re-try!', + w1));return(FALSE) + } + # + if (!file.exists(outdir)) { + dir.create(outdir, recursive = TRUE) + message(paste0("The output directory: \"", outdir, "\" is created!")) + }else + message(paste0("The output will overwrite the files in directory: \"",outdir,"\"")) + if(is.null(intgroup)){ + intgroup <- get_int_group(eset) + if(base::length(intgroup)>6){ + intgroup <- intgroup[1:6] + message('Warning, too many meaningful sample phenotype columns, will use the first 6 columns for visualization !') + } + } + if(base::length(intgroup)==0){ + message('No intgroup, please check and re-try!');return(FALSE) + } + message('Preparing the data...') + #x <- prepdata(eset, do.logtransform = do.logtransform, intgroup = intgroup) + use_mat <- Biobase::exprs(eset) + if(nrow(use_mat)<=3){ + message('Too small gene number for plot (<=3), please check and re-try!');return(FALSE); + } + if(do.logtransform==TRUE){ + if(base::min(as.numeric(use_mat))<=0){ + message('Warning, the original expression matrix has values not larger than 0, the log-transformation may introduce NA, please manually modifiy and re-try !') + } + use_mat <- log2(use_mat) + } + if(generate_html==TRUE){ + if(pandoc_available()==FALSE){ + stop('pandoc not available, please set Sys.setenv(RSTUDIO_PANDOC=$pandoc_installed_path), or set generate_html=FALSE') + } + output_rmd_file <- sprintf('%s/%sQC.Rmd',outdir,prefix) + file.copy(from=system.file('Rmd/eset_QC.Rmd',package = "NetBID2"),to=output_rmd_file) + rmarkdown::render(output_rmd_file, html_document(toc = TRUE)) + return(TRUE) + } + ## pca + if('pca' %in% choose_plot){ + fp <- file.path(outdir, sprintf("%s%s.pdf", prefix, 'pca')) + pdf(fp, width = 12, height = 6) + pp <- 0.3+0.7-(ncol(use_mat)-10)*(0.7/900) + if(ncol(use_mat)<=10) pp <- 1 + if(ncol(use_mat)>1000) pp <- 0.3 + for (i in 1:base::length(intgroup)){ + tmp1 <- draw.pca.kmeans(use_mat,obs_label=get_obs_label(Biobase::pData(eset),intgroup[i]),verbose=FALSE,point_cex=pp,main=intgroup[i], + use_color=use_color,pre_define=pre_define,plot_type = pca_plot_type) + } + dev.off() + message('Finish PCA plot !') + } + + ## heatmap + if('heatmap' %in% choose_plot){ + fp <- file.path(outdir, sprintf("%s%s.pdf", prefix, 'heatmap')) + pdf(fp, width = 12, height = 9) + par(mar = c(6, 6, 6, 6)) + m <- dist2.mod(use_mat) + dend <- as.dendrogram(hclust(as.dist(m), method = "single")) + ord <- order.dendrogram(dend) + m <- m[ord, ord] + draw.heatmap(mat=m,phenotype_info = Biobase::pData(eset),use_phe=intgroup,use_color=use_color,pre_define=pre_define) + dev.off() + message('Finish Heatmap plot !') + } + + ## meansd + if('meansd' %in% choose_plot){ + fp <- file.path(outdir, sprintf("%s%s.pdf", prefix, 'meansd')) + pdf(fp, width = 12, height = 9) + draw.meanSdPlot(eset) + dev.off() + message('Finish MeanSD plot !') + } + + ## density + if('density' %in% choose_plot){ + fp <- file.path(outdir, sprintf("%s%s.pdf", prefix, 'density')) + pdf(fp, width = 12, height = 9) + for(i in 1:base::length(intgroup)){ + all_dens <- list() + for (j in 1:ncol(use_mat)) { + all_dens[[j]] <- stats::density(use_mat[,j],na.rm=TRUE) + } + graphics::plot(1,col = 'white',xlim=c(base::min(unlist(lapply(all_dens,function(x)base::min(x$x)))),base::max(unlist(lapply(all_dens,function(x)base::max(x$x))))), + type = 'l',xlab = "",ylab='Density',main = sprintf('Density plot for %s',intgroup[i]), + ylim=c(base::min(unlist(lapply(all_dens,function(x)base::min(x$y)))),base::max(unlist(lapply(all_dens,function(x)base::max(x$y)))))) + class_label <- get_obs_label(Biobase::pData(eset),intgroup[i]) + cls_cc <- get.class.color(class_label,use_color=use_color,pre_define=pre_define) ## get color for each label + for (j in 1:ncol(use_mat)) { + lines(all_dens[[j]], col = cls_cc[j]) + } + legend('topright',legend=base::unique(class_label), + fill = cls_cc[base::unique(class_label)], + xpd = TRUE,border = NA,bty = 'n',horiz = FALSE) + } + dev.off() + message('Finish Density plot !') + } + + ## correlation + if('correlation' %in% choose_plot){ + fp <- file.path(outdir, sprintf("%s%s.pdf", prefix, 'correlation')) + pdf(fp, width = 12, height = 12); + par(mar=c(3,3,3,3)) + for(i in 1:base::length(intgroup)){ + class_label <- get_obs_label(Biobase::pData(eset),intgroup[i]) + draw.correlation(use_mat,class_label,main=intgroup[i],correlation_strategy=correlation_strategy,plot_all_point=plot_all_point, + use_color=use_color,pre_define=pre_define) + } + dev.off() + message('Finish Density plot !') + } + return(TRUE) +} +# two inner functions +draw.meanSdPlot <- function(eset){ + exp_mat <- Biobase::exprs(eset) + mean_g <- apply(exp_mat,1,mean) + mean_g <- rank(mean_g) + sd_g <- apply(exp_mat,1,sd) + n <- 50 + mean_g_c <- cut(mean_g,breaks = n) + sd_g_c <- cut(sd_g,breaks = n) + mat <- base::table(base::cbind(list(mean_g_c,sd_g_c))) + tmp1 <- stats::aggregate(sd_g,list(mean_g_c),mean); rownames(tmp1) <- tmp1$Group.1 + sd_g_v <- tmp1[levels(mean_g_c),'x'] + # + mean_g_l <- t(sapply(levels(mean_g_c),function(x)as.numeric(strsplit(gsub('\\(|\\]','\\1',x),',')[[1]]))) + sd_g_l <- t(sapply(levels(sd_g_c),function(x)as.numeric(strsplit(gsub('\\(|\\]','\\1',x),',')[[1]]))) + mean_g_l_m <- base::rowMeans(mean_g_l) + sd_g_l_m <- base::rowMeans(sd_g_l) + # col=get_transparent(brewer.pal(8,'Set1')[2],alpha=0.1) + par(mai=c(1,1,1,2)) + dat <- spline(x=mean_g_l_m,y=sd_g_v,n=n*10) + graphics::plot(y~x,data=dat,type='l',col=brewer.pal(8,'Set1')[1],lwd=2,xlab='rank(mean)',ylab='sd', + ylim=c(0,base::max(sd_g)),xlim=c(0,base::max(mean_g)),cex.lab=1.2) + pp <- par()$usr + ag <- 30 + r <- base::max(mean_g)/(2*nrow(mat)) + rx <- r/cos(ag*pi/180)*par.pos2inch()[1] # x-inch + ry <- rx + x1 <- c(rx*cos(ag*pi/180),rx*cos(ag*pi/180),0,-rx*cos(ag*pi/180),-rx*cos(ag*pi/180),0,rx*cos(ag*pi/180)) + y1 <- c(ry*sin(ag*pi/180),-ry*sin(ag*pi/180),-ry,-ry*sin(ag*pi/180),+ry*sin(ag*pi/180),ry,ry*sin(ag*pi/180)) + #print(x1/par.pos2inch()[1]);print(y1/par.pos2inch()[2]) + simplehexbin <- function(x,y,r,col){ + graphics::polygon(x=x+x1/par.pos2inch()[1],y=y+y1/par.pos2inch()[2],col=col,border = 'white',lwd=0.1) + } + mm <- base::max(as.numeric(mat)) + cc <- grDevices::colorRampPalette(brewer.pal(9,'Blues')[c(2:9)])(mm) + cc <- rev(cc) + for(i in 1:nrow(mat)){ + for(j in 1:ncol(mat)){ + if(mat[i,j]>0){ + if(j%%2==0) simplehexbin(x=mean_g_l_m[i],y=sd_g_l_m[j],col=cc[mat[i,j]]) + if(j%%2==1) simplehexbin(x=mean_g_l_m[i]-r,y=sd_g_l_m[j],col=cc[mat[i,j]]) + } + } + } + lines(y~x,data=dat,col=brewer.pal(8,'Set1')[1],lwd=2) + ypos <- base::seq(pp[4]-par.char2pos()[2]*2,pp[3]+(pp[4]-pp[3])/2,length.out=base::length(cc)+1) + graphics::text(x=pp[2]+par.char2pos()[1]*1.15,y=pp[4]-par.char2pos()[2],pos=4,'Count',xpd=TRUE) + graphics::rect(xleft=pp[2]+par.char2pos()[1],xright=pp[2]+par.char2pos()[1]*1.5,ybottom=ypos[1:(base::length(ypos)-1)],ytop=ypos[2:base::length(ypos)],col=rev(cc),border=NA,xpd=TRUE) + graphics::text(x=pp[2]+par.char2pos()[1]*1.5,y=quantile(ypos,probs=c(0,0.5,1)),c(1,round(mm/2),mm),xpd=TRUE,pos=4) + return() +} +draw.correlation <- function(use_mat,class_label,main='',correlation_strategy='pearson',plot_all_point=FALSE,use_color=NULL,pre_define=NULL){ + # plot part + n_sample <- ncol(use_mat); + if(n_sample>30){ + message('Too many samples for drawing the correlation plot, will select the first 30 samples ! + If want to specify the sample list, please choose the subset of eset as input !') + use_mat <- use_mat[,1:30] + class_label <- class_label[1:30] + n_sample <- 30 + #return(FALSE) + } + all_s <- colnames(use_mat) + uni_class <- base::unique(class_label) + class_label <- class_label[order(factor(class_label,levels=uni_class))] + use_mat <- use_mat[,names(class_label)] + cor_res <- cor(use_mat,method=correlation_strategy) + cc1 <- get.class.color(class_label,use_color=use_color,pre_define=pre_define); + cc1 <- get_transparent(cc1,0.5); + p1 <- cumsum(base::table(class_label)[uni_class]); p1 <- c(0,p1) + p2 <- p1[1:(base::length(p1)-1)]/2+p1[2:base::length(p1)]/2 + #### + graphics::plot(1,xlim=c(-2,n_sample+1),ylim=c(-2,n_sample+1),xaxt='n',yaxt='n',xlab='',ylab='',col='white',bty='n',xaxs='i',yaxs='i',main=main) + pp <- 15/n_sample + if(pp>1) pp <- 1 + if(pp<0.2) pp <- 0.2 + graphics::abline(v=1:n_sample,lwd=0.5,col='grey'); graphics::abline(h=1:n_sample,lwd=0.5,col='grey'); + graphics::rect(xleft=0:(n_sample-1),xright=1:n_sample,ybottom=n_sample,ytop=n_sample+1,col=cc1,border=NA,xpd=TRUE) + graphics::rect(ybottom=0:(n_sample-1),ytop=1:n_sample,xleft=n_sample,xright=n_sample+1,col=cc1,border=NA,xpd=TRUE) + graphics::abline(v=p1);graphics::abline(h=p1); + graphics::text(p2,n_sample+1/2,uni_class,adj=0.5,cex=pp,xpd=TRUE); + graphics::text(n_sample+1/2,p2,uni_class,adj=0.5,srt=270,cex=pp,xpd=TRUE); + graphics::text(x=1:n_sample-0.5,y=0,all_s,adj=1,xpd=TRUE,srt=90,cex=pp,xpd=TRUE) + graphics::text(y=1:n_sample-0.5,x=0,all_s,pos=2,xpd=TRUE,srt=0,cex=pp,xpd=TRUE) + for(i in 1:(n_sample-1)){ + for(j in (i+1):n_sample){ + graphics::text(y=i-0.5,x=j-0.5,format(cor_res[i,j],digits=2),cex=pp,xpd=TRUE) + } + } + n_box <- round(500/n_sample) + bb <- base::seq(0,1,length.out=n_box) + bb1 <- cut(bb,breaks=bb) + for(j in 1:(n_sample-1)){ + for(i in (j+1):n_sample){ + #graphics::text(y=i-0.5,x=j-0.5,format(cor_res[i,j],digits=2),cex=pp,col=2) + ei <- use_mat[,i]; ej <- use_mat[,j] + mmin <- base::min(c(ei,ej)); mmax <- base::max(c(ei,ej)) + ei_m <- (ei-mmin)/(mmax-mmin) + ej_m <- (ej-mmin)/(mmax-mmin) + if(plot_all_point==TRUE){ + graphics::points(y=ei_m+i-1,x=ej_m+j-1,cex=0.1,pch=16,col=get_transparent('dark grey',0.4)) ## memory consuming !!! + }else{ + ei_mc <- cut(ei_m,breaks = bb) + ej_mc <- cut(ej_m,breaks = bb) + tt <- base::table(list(ei_mc,ej_mc)) + mm_t <- quantile(tt,probs=0.99) + tt_thre <- mm_t/100 + for(ii in 1:(n_box-1)){ + for(jj in 1:(n_box-1)){ + ap <- (tt[ii,jj]/mm_t)^(1/4); if(ap>0.9) ap <- 0.9 + if(tt[ii,jj]>tt_thre) graphics::points(y=bb[ii]+i-1,x=bb[jj]+j-1,col=get_transparent('black',ap),pch=16,cex=5/n_box) + } + } + } + } + } + for(i in 1:n_sample){ + ei <- use_mat[,i] + dei <- stats::density(ei) + dei$x <- (dei$x-base::min(dei$x))/(base::max(dei$x)-base::min(dei$x)) + dei$y <- (dei$y-base::min(dei$y))/(base::max(dei$y)-base::min(dei$y)) + lines(x=dei$x+i-1,y=dei$y+i-1,col='black',lwd=1) + } +## +} + +#' Visualize the K-means Clustering Result by PCA Biplot +#' +#' \code{draw.pca.kmeans} is a data visualization function to show the K-means clustering result of a data matrix. +#' A PCA biplot is generated to visualize the clustering. Two biplots side-by-side will show the comparison +#' between real observation labels (left) and the K-means predicted labels (right). +#' +#' This function is mainly used to check the sample clustering result, in aim to detect if any abnormal (outlier) sample(s) exsist. +#' The input is a high-throughput expression matrix. +#' Each row is a gene/transcript/probe and each column is a sample. +#' Users need to provide the real observation label for each sample. +#' A K-value yielding the optimal classification result will be used to generate the predicted labels. +#' A comparision score (choose from ARI, NMI, Jaccard) will be calculated and shown in the figure. +#' +#' @param mat a numeric data matrix, the columns (e.g. sample) will be clustered using the feature (e.g. genes) rows. +#' @param all_k a vector of integers, a pre-defined K value. K is the number of final clusters. +#' If NULL, the function will try all possible K values. Default is NULL. +#' @param obs_label a vector of characters, a vector describes each sample's selected phenotype information, +#' using sample name as vector name. Can be obtained by calling \code{get_obs_label}. +#' @param legend_pos character, position of the plot legend. Default is 'topleft'. +#' @param legend_cex numeric, text size of the plot legend. Default is 0.8. +#' @param plot_type character, plot type. Users can choose from "2D", "2D.ellipse", "2D.interactive","2D.text" and "3D". Default is "2D.ellipse". +#' @param point_cex numeric, size of the point in the plot. Default is 1. +#' @param kmeans_strategy character, K-means clustering algorithm. +#' Users can choose "basic" or "consensus". "consensus" is performed by \code{ConsensusClusterPlus}. +#' Default is "basic". +#' @param choose_k_strategy character, method to choose the K-value. +#' Users can choose from "ARI (adjusted rand index)", "NMI (normalized mutual information)" and "Jaccard". +#' Default is "ARI". +#' @param return_type character, the type of result returned. +#' Users can choose "optimal" or "all". "all", all the K-values in \code{all_k} will be returned. +#' "optimal", only the K-value yielding the optimal classification result will be returned. +#' Default is "optimal". +#' @param main character, title for the plot. +#' @param verbose logical, if TRUE, print out detailed information during calculation. Default is TRUE. +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' +#' @return Return a vector of predicted labels, if \code{return_type} is set to "optimal". +#' Or a list of all possible K-values, if \code{return_type} is set to be "all". +#' If plot_type='2D.interactive', will return a plotly class object for interactive display. +#' @examples +#' network.par <- list() +#' network.par$out.dir.DATA <- system.file('demo1','network/DATA/',package = "NetBID2") +#' NetBID.loadRData(network.par=network.par,step='exp-QC') +#' mat <- Biobase::exprs(network.par$net.eset) +#' phe <- Biobase::pData(network.par$net.eset) +#' intgroup <- get_int_group(network.par$net.eset) +#' for(i in 1:base::length(intgroup)){ +#' print(intgroup[i]) +#' pred_label <- draw.pca.kmeans(mat=mat,all_k = NULL,obs_label=get_obs_label(phe,intgroup[i])) +#' print(base::table(list(pred_label=pred_label,obs_label=get_obs_label(phe,intgroup[i])))) +#' } +#' pred_label <- draw.pca.kmeans(mat=mat,all_k = NULL, +#' obs_label=get_obs_label(phe,'subgroup'), +#' kmeans_strategy='consensus') +#' ## interactive display +#' draw.pca.kmeans(mat=mat,all_k = NULL, +#' obs_label=get_obs_label(phe,'subgroup'), +#' plot_type='2D.interactive', +#' pre_define=c('WNT'='blue','SHH'='red','G4'='green')) +#' @export +draw.pca.kmeans <- function(mat=NULL,all_k=NULL,obs_label=NULL,legend_pos = 'topleft',legend_cex = 0.8, + plot_type='2D.ellipse',point_cex=1, + kmeans_strategy='basic',choose_k_strategy='ARI', + return_type='optimal',main='',verbose=TRUE, + use_color=NULL,pre_define=NULL){ + # + all_input_para <- c('mat','obs_label','legend_pos','legend_cex','plot_type','point_cex','kmeans_strategy','choose_k_strategy', + 'return_type','main','verbose') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('verbose',c(TRUE,FALSE),envir=environment()), + check_option('plot_type',c("2D","2D.interactive", "2D.ellipse","2D.text","3D"),envir=environment()), + check_option('kmeans_strategy',c('basic','consensus'),envir=environment()), + check_option('choose_k_strategy',c('ARI','NMI','Jaccard'),envir=environment()), + check_option('return_type',c('optimal','all'),envir=environment()), + check_option('legend_pos',c("bottomright", "bottom", "bottomleft", "left", "topleft", "top", "topright", "right","center"),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + obs_label <- clean_charVector(obs_label) + # + if(is.null(all_k)==TRUE){ + all_k <- 2:base::min(base::length(obs_label)-1,2*base::length(base::unique(obs_label))) + } + if(base::length(base::setdiff(all_k,2:base::length(obs_label)))>0){ + message('some value in all_k exceed the maximum sample size, check and re-try !');return(FALSE); + } + pca <- stats::prcomp(t(mat)) + #pca <- stats::prcomp(na.exclude(t(mat))) + cluster_mat <- pca$x + all_jac <- list() + all_k_res <- list() + if(kmeans_strategy=='basic'){ + for(k in all_k){ + tmp_k <- list() + for(i in 1:10){ + tmp_k[[i]] <- stats::kmeans(cluster_mat,centers=as.numeric(k))$cluster + } + pred_label <- tmp_k + jac <- unlist(lapply(pred_label,function(x){get_clustComp(x, obs_label,strategy=choose_k_strategy)})) + top_i <- base::which.max(jac) + all_k_res[[as.character(k)]] <- tmp_k[[top_i]] + } + }else{ + all_k_res <- get_consensus_cluster(mat=mat,all_k=all_k) + } + for(k in all_k){ + pred_label <- all_k_res[[as.character(k)]] + jac <- get_clustComp(pred_label, obs_label,strategy = choose_k_strategy) + all_jac[[as.character(k)]] <- signif(jac,4) + } + if(verbose==TRUE) message('Optimal k is chosen by Score between predicted and observed label') + if(verbose==TRUE) print(all_jac) + use_k <- all_k[base::which.max(all_jac)] + pred_label <- all_k_res[[as.character(use_k)]] + if(verbose==TRUE) message(sprintf('Best Score occurs when k=%s, with value=%s',use_k,all_jac[as.character(use_k)])) + ## + d1 <- data.frame(id=colnames(mat),X=cluster_mat[,1],Y=cluster_mat[,2],Z=cluster_mat[,3],label=pred_label,stringsAsFactors=FALSE) + if(plot_type=='2D.interactive'){ + p <- draw.2D.interactive(d1$X,d1$Y, + sample_label=names(obs_label), + color_label=obs_label[d1$id], + shape_label=d1$label, + xlab=sprintf('PC1(%s%s variance)',format(summary(pca)$importance[2,'PC1']*100,digits=3),'%'), + ylab=sprintf('PC2(%s%s variance)',format(summary(pca)$importance[2,'PC2']*100,digits=3),'%'), + point_cex=point_cex,main=main, + use_color=use_color,pre_define=pre_define) + return(p) + #if(return_type=='optimal') return(pred_label) + #if(return_type=='all') return(all_k_res) + } + graphics::layout(t(matrix(1:2))) + if(plot_type=='2D.ellipse'){ + draw.2D.ellipse(d1$X,d1$Y,class_label=obs_label[d1$id], + xlab=sprintf('PC1(%s%s variance)',format(summary(pca)$importance[2,'PC1']*100,digits=3),'%'), + ylab=sprintf('PC2(%s%s variance)',format(summary(pca)$importance[2,'PC2']*100,digits=3),'%'), + legend_cex=legend_cex,point_cex=point_cex,main=main, + use_color=use_color,pre_define=pre_define) + draw.2D.ellipse(d1$X,d1$Y,class_label=d1$label, + xlab=sprintf('PC1(%s%s variance)',format(summary(pca)$importance[2,'PC1']*100,digits=3),'%'), + ylab=sprintf('PC2(%s%s variance)',format(summary(pca)$importance[2,'PC2']*100,digits=3),'%'), + legend_cex=legend_cex,point_cex=point_cex, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + if(plot_type=='2D'){ + draw.2D(d1$X,d1$Y,class_label=obs_label[d1$id], + xlab=sprintf('PC1(%s%s variance)',format(summary(pca)$importance[2,'PC1']*100,digits=3),'%'), + ylab=sprintf('PC2(%s%s variance)',format(summary(pca)$importance[2,'PC2']*100,digits=3),'%'), + legend_cex=legend_cex,point_cex=point_cex,main=main, + use_color=use_color,pre_define=pre_define) + draw.2D(d1$X,d1$Y,class_label=d1$label, + xlab=sprintf('PC1(%s%s variance)',format(summary(pca)$importance[2,'PC1']*100,digits=3),'%'), + ylab=sprintf('PC2(%s%s variance)',format(summary(pca)$importance[2,'PC2']*100,digits=3),'%'), + legend_cex=legend_cex,point_cex=point_cex, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + + if(plot_type=='2D.text'){ + draw.2D.text(d1$X,d1$Y,class_label=obs_label[d1$id],class_text=d1$id, + xlab=sprintf('PC1(%s%s variance)',format(summary(pca)$importance[2,'PC1']*100,digits=3),'%'), + ylab=sprintf('PC2(%s%s variance)',format(summary(pca)$importance[2,'PC2']*100,digits=3),'%'), + legend_cex=legend_cex,point_cex=point_cex,main=main, + use_color=use_color,pre_define=pre_define) + draw.2D.text(d1$X,d1$Y,class_label=d1$label,class_text=d1$id, + xlab=sprintf('PC1(%s%s variance)',format(summary(pca)$importance[2,'PC1']*100,digits=3),'%'), + ylab=sprintf('PC2(%s%s variance)',format(summary(pca)$importance[2,'PC2']*100,digits=3),'%'), + legend_cex=legend_cex,point_cex=point_cex, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + if(plot_type=='3D'){ + draw.3D(d1$X,d1$Y,d1$Z,class_label=obs_label[d1$id], + xlab=sprintf('PC1(%s%s variance)',format(summary(pca)$importance[2,'PC1']*100,digits=3),'%'), + ylab=sprintf('PC2(%s%s variance)',format(summary(pca)$importance[2,'PC2']*100,digits=3),'%'), + zlab=sprintf('PC3(%s%s variance)',format(summary(pca)$importance[2,'PC3']*100,digits=3),'%'), + legend_cex=legend_cex,point_cex=point_cex,legend_pos=legend_pos,main=main, + use_color=use_color,pre_define=pre_define) + draw.3D(d1$X,d1$Y,d1$Z,class_label=d1$label, + xlab=sprintf('PC1(%s%s variance)',format(summary(pca)$importance[2,'PC1']*100,digits=3),'%'), + ylab=sprintf('PC2(%s%s variance)',format(summary(pca)$importance[2,'PC2']*100,digits=3),'%'), + zlab=sprintf('PC3(%s%s variance)',format(summary(pca)$importance[2,'PC3']*100,digits=3),'%'), + legend_cex=legend_cex,point_cex=point_cex,legend_pos=legend_pos, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + graphics::layout(1); + if(return_type=='optimal') return(pred_label) + if(return_type=='all') return(all_k_res) +} + +#' Draw Cluster Plot Using UMAP (visulization algorithm) and Kmeans (cluster algorithm). +#' +#' \code{draw.umap.kmeans} is a function to visualize the cluster result for the input data matrix. It is mainly used for check sample clustering result. +#' Warning, it is not suggested to use when sample size is small. +#' +#' @param mat numeric matrix, the expression matrix. Rows are genes/features, columns are samples. Columns will be clustered based on row features. +#' @param all_k a vector of integers, the pre-defined K-values. If NULL, will use all possible K. Default is NULL. +#' @param obs_label a vector of characters, the observed sample labels or categories, name of the vector is sample names. +#' @param legend_pos character, the legend position. Default is "topleft". +#' @param legend_cex numeric, giving the amount by which the text of legend should be magnified relative to the default. Default is 0.8. +#' @param plot_type character, type of the plot. +#' Users can choose from "2D","2D.interactive", "2D.ellipse", "2D.text" and "3D". Default is "2D.ellipse". +#' @param point_cex numeric, giving the amount by which the size of the data points should be magnified relative to the default. Default is 1. +#' @param kmeans_strategy character, K-means clustering algorithm. Users can choose "basic" or "consensus". +#' "consensus" is performed by \code{ConsensusClusterPlus}. Default is "basic". +#' @param choose_k_strategy character, method to choose the K-value. Users can choose from "ARI (adjusted rand index)", "NMI (normalized mutual information)" and "Jaccard". +#' Default is "ARI". +#' @param return_type character, the type of result returned. Users can choose "optimal" or "all". +#' "all", all the K-values in all_k will be returned. +#' "optimal", only the K-value yielding the optimal classification result will be returned. +#' Default is "optimal". +#' @param main character, title for the plot. +#' @param verbose logical, if TRUE, print out detailed information during calculation. Default is TRUE. +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' +#' @return Return a vector of the predicted label (if \code{return_type} is "optimal") and a list of all possible K-values (if \code{return_type} is "all"). +#' If plot_type='2D.interactive', will return a plotly class object for interactive display. +#' @examples +#' network.par <- list() +#' network.par$out.dir.DATA <- system.file('demo1','network/DATA/',package = "NetBID2") +#' NetBID.loadRData(network.par=network.par,step='exp-QC') +#' mat <- Biobase::exprs(network.par$net.eset) +#' phe <- Biobase::pData(network.par$net.eset) +#' intgroup <- get_int_group(network.par$net.eset) +#' for(i in 1:base::length(intgroup)){ +#' print(intgroup[i]) +#' pred_label <- draw.umap.kmeans(mat=mat,all_k = NULL,obs_label=get_obs_label(phe,intgroup[i])) +#' print(base::table(list(pred_label=pred_label,obs_label=get_obs_label(phe,intgroup[i])))) +#' } +#' pred_label <- draw.umap.kmeans(mat=mat,all_k = NULL, +#' obs_label=get_obs_label(phe,intgroup[i]), +#' kmeans_strategy='consensus') +#' ## interactive display +#' draw.umap.kmeans(mat=mat,all_k = NULL, +#' obs_label=get_obs_label(phe,'subgroup'), +#' plot_type='2D.interactive', +#' pre_define=c('WNT'='blue','SHH'='red','G4'='green')) +#' @export +draw.umap.kmeans <- function(mat=NULL,all_k=NULL,obs_label=NULL, + legend_pos = 'topleft',legend_cex = 0.8, + kmeans_strategy='basic',choose_k_strategy='ARI', + plot_type='2D.ellipse',point_cex=1,return_type='optimal',main='',verbose=TRUE, + use_color=NULL,pre_define=NULL){ + # + all_input_para <- c('mat','obs_label','legend_pos','legend_cex','plot_type','point_cex','kmeans_strategy','choose_k_strategy', + 'return_type','main','verbose') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('verbose',c(TRUE,FALSE),envir=environment()), + check_option('plot_type',c("2D",'2D.interactive', "2D.ellipse","2D.text","3D"),envir=environment()), + check_option('kmeans_strategy',c('basic','consensus'),envir=environment()), + check_option('choose_k_strategy',c('ARI','NMI','Jaccard'),envir=environment()), + check_option('return_type',c('optimal','all'),envir=environment()), + check_option('legend_pos',c("bottomright", "bottom", "bottomleft", "left", "topleft", "top", "topright", "right","center"),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + obs_label <- clean_charVector(obs_label) + # + if(is.null(all_k)==TRUE){ + all_k <- 2:base::min(base::length(obs_label)-1,2*base::length(base::unique(obs_label))) + } + if(base::length(base::setdiff(all_k,2:base::length(obs_label)))>0){ + message('some value in all_k exceed the maximum sample size, check and re-try !');return(FALSE); + } + ori_cc <- umap.defaults; + ori_cc$n_epochs <- 2000; + ori_cc$n_neighbors <- base::min(15,round(ncol(mat)/2)); + if(plot_type=='3D') ori_cc$n_components <- 3 + cluster_mat <- umap::umap(as.matrix(t(mat)),config=ori_cc) + # + all_jac <- list() + all_k_res <- list() + if(kmeans_strategy=='basic'){ + for(k in all_k){ + tmp_k <- list() + for(i in 1:10){ + tmp_k[[i]] <- stats::kmeans(cluster_mat$layout,centers=as.numeric(k))$cluster + } + pred_label <- tmp_k + jac <- unlist(lapply(pred_label,function(x){get_clustComp(x, obs_label,strategy = choose_k_strategy)})) + top_i <- base::which.max(jac) + all_k_res[[as.character(k)]] <- tmp_k[[top_i]] + } + }else{ + all_k_res <- get_consensus_cluster(mat=mat,all_k=all_k) + } + for(k in all_k){ + pred_label <- all_k_res[[as.character(k)]] + jac <- get_clustComp(pred_label, obs_label,strategy = choose_k_strategy) + all_jac[[as.character(k)]] <- signif(jac,4) + } + if(verbose==TRUE) message('Optimal k is chosen by Score between predicted and observed label') + if(verbose==TRUE) print(all_jac) + use_k <- all_k[base::which.max(all_jac)] + if(verbose==TRUE) message(sprintf('Best Score occurs when k=%s, with value=%s',use_k,all_jac[as.character(use_k)])) + pred_label <- all_k_res[[as.character(use_k)]] + + d1 <- data.frame(id=colnames(mat),X=cluster_mat$layout[,1],Y=cluster_mat$layout[,2],label=pred_label,stringsAsFactors=FALSE) + if(plot_type=='3D') d1 <- data.frame(id=colnames(mat),X=cluster_mat$layout[,1],Y=cluster_mat$layout[,2],Z=cluster_mat$layout[,3],label=pred_label,stringsAsFactors=FALSE) + if(plot_type=='2D.interactive'){ + p <- draw.2D.interactive(d1$X,d1$Y, + sample_label=names(obs_label), + color_label=obs_label[d1$id], + shape_label=d1$label, + xlab="",ylab="", + point_cex=point_cex,main=main, + use_color=use_color,pre_define=pre_define) + return(p) + #if(return_type=='optimal') return(pred_label) + #if(return_type=='all') return(all_k_res) + } + graphics::layout(t(matrix(1:2))) + if(plot_type=='2D.ellipse'){ + draw.2D.ellipse(d1$X,d1$Y,class_label=obs_label[d1$id],xlab="",ylab="",legend_cex=legend_cex,point_cex=point_cex, + use_color=use_color,pre_define=pre_define,main=main) + draw.2D.ellipse(d1$X,d1$Y,class_label=d1$label,xlab="",ylab="",legend_cex=legend_cex,point_cex=point_cex, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + if(plot_type=='2D'){ + draw.2D(d1$X,d1$Y,class_label=obs_label[d1$id],xlab="",ylab="",legend_cex=legend_cex,point_cex=point_cex, + use_color=use_color,pre_define=pre_define,main=main) + draw.2D(d1$X,d1$Y,class_label=d1$label,xlab="",ylab="",legend_cex=legend_cex,point_cex=point_cex, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + if(plot_type=='2D.text'){ + draw.2D.text(d1$X,d1$Y,class_label=obs_label[d1$id],class_text=d1$id,xlab='',ylab='',legend_cex=legend_cex,point_cex=point_cex, + use_color=use_color,pre_define=pre_define,main=main) + draw.2D.text(d1$X,d1$Y,class_label=d1$label,class_text=d1$id,xlab='',ylab='',legend_cex=legend_cex,point_cex=point_cex, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + if(plot_type=='3D'){ + draw.3D(d1$X,d1$Y,d1$Z,class_label=obs_label[d1$id],xlab='',ylab='',zlab='',legend_cex=legend_cex,point_cex=point_cex,legend_pos=legend_pos, + use_color=use_color,pre_define=pre_define,main=main) + draw.3D(d1$X,d1$Y,d1$Z,class_label=d1$label,xlab='',ylab='',zlab='',legend_cex=legend_cex,point_cex=point_cex,legend_pos=legend_pos, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + graphics::layout(1); + if(return_type=='optimal') return(pred_label) + if(return_type=='all') return(all_k_res) +} + + +## get consensus Kmeans results, using M3C(toooo slow),ConsensusClusterPlus +# return cluster results, RCSI score, P-value +# plot==TRUE, plot RSCI+BETA_P +# get_consensus_cluster +get_consensus_cluster <-function(mat,all_k=2:12,clusterAlg="km",plot='png',...) +{ + maxK <- base::max(all_k) + res1 <- ConsensusClusterPlus::ConsensusClusterPlus(mat,maxK=maxK,clusterAlg=clusterAlg,plot=plot,...) + cls_res <- lapply(all_k,function(x){ + x1 <- res1[[x]]$consensusClass + x1 + }) + names(cls_res) <- as.character(all_k) + cluster_res <- cls_res + return(cluster_res) +} + + +#' Draw Cluster Plot Using MICA (cluster algorithm) +#' +#' \code{draw.MICA} is a function to visualize the cluster result for the samples using MICA (mutual information based clustering analysis) algorithm. +#' Users need to give the MICA project information (directory and name), and the samples real labels. +#' MICA returns the K-value that yields the best clustering performance. Users can pick one comparison score to show in the plot, "ARI", "NMI" or "Jaccard". +#' MICA is not suggested, when sample size is small. +#' This function may not work well for the updated version of MICA. +#' +#' @param outdir character, the output directory for running MICA. +#' @param prjname charater, the project name for running MICA. +#' @param all_k a vector of integers, the pre-defined K-values. +#' If NULL, will use all possible K. Default is NULL. +#' @param obs_label a vector of characters, the observed sample labels or categories. +#' @param legend_pos character, position of the legend in the plot. Default is "topleft". +#' @param legend_cex numeric, giving the amount by which the text of legend should be magnified relative to the default. Default is 0.8. +#' @param plot_type character, type of the plot. Users can choose from "2D", "2D.ellipse", "2D.text" and "3D". Default is "2D.ellipse". +#' @param point_cex numeric, giving the amount by which the size of the data points should be magnified relative to the default. Default is 1. +#' @param choose_k_strategy character, method to choose the K-value. +#' Users can choose from "ARI (adjusted rand index)", "NMI (normalized mutual information)" and "Jaccard". Default is "ARI". +#' @param visualization_type character, users can choose from "tsne", "umap" and "mds". Default is "tsne". +#' @param return_type character, the type of result returned. Users can choose "optimal" or "all". +#' "all", all the K-values in all_k will be returned. +#' "optimal", only the K-value yielding the optimal classification result will be returned. Default is "optimal". +#' @param main character, title for the plot. +#' @param verbose logical, if TRUE, print out detailed information during calculation. Default is TRUE. +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' +#' @return Return a vector of the predicted label (if return_type is "optimal") and a list of all possible K- values (if return_type is "all"). +#' @export +draw.MICA <- function(outdir=NULL,prjname=NULL,all_k=NULL,obs_label=NULL, + legend_pos = 'topleft',legend_cex = 0.8, + point_cex=1,plot_type='2D.ellipse', + choose_k_strategy='ARI', + visualization_type='tsne',return_type='optimal', + main='',verbose=TRUE, + use_color=NULL,pre_define=NULL) { + # + all_input_para <- c('outdir','prjname','obs_label','legend_pos','legend_cex','plot_type','point_cex','choose_k_strategy','visualization_type', + 'return_type','main','verbose','all_k') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('verbose',c(TRUE,FALSE),envir=environment()), + check_option('plot_type',c("2D", "2D.ellipse","2D.text","3D"),envir=environment()), + check_option('choose_k_strategy',c('ARI','NMI','Jaccard'),envir=environment()), + check_option('visualization_type',c('tsne','umap','mds'),envir=environment()), + check_option('return_type',c('optimal','all'),envir=environment()), + check_option('legend_pos',c("bottomright", "bottom", "bottomleft", "left", "topleft", "top", "topright", "right","center"),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(plot_type=='3D' & visualization_type=='tsne'){ + message('Current tsne not support for 3D');return(FALSE) + } + # choose best k + res1 <- get_clustComp_MICA(outdir=outdir, all_k=all_k, obs_label=obs_label, prjname = prjname,strategy = choose_k_strategy) + all_k_res <- res1$all_k_res + all_jac <- res1$all_jac + use_k <- all_k[base::which.max(all_jac)] + message(sprintf('Best Score occurs when k=%s, with value=%s',use_k,all_jac[as.character(use_k)])) + # + use_file <- sprintf('%s/%s_k%s_ClusterMem.txt', + outdir,prjname,use_k,prjname) + d1 <- read.delim(use_file, stringsAsFactors = FALSE) ## get cluster results + if(visualization_type=='umap' | visualization_type=='mds'){ + use_file <- sprintf('%s/%s_reduced.h5', + outdir,prjname) + fid <- rhdf5::H5Fopen(use_file) + dist_mat <- fid$`mds`$block0_values[1:19,] ### + if(visualization_type=='mds'){ + X <- fid$mds$block0_values[1,]; + Y <- fid$mds$block0_values[2,]; + Z <- fid$mds$block0_values[3,]; + d1$X <- X; d1$Y <- Y;d1$Z <- Z + }else{ + ori_cc <- umap.defaults; + ori_cc$n_epochs <- 2000; + ori_cc$n_neighbors <- round(ncol(dist_mat)/use_k) + ori_cc$min_dist <- 0.01 # 0.1 + if(plot_type=='3D'){ + ori_cc$n_components <- 3 + use_mat_umap <- umap::umap(t(dist_mat),config=ori_cc) + X <- use_mat_umap$layout[,1];Y <- use_mat_umap$layout[,2]; Z <- use_mat_umap$layout[,3]; + d1$X <- X; d1$Y <- Y;d1$Z <- Z + }else{ + use_mat_umap <- umap::umap(t(dist_mat),config=ori_cc) + X <- use_mat_umap$layout[,1];Y <- use_mat_umap$layout[,2] + d1$X <- X; d1$Y <- Y; + } + } + rhdf5::H5Fclose(fid) + } + graphics::layout(t(matrix(1:2))) + if(plot_type=='2D.ellipse'){ + draw.2D.ellipse(d1$X,d1$Y,class_label=obs_label[d1$id],xlab='MICA-1',ylab='MICA-2',legend_cex=legend_cex,point_cex=point_cex, + use_color=use_color,pre_define=pre_define,main=main) + draw.2D.ellipse(d1$X,d1$Y,class_label=d1$label,xlab='MICA-1',ylab='MICA-2',legend_cex=legend_cex,point_cex=point_cex, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + if(plot_type=='2D'){ + draw.2D(d1$X,d1$Y,class_label=obs_label[d1$id],xlab='MICA-1',ylab='MICA-2',legend_cex=legend_cex,point_cex=point_cex, + use_color=use_color,pre_define=pre_define,main=main) + draw.2D(d1$X,d1$Y,class_label=d1$label,xlab='MICA-1',ylab='MICA-2',legend_cex=legend_cex,point_cex=point_cex, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + if(plot_type=='2D.text'){ + draw.2D.text(d1$X,d1$Y,class_label=obs_label[d1$id],class_text=d1$id,xlab='MICA-1',ylab='MICA-2',legend_cex=legend_cex,point_cex=point_cex, + use_color=use_color,pre_define=pre_define,main=main) + draw.2D.text(d1$X,d1$Y,class_label=d1$label,class_text=d1$id,xlab='MICA-1',ylab='MICA-2',legend_cex=legend_cex,point_cex=point_cex, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + if(plot_type=='3D'){ + draw.3D(d1$X,d1$Y,d1$Z,class_label=obs_label[d1$id],xlab='MICA-1',ylab='MICA-2',zlab='MICA-3',legend_cex=legend_cex,point_cex=point_cex,legend_pos=legend_pos, + use_color=use_color,pre_define=pre_define,main=main) + draw.3D(d1$X,d1$Y,d1$Z,class_label=d1$label,xlab='MICA-1',ylab='MICA-2',zlab='MICA-3',legend_cex=legend_cex,point_cex=point_cex,legend_pos=legend_pos, + main=sprintf('%s:%s',choose_k_strategy,format(all_jac[as.character(use_k)],digits=4)), + use_color=use_color,pre_define=pre_define) + } + ##Score + rownames(d1) <- d1$id + pred_label <- d1[names(obs_label), ]$label + names(pred_label) <- names(obs_label) + jac <- get_clustComp(pred_label, obs_label,strategy = choose_k_strategy) + print(sprintf('Best Score:%s', jac)) + if(return_type=='optimal') return(pred_label) + if(return_type=='all') return(all_k_res) +} + + +# get allScore for MICA +get_clustComp_MICA <- function(outdir, all_k, obs_label, prjname = NULL,strategy = 'ARI') { + all_jac <- list() + all_k_res <- list() + for (k in all_k) { + use_file <- sprintf('%s/%s_k%s_ClusterMem.txt', + outdir,prjname,use_k,prjname) + d1 <- read.delim(use_file, stringsAsFactors = FALSE) + ##Score + rownames(d1) <- d1$id + pred_label <- + d1[names(obs_label), ]$label + names(pred_label) <- names(obs_label) + jac <- get_clustComp(pred_label, obs_label,strategy=strategy) + all_jac[[as.character(k)]] <- jac + all_k_res[[as.character(k)]] <- pred_label + print(sprintf('Best Score for %d:%s', k, jac)) + } + return(list(all_k_res=all_k_res,all_jac=all_jac)) +} + +#' Draw Volcano Plot for Top DE (differentiated expressed) Genes or DA (differentiated activity) Drivers +#' +#' \code{draw.volcanoPlot} draws the volcano plot to identify and visualize DE genes or DA drivers with fold change threshold and significant P-value from the input dataset. +#' The function will return a data.frame of these highlighted genes/drivers. +#' +#' Top genes or drivers will be colored (blue for down-regulated and red for up-regulated) and labeled with their names. +#' This function requires the input of master table and two thresholds of logFC and P-value. +#' +#' @param dat data.frame, the master table created by function \code{generate.masterTable}. Or a table with columns of the following parameters. +#' @param label_col character, the name of the column in \code{dat} contains gene/driver names. +#' @param logFC_col character, the name of the column in \code{dat} contains logFC values. +#' @param Pv_col character, the name of the column in \code{dat} contains P-values. +#' @param logFC_thre numeric, the threshold of logFC. Genes or drivers with absolute logFC value higher than the threshold will be kept. +#' Default is 0.1. +#' @param Pv_thre numeric, the threshold of P-values. Genes or drivers with P-values lower than the threshold will be kept. +#' Default is 0.01. +#' @param show_plot logical, if TRUE, the plot will be shown in the plot pane. Default is TRUE. +#' @param xlab character, a title for the X axis. +#' @param ylab character, a title for the Y axis. +#' @param show_label logical, if TRUE, labels of selected genes or drivers will be displayed in the plot. Default is FALSE. +#' @param label_cex numeric, giving the amount by which the text of genes/drivers label should be magnified relative to the default. Default is 0.5. +#' @param legend_cex numeric, giving the amount by which the text of legend should be magnified relative to the default. Default is 0.7. +#' @param label_type character, users can choose between "origin" and "distribute". If "origin", all the labels will be displayed without location modification. +#' If "distribute", location of labels will be rearranged to avoid overlap. Default is "distribute". +#' @param main character, an overall title for the plot. +#' @param pdf_file character, the path to save the plot as PDF file. If NULL, no PDF file will be created. Default is NULL. +#' +#' @return Return a data.frame of selected significant genes or drivers, with columns contain \code{label_col}, \code{logFC_col} and \code{Pv_col}. +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' +#' \dontrun{ +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' analysis.par$out.dir.PLOT <- getwd() ## directory for saving the pdf files +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' sig_gene <- draw.volcanoPlot(dat=ms_tab,label_col='geneSymbol', +#' logFC_col='logFC.G4.Vs.others_DE', +#' Pv_col='P.Value.G4.Vs.others_DE', +#' logFC_thre=2,Pv_thre=1e-3, +#' main='Volcano Plot for G4.Vs.others_DE', +#' show_label=FALSE, +#' pdf_file=sprintf('%s/vocalno_nolabel_DE.pdf', +#' analysis.par$out.dir.PLOT)) +#'} +#' @export +draw.volcanoPlot <- function(dat=NULL,label_col=NULL,logFC_col=NULL,Pv_col=NULL,logFC_thre=0.1, Pv_thre=0.01, + show_plot=TRUE, + xlab='log2 Fold Change',ylab='P-value',show_label=FALSE,label_cex=0.5,legend_cex=0.8, + label_type='distribute',main="",pdf_file=NULL){ + # + all_input_para <- c('dat','label_col','logFC_col','Pv_col','logFC_thre','Pv_thre','show_plot','xlab','ylab', + 'show_label','label_cex','legend_cex','label_type','main') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('show_plot',c(TRUE,FALSE),envir=environment()), + check_option('show_label',c(TRUE,FALSE),envir=environment()), + check_option('label_type',c("origin", "distribute"),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + dat <- base::unique(dat[,c(label_col,logFC_col,Pv_col)]) + dat[,label_col] <- as.character(dat[,label_col]) + dat <- dat[order(dat[,3],decreasing=FALSE),] + dat <- dat[which(is.na(dat[,2])==FALSE),] + x <- as.numeric(dat[,logFC_col]) + y <- as.numeric(dat[,Pv_col]); + y <- -log10(y) + s1 <- which(abs(x)>=logFC_thre & y>= -log10(Pv_thre)) + if(show_plot==FALSE){ + sig_info <- dat[s1,,drop=FALSE] + return(sig_info) + } + plot_part <- function(ori=FALSE,before_off=FALSE){ + geneWidth <- 0 + geneHeight <- 0 + if(base::length(s1)>0){ + s11 <- s1[which(x[s1]>=0)] + s12 <- s1[which(x[s1]<0)] + geneWidth <- base::max(strwidthMod(dat[s1,label_col],'inches',cex=label_cex,ori=ori))*1.05 + geneHeight <- base::max(strheightMod(dat[s1,label_col],'inches',cex=label_cex))*base::max(base::length(s11),base::length(s12))*1.25+par.char2inch()[2] + } + + if(before_off==TRUE) dev.off() + # width: 2|genewidth|5|genewidth|1 + # height: 1.5|geneHeight|1.5 + if(is.null(pdf_file)==FALSE){ + if(show_label==TRUE & label_type=='distribute' & base::length(s1)>0){ + pdf(pdf_file,width=8+geneWidth*2,height=3+base::max(5,geneHeight)) + }else{ + pdf(pdf_file,width=8,height=8) + } + } + par(mai=c(1.5,2,1.5,1)) + mm <- base::max(abs(x)) + xr <- 1.15*mm/2.5*(2.5+geneWidth) ## not not consider 4% overflow by setting xaxs='i' + if(show_label==TRUE & label_type=='distribute'){ + graphics::plot(y~x,pch=16,col=get_transparent('grey',0.7),xlab=xlab,ylab="", + xlim=c(-xr,xr),ylim=c(0,base::max(y)*1.5),yaxt='n',main=main,cex.lab=1.2,cex.main=1.6,xaxs='i') + }else{ + graphics::plot(y~x,pch=16,col=get_transparent('grey',0.7),xlab=xlab,ylab="", + xlim=c(-mm*1.5,mm*1.5),ylim=c(0,base::max(y)*1.5),yaxt='n',main=main,cex.lab=1.2,cex.main=1.6,xaxs='i') + } + yyy <- c(1,round(base::seq(1,base::max(y)*1.5,length.out=base::min(base::length(y),5)))) ## max:5 + graphics::axis(side=2,at=c(0,yyy),labels=c(1,format(10^-yyy,scientific = TRUE)),las=2) + graphics::mtext(side=2,line = 4,ylab,cex=1.2) + z_val <- sapply(dat[,Pv_col]*sign(x),combinePvalVector)[1,] + if(logFC_thre>0){graphics::abline(v=logFC_thre,lty=2,lwd=0.5);graphics::abline(v=-logFC_thre,lty=2,lwd=0.5)} + if(Pv_thre<1) graphics::abline(h=-log10(Pv_thre),lty=2,lwd=0.5); + graphics::points(y~x,pch=16,col=get_transparent('grey',0.7)) + #s1 <- which(abs(x)>=logFC_thre & y>= -log10(Pv_thre)) + s_col <- z2col(c(10,-10),sig_thre=0); names(s_col) <- as.character(c(1,-1)) + graphics::points(y[s1]~x[s1],pch=16,col=s_col[as.character(sign(x[s1]))]) + graphics::legend(0,par()$usr[4],c('Down-regulated','Not-Significant','Up-regulated'), + fill=c(s_col[2],get_transparent('grey',0.7),s_col[1]),border=NA,bty='o', + bg='white',box.col='white',horiz=TRUE,xjust=0.5,cex=legend_cex) + if(show_label==TRUE){ + s11 <- s1[which(x[s1]>=0)] + s12 <- s1[which(x[s1]<0)] + dd <- (par()$usr[2]-par()$usr[1])/100 + if(label_type == 'origin'){ + if(base::length(s11)>0) graphics::text(x[s11]+dd,y[s11],dat[s11,label_col],cex=label_cex,adj=0) + if(base::length(s12)>0) graphics::text(x[s12]-dd,y[s12],dat[s12,label_col],cex=label_cex,adj=1) + }else{ + if(base::length(s11)>0){ + dd <- (par()$usr[4]-par()$usr[3]-par.char2pos()[2])/(base::length(s11)+1) + graphics::rect(xright=par()$usr[2],ybottom=par()$usr[3],xleft=base::max(abs(x))*1.1,ytop=par()$usr[4],col='white',border='white') + ypos <- base::seq(from=par()$usr[3],by=dd,length.out=base::length(s11))+dd; ypos <- rev(ypos); + graphics::text(base::max(abs(x))*1.15,ypos,dat[s11,label_col],cex=label_cex,adj=0) + graphics::segments(x0=base::max(abs(x))*1.1,x1=x[s11],y0=ypos,y1=y[s11],lwd=0.4,col=get_transparent(s_col[1],alpha=0.5)) + graphics::segments(x0=base::max(abs(x))*1.1,x1=base::max(abs(x))*1.14,y0=ypos,y1=ypos,lwd=0.4,col=get_transparent(s_col[1],alpha=0.5)) + } + if(base::length(s12)>0){ + dd <- (par()$usr[4]-par()$usr[3]-par.char2pos()[2])/(base::length(s12)+1) + graphics::rect(xleft=par()$usr[1],ybottom=par()$usr[3],xright=-base::max(abs(x))*1.1,ytop=par()$usr[4],col='white',border='white') + ypos <- base::seq(from=par()$usr[3],by=dd,length.out=base::length(s12))+dd; ypos <- rev(ypos); + text(-base::max(abs(x))*1.15,ypos,dat[s12,label_col],cex=label_cex,adj=1) + graphics::segments(x0= -base::max(abs(x))*1.1,x1=x[s12],y0=ypos,y1=y[s12],lwd=0.4,col=get_transparent(s_col[2],alpha=0.5)) + graphics::segments(x0= -base::max(abs(x))*1.1,x1=-base::max(abs(x))*1.14,y0=ypos,y1=ypos,lwd=0.4,col=get_transparent(s_col[2],alpha=0.5)) + } + } + } + graphics::rect(xleft=par()$usr[1],xright=par()$usr[2],ybottom=par()$usr[3],ytop=par()$usr[4]) + } + if(is.null(pdf_file)==FALSE){plot_part(ori=TRUE);plot_part(ori=TRUE,before_off=TRUE);while (!is.null(dev.list())) dev.off()} else {plot_part()} + sig_info <- dat[s1,,drop=FALSE] + return(sig_info) + #return(TRUE) +} +### + +#' Plot for combined DE (differentiated expressed)/DA (differentiated activity) Vs. original DE/DA +#' +#' \code{draw.combineDE} draw the image for the combined DE/DA Vs. original DE/DA. +#' +#' This plot function need to input the output from \code{combineDE}. +#' +#' @param DE_list list, a list of DE/DA results, with one more component named "combine" that include the combined results. +#' Strongly suggest to use the output from \code{combineDE}. +#' @param main_id character, the main id for display in the figure, must be one of the name in DE_list. +#' If NULL, will use the first name. Default is NULL. +#' @param top_number number for the top significant genes/drivers in the combine results to be displayed on the plot. +#' Default is 30. +#' @param display_col character, column names used to display. Default is 'P.Value'. +#' @param z_col character, column names for Z statistics used for background color bar. Default is 'Z-statistics'. +#' @param digit_num integer, number of digits to display on the plot. Default is 2. +#' @param row_cex numeric, \code{cex} for the row labels displayed on the plot. Default is 1 +#' @param column_cex numeric, \code{cex} for the col labels displayed on the plot. Default is 1 +#' @param text_cex numeric, \code{cex} for the text displayed on the plot. Default is 1 +#' @param pdf_file character, file path for the pdf file to save the figure into pdf format.If NULL, will not generate pdf file. Default is NULL. +#' +#' @return logical value indicating whether the plot has been successfully generated +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' phe_info <- Biobase::pData(analysis.par$cal.eset) +#' each_subtype <- 'G4' +#' G0 <- rownames(phe_info)[which(phe_info$`subgroup`!=each_subtype)] # get sample list for G0 +#' G1 <- rownames(phe_info)[which(phe_info$`subgroup`==each_subtype)] # get sample list for G1 +#' DE_gene_limma_G4 <- getDE.limma.2G(eset=analysis.par$cal.eset, +#' G1=G1,G0=G0, +#' G1_name=each_subtype, +#' G0_name='other') +#' each_subtype <- 'SHH' +#' G0 <- rownames(phe_info)[which(phe_info$`subgroup`!=each_subtype)] # get sample list for G0 +#' G1 <- rownames(phe_info)[which(phe_info$`subgroup`==each_subtype)] # get sample list for G1 +#' DE_gene_limma_SHH <- getDE.limma.2G(eset=analysis.par$cal.eset, +#' G1=G1,G0=G0, +#' G1_name=each_subtype, +#' G0_name='other') +#' DE_list <- list(G4=DE_gene_limma_G4,SHH=DE_gene_limma_SHH) +#' res2 <- combineDE(DE_list,transfer_tab=NULL) +#' draw.combineDE(res2) +#' @export +draw.combineDE <- function(DE_list=NULL,main_id=NULL,top_number=30, + display_col='P.Value',z_col='Z-statistics', + digit_num=2,row_cex=1,column_cex=1,text_cex=1,pdf_file=NULL){ + # + all_input_para <- c('DE_list','top_number','display_col','z_col', + 'digit_num','row_cex','column_cex','text_cex') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + DE_name <- base::setdiff(names(DE_list),'combine') + if(top_number > nrow(DE_list$combine)) top_number <- nrow(DE_list$combine) + w1 <- DE_list$combine[order(DE_list$combine$P.Value)[1:top_number],] + dd_Z <- do.call(base::cbind,lapply(DE_name,function(x){ + x1 <- DE_list[[x]] + x2 <- x1[w1[,x],] + x2[,z_col] + })) + colnames(dd_Z) <- DE_name + dd <- do.call(base::cbind,lapply(DE_name,function(x){ + x1 <- DE_list[[x]] + x2 <- x1[w1[,x],] + x2[,display_col] + })) + colnames(dd) <- DE_name + if(is.null(main_id)==TRUE) main_id <- DE_name[1] + dat <- data.frame(main_id=w1[,main_id],dd_Z,combine=w1[,z_col],stringsAsFactors=FALSE) + mat <- as.matrix(dat[,-1]) + rownames(mat) <- dat$main_id + dd <- base::cbind(dd,combine=w1[,display_col]) + mat1 <- signif(dd,digits=digit_num) + plot_part <- function(ori=FALSE,before_off=FALSE){ + geneWidth <- base::max(strwidthMod(rownames(mat),units='inch',cex=row_cex,ori=ori))*1.05+par.char2inch()[1] + textWidth <- base::max(strwidthMod(as.character(mat1),units='inch',cex=text_cex,ori=ori))*ncol(mat1)*1.5 + geneHeight <- base::max(strheightMod(rownames(mat),units='inch',cex=row_cex,ori=ori))*nrow(mat)*1.75 + textHeight <- base::max(strheightMod(as.character(mat1),units='inch',cex=text_cex,ori=ori))*nrow(mat)*1.75 + geneHeight <- base::max(geneHeight,textHeight) + if(before_off==TRUE) dev.off() + # geneWidth|textWidth|par.char2inch()[1]*8 + if(is.null(pdf_file)==FALSE) pdf(pdf_file,width=geneWidth+textWidth+par.char2inch()[1]*8,height=1.5+geneHeight) + par(mai=c(0.5,geneWidth,1,par.char2inch()[1]*8));graphics::layout(1) + draw.heatmap.local(mat,inner_line=TRUE,out_line=TRUE,col_srt=0,display_text_mat=mat1,row_cex=row_cex,column_cex=column_cex,text_cex=text_cex,inner_col='white') + #legend + pp <- par()$usr + cc1 <- grDevices::colorRampPalette(c(brewer.pal(9,'Blues')[8],'white',brewer.pal(9,'Reds')[7]))(5) + draw.colorbar(col=rev(cc1),min_val=-base::max(abs(mat)),max_val=base::max(abs(mat)),n=5,xleft=pp[2]+par.char2pos()[1]*2, + xright=pp[2]+par.char2pos()[1]*3,ytop=pp[3]+(pp[4]-pp[3])/2,ybottom=pp[3]+(pp[4]-pp[3])/2-par.char2pos()[2]*5*text_cex*0.8,cex=text_cex*0.8) + } + if(is.null(pdf_file)==FALSE){plot_part(ori=TRUE);plot_part(ori=TRUE,before_off=TRUE);while (!is.null(dev.list())) dev.off()} else {plot_part()} + return(TRUE) +} + +#' Create Plot to show Differential Expression and Differential Activity Analysis of Top Drivers +#' +#' \code{draw.NetBID} creates two side-by-side heatmaps to show the result of NetBID analysis. +#' Both the differential expression analysis (the right heatmap) and differential activity analysis (the left heatmap) of top drivers are shown in the plot. +#' +#' @param DA_list list, contains the differential activity (DA) analysis result. +#' @param DE_list list, contains the differential expression (DE) analysis result. +#' @param main_id character, the top genes/drivers from which DA comparison group. Must be one of the names in \code{DA_list}. +#' If NULL, the first element name of \code{DA_list} will be used. Default is NULL. +#' @param top_number integer, the number of the top significant genes/drivers to be displayed in the plot. +#' Default is 30. +#' @param DA_display_col character, which statistic column from NetBID analysis is to be used as the column of the left heatmap. +#' Default is "P.Value". +#' @param DE_display_col character, which statistic column from NetBID analysis is to be used as the column of the right heatmap. +#' Default is "logFC". +#' @param z_col character, which statistic column from NetBID analysis is to be used as the color scale of the heatmap. +#' Default is "Z-statistics". +#' @param digit_num integer, indicating the number of decimal places (round) or significant digits (signif) to be used. +#' Default is 2. +#' @param row_cex numeric, giving the amount by which the text of row names should be magnified relative to the default. +#' Default is 1. +#' @param column_cex numeric, giving the amount by which the text of column names should be maginified relative to the default. +#' Default is 1. +#' @param text_cex numeric, giving the amount by which the text of in the table cell should be maginified relative to the default. +#' Default is 1. +#' @param col_srt numeric, rotation angle of the column labels at the bottom of heatmap. +#' Default is 60. +#' @param pdf_file character, the path to save the plot as PDF file. +#' If NULL, PDF file will not be generated. Default is NULL. +#' +#' @return Return a logical value. If TRUE, the plot is created successfully. +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' draw.NetBID(DA_list=analysis.par$DA,DE_list=analysis.par$DE) +#' @export +draw.NetBID <- function(DA_list=NULL,DE_list=NULL,main_id=NULL,top_number=30, + DA_display_col='P.Value',DE_display_col='logFC',z_col='Z-statistics',digit_num=2, + row_cex=1,column_cex=1,text_cex=1,col_srt=60,pdf_file=NULL){ + # + all_input_para <- c('DA_list','DE_list','top_number','DA_display_col','DE_display_col','z_col', + 'digit_num','row_cex','column_cex','text_cex','col_srt') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(top_number<1){message('top_number must be larger than 1!');return(FALSE)} + DA_name <- names(DA_list) + DE_name <- names(DE_list) + if(is.null(main_id)==TRUE) main_id <- DA_name[1] + if(!main_id %in% DA_name){message('main id not in DA list!');return(FALSE)} + if(top_number > nrow(DA_list[[main_id]])) top_number <- nrow(DA_list[[main_id]]) + w1 <- rownames(DA_list[[main_id]][order(DA_list[[main_id]]$P.Value)[1:top_number],]) ## display ID + w2 <- gsub('_TF','',w1); w2 <- gsub('_SIG','',w2) + # get display + dd_DA <- do.call(base::cbind,lapply(DA_name,function(x){ + x1 <- DA_list[[x]] + x2 <- x1[w1,] + x2[,DA_display_col] + })) + dd_DE <- do.call(base::cbind,lapply(DE_name,function(x){ + x1 <- DE_list[[x]] + x2 <- x1[w2,] + x2[,DE_display_col] + })) + colnames(dd_DA) <- DA_name; rownames(dd_DA) <- w1; + colnames(dd_DE) <- DE_name; rownames(dd_DE) <- w1; + # get Z + dd_DA_Z <- do.call(base::cbind,lapply(DA_name,function(x){ + x1 <- DA_list[[x]] + x2 <- x1[w1,] + x2[,z_col] + })) + dd_DE_Z <- do.call(base::cbind,lapply(DE_name,function(x){ + x1 <- DE_list[[x]] + x2 <- x1[w2,] + x2[,z_col] + })) + colnames(dd_DA_Z) <- DA_name; rownames(dd_DA_Z) <- w1; + colnames(dd_DE_Z) <- DE_name; #rownames(dd_DE_Z) <- w1; + # + mat1 <- signif(dd_DA,digits=digit_num); mat2 <- signif(dd_DE,digits=digit_num); + if(base::length(DA_list)==1) {mat1 <- as.matrix(mat1); dd_DA_Z<-as.matrix(dd_DA_Z);} + if(base::length(DE_list)==1) {mat2 <- as.matrix(mat2); dd_DE_Z<-as.matrix(dd_DE_Z);} + plot_part <- function(ori=FALSE,before_off=FALSE){ + geneWidth <- base::max(strwidthMod(rownames(dd_DA_Z),units='inch',cex=row_cex,ori=ori))*1.05+par.char2inch()[1] + textWidth1 <- base::max(strwidthMod(as.character(mat1),units='inch',cex=text_cex,ori=ori))*ncol(mat1)*1.5 + geneHeight1 <- base::max(strheightMod(rownames(dd_DA_Z),units='inch',cex=row_cex,ori=ori))*nrow(mat1)*1.75 + textHeight1 <- base::max(strheightMod(as.character(mat1),units='inch',cex=text_cex,ori=ori))*nrow(mat1)*1.75 + + textWidth2 <- base::max(strwidthMod(as.character(mat2),units='inch',cex=text_cex,ori=ori))*ncol(mat2)*1.5 + textHeight2 <- base::max(strheightMod(as.character(mat2),units='inch',cex=text_cex,ori=ori))*nrow(mat2)*1.75 + + geneHeight <- base::max(c(geneHeight1,textHeight1,textHeight2)) + + + if(before_off==TRUE) dev.off() + # geneWidth|textWidth1|0.3|textWidth2|par.char2inch()[1]*8 + # 1|geneHeight|0.5 + if(is.null(pdf_file)==FALSE) pdf(pdf_file,width=geneWidth+textWidth1+0.3+textWidth2+par.char2inch()[1]*8,height=2+geneHeight) + graphics::layout(t(as.matrix(c(rep(1,ncol(mat1)),rep(2,ncol(mat2)))))) + par(mai=c(1,geneWidth,1,0)) + mm <- max(abs(c(dd_DE_Z,dd_DA_Z)),na.rm=TRUE) + + draw.heatmap.local(dd_DA_Z,inner_line=TRUE,out_line=TRUE,col_srt=col_srt,display_text_mat=mat1, + row_cex=row_cex,column_cex=column_cex,text_cex=text_cex,text_col='down',inner_col='white',bb_max=mm) + pp <- par()$usr; graphics::rect(xleft=pp[1],xright=pp[2],ybottom=pp[4],ytop=pp[4]+2*(pp[4]-pp[3])/nrow(mat1),border='black',col='light grey',xpd=TRUE); + DA_width <- base::max(strwidthMod(sprintf('(Differential activity:%s)',DA_display_col),units='inch',cex=1,ori=ori))*1.05 + if(textWidth1>DA_width){ + graphics::text(x=pp[1]/2+pp[2]/2,y=pp[4]+1*(pp[4]-pp[3])/nrow(mat1),labels=sprintf('NetBID\n(Differential activity:%s)',DA_display_col),xpd=TRUE,cex=column_cex) + }else{ + graphics::text(x=pp[1]/2+pp[2]/2,y=pp[4]+1*(pp[4]-pp[3])/nrow(mat1),labels=sprintf('NetBID\n(DA:%s)',DA_display_col),xpd=TRUE,cex=column_cex) + } + DE_width <- base::max(strwidthMod('Differential expression',units='inch',cex=1,ori=ori))*1.05 + + par(mai=c(1,0.3,1,par.char2inch()[1]*8)) + draw.heatmap.local(dd_DE_Z,inner_line=TRUE,out_line=TRUE,col_srt=col_srt,display_text_mat=mat2, + row_cex=row_cex,column_cex=column_cex,text_cex=text_cex,text_col='down',inner_col='white',bb_max=mm) + pp <- par()$usr; graphics::rect(xleft=pp[1],xright=pp[2],ybottom=pp[4],ytop=pp[4]+2*(pp[4]-pp[3])/nrow(mat1),border='black',col='light grey',xpd=TRUE); + if(textWidth2>DA_width){ + graphics::text(x=pp[1]/2+pp[2]/2,y=pp[4]+1*(pp[4]-pp[3])/nrow(mat1),labels=sprintf('Differential expression:\n%s',DE_display_col),xpd=TRUE,cex=column_cex) + }else{ + graphics::text(x=pp[1]/2+pp[2]/2,y=pp[4]+1*(pp[4]-pp[3])/nrow(mat1),labels=sprintf('DE:\n%s',DE_display_col),xpd=TRUE,cex=column_cex) + } + + #legend + cc1 <- grDevices::colorRampPalette(c(brewer.pal(9,'Blues')[8],'white',brewer.pal(9,'Reds')[7]))(5) + draw.colorbar(col=rev(cc1),min_val=-mm,max_val=mm, + n=5,xleft=pp[2]+par.char2pos()[1]*2, + xright=pp[2]+par.char2pos()[1]*3,ytop=pp[3]+(pp[4]-pp[3])/2,ybottom=pp[3]+(pp[4]-pp[3])/2-par.char2pos()[2]*5*text_cex*0.8,cex=text_cex*0.8) + } + if(is.null(pdf_file)==FALSE){plot_part(ori=TRUE);plot_part(ori=TRUE,before_off=TRUE);while (!is.null(dev.list())) dev.off()} else {plot_part()} + return(TRUE) +} + +### local functions to draw heatmap +draw.heatmap.local <- function(mat,inner_line=FALSE,out_line=TRUE,inner_col='black', + n=20,col_srt=60,display_text_mat=NULL,row_cex=1,column_cex=1,text_cex=1,text_col='up', + bb_max=NULL){ + if(ncol(mat)>1) mat1 <- mat[nrow(mat):1,] else mat1 <- as.matrix(mat[nrow(mat):1,]) + if(is.null(display_text_mat)==FALSE){ + if(ncol(display_text_mat)>1) display_text_mat <- display_text_mat[nrow(display_text_mat):1,] else display_text_mat <- as.matrix(display_text_mat[nrow(display_text_mat):1,]) + } + colnames(mat1) <- colnames(mat) + cc1 <- grDevices::colorRampPalette(c(brewer.pal(9,'Blues')[8],'white',brewer.pal(9,'Reds')[7]))(n*2) + if(is.null(bb_max)==TRUE) bb_max <- base::max(abs(mat1),na.rm=TRUE) + bb1 <- base::seq(0,bb_max,length.out=n) + #mat1[which(is.na(mat1)==TRUE)] <- 0; + if(out_line==TRUE) graphics::image(t(mat1),col=cc1,breaks=sort(c(-bb1,0,bb1)),xaxt='n',yaxt='n') + if(out_line==FALSE) graphics::image(t(mat1),col=cc1,breaks=sort(c(-bb1,0,bb1)),xaxt='n',yaxt='n',bty='n') + pp <- par()$usr + xx <- base::seq(pp[1],pp[2],length.out=1+ncol(mat1)) + yy <- base::seq(pp[3],pp[4],length.out=1+nrow(mat1)) + xxx <- xx[1:(base::length(xx)-1)]/2+xx[2:base::length(xx)]/2 + yyy <- yy[1:(base::length(yy)-1)]/2+yy[2:base::length(yy)]/2 + if(inner_line==TRUE){ + graphics::abline(v=xx,col=inner_col) + graphics::abline(h=yy,col=inner_col) + } + graphics::text(pp[1]-par.char2pos()[1],yyy,rownames(mat1),adj=1,xpd=TRUE,cex=row_cex) ## distance for one character + if(text_col=='up'){ + if(col_srt==0){ + graphics::text(xxx,pp[4],colnames(mat1),pos=3,xpd=TRUE,srt=col_srt,cex=column_cex) ## do not use pos, for srt? + }else{ + graphics::text(xxx,pp[4]+base::max(strheightMod(colnames(mat1))/2)+base::max(strheightMod(colnames(mat1))/10),colnames(mat1),adj=0.5-col_srt/90,xpd=TRUE,srt=col_srt,cex=column_cex) ## do not use pos, for srt? + } + } + if(text_col=='down'){ + if(col_srt!=0) graphics::text(xxx,pp[3]-0.1*(pp[4]-pp[3])/nrow(mat1),colnames(mat1),adj=1,xpd=TRUE,srt=col_srt,cex=column_cex) + if(col_srt==0) graphics::text(xxx,pp[3]-0.1*(pp[4]-pp[3])/nrow(mat1),colnames(mat1),adj=0.5,xpd=TRUE,srt=col_srt,cex=column_cex) + } + if(is.null(display_text_mat)==FALSE){ + for(i in 1:nrow(display_text_mat)){ + for(j in 1:ncol(display_text_mat)){ + graphics::text(xxx[j],yyy[i],display_text_mat[i,j],cex=text_cex) + } + } + } + return(TRUE) +} +draw.colorbar <- function(col=NULL,min_val=NULL,max_val=NULL,n=5,digit_num=2,direction='vertical',xleft=0,xright=1,ytop=1,ybottom=0,cex=1){ + val_bar<-base::seq(max_val,min_val,length.out = n) + if(is.null(col)==TRUE) col <- z2col(val_bar) + if(direction=='vertical'){ + y_pos <- base::seq(ytop,ybottom,length.out=n+1) + yy <- y_pos[2:base::length(y_pos)]/2+y_pos[1:(base::length(y_pos)-1)]/2 + graphics::rect(xleft=xleft,xright=xright,ytop=y_pos[2:base::length(y_pos)],ybottom=y_pos[1:(base::length(y_pos)-1)],col=col,border='light grey',xpd=TRUE) + graphics::text(xright,yy,signif(val_bar,digits = digit_num),xpd=TRUE,cex=cex,pos=4) + graphics::text(xleft/2+xright/2,ytop,'Z value',cex=cex,xpd=TRUE,pos=3) + } +} + +#' Draw Heatmap Plot to Display the Expression Level or Activity Level of Genes and Drivers +#' +#' \code{draw.heatmap} plots the heatmap to see the expression level or activity level of genes and drivers across selected samples. +#' +#' @param mat numeric matrix, the expression/activity matrix. Rows are genes or drivers, columns are selected samples. +#' @param use_genes a vector of characters, selected genes (e.g. "originID"). Default is row names of \code{mat}. +#' @param use_gene_label a vector of characters, a vector of labels for \code{use_genes} (e.g. "geneSymbol" or "gene_label"). Default is \code{use_genes}. +#' @param use_samples a vector of characters, selected samples. Default is column names of \code{mat}. +#' @param use_sample_label a vector of characters, a vector of labels for \code{use_samples}. Default is \code{use_samples}. +#' @param phenotype_info data.frame, phenotype of samples. Users can call \code{Biobase::pData(eset)} to create. +#' The row names should match the column names in \code{mat}. Default is NULL. +#' @param use_phe a list of characters, selected phenotype of samples. A subset of columns from \code{phenotype_info}. +#' Default is NULL. +#' @param main character, an overall title for the plot. Default is "". +#' @param scale character, users can choose from "row", "column" and "none". Indicating if the values should be +#' centered and scaled in either the row direction or the column direction, or none. Default is "none". +#' @param pdf_file character, the file path to save plot as PDF file. If NULL, no PDF file will be saved. Default is NULL. +#' @param cluster_rows,cluster_columns logical, the same parameters in \code{Heatmap}. +#' Please check \code{?Heatmap} for more details. Default is TRUE. +#' @param clustering_distance_rows,clustering_distance_columns character, the same parameters used in \code{Heatmap}. +#' Please check \code{?Heatmap} for more details. Default is "pearson". +#' @param show_row_names,show_column_names logical, the same parameters used in \code{Heatmap}. +#' Please check \code{?Heatmap} for more details. Default is TRUE. +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' @param ..., for more options, please check \code{?Heatmap} for more details. +#' +#' @return Return a logical value. If TRUE, the plot has been created successfully. +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1/driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' exp_mat <- Biobase::exprs(analysis.par$cal.eset) ## rownames matches originalID +#' ac_mat <- Biobase::exprs(analysis.par$merge.ac.eset) ## rownames matches originalID_label +#' phe_info <- Biobase::pData(analysis.par$cal.eset) ## phenotype information +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' sig_driver <- sig_driver[1:10,] +#' draw.heatmap(mat=exp_mat,use_genes=ms_tab[rownames(sig_driver),'originalID'], +#' use_gene_label=ms_tab[rownames(sig_driver),'gene_label'], +#' use_samples=colnames(exp_mat), +#' use_sample_label=phe_info[colnames(exp_mat),'geo_accession'], +#' phenotype_info=phe_info,use_phe=c('gender','pathology','subgroup'), +#' main='Expression for Top drivers',scale='row', +#' cluster_rows=TRUE,cluster_columns=TRUE, +#' clustering_distance_rows='pearson', +#' clustering_distance_columns='pearson', +#' row_names_gp = gpar(fontsize = 12), +#' pre_define=c('WNT'='blue','SHH'='red','G4'='green')) +#' draw.heatmap(mat=ac_mat,use_genes=ms_tab[rownames(sig_driver),'originalID_label'], +#' use_gene_label=ms_tab[rownames(sig_driver),'gene_label'], +#' use_samples=colnames(ac_mat), +#' use_sample_label=phe_info[colnames(ac_mat),'geo_accession'], +#' phenotype_info=phe_info,use_phe=c('gender','pathology','subgroup'), +#' main='Activity for Top drivers',scale='row', +#' cluster_rows=TRUE,cluster_columns=TRUE, +#' clustering_distance_rows='pearson', +#' clustering_distance_columns='pearson', +#' row_names_gp = gpar(fontsize = 6), +#' pre_define=c('WNT'='blue','SHH'='red','G4'='green')) +#' +#' \dontrun{ +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1/driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' exp_mat <- Biobase::exprs(analysis.par$cal.eset) ## rownames matches originalID +#' ac_mat <- Biobase::exprs(analysis.par$merge.ac.eset) ## rownames matches originalID_label +#' phe_info <- Biobase::pData(analysis.par$cal.eset) ## phenotype information +#' analysis.par$out.dir.PLOT <- getwd() ## directory for saving the pdf files +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' draw.heatmap(mat=exp_mat,use_genes=ms_tab[rownames(sig_driver),'originalID'], +#' use_gene_label=ms_tab[rownames(sig_driver),'gene_label'], +#' use_samples=colnames(exp_mat), +#' use_sample_label=phe_info[colnames(exp_mat),'geo_accession'], +#' phenotype_info=phe_info, +#' use_phe=c('gender','pathology','subgroup'), +#' main='Expression for Top drivers',scale='row', +#' cluster_rows=TRUE,cluster_columns=TRUE, +#' clustering_distance_rows='pearson', +#' clustering_distance_columns='pearson', +#' row_names_gp = gpar(fontsize = 12), +#' pdf_file=sprintf('%s/heatmap_demo1.pdf', +#' analysis.par$out.dir.PLOT), +#' pre_define=c('WNT'='blue','SHH'='red','G4'='green')) +#' draw.heatmap(mat=ac_mat,use_genes=ms_tab[rownames(sig_driver),'originalID_label'], +#' use_gene_label=ms_tab[rownames(sig_driver),'gene_label'], +#' use_samples=colnames(ac_mat), +#' use_sample_label=phe_info[colnames(ac_mat),'geo_accession'], +#' phenotype_info=phe_info, +#' use_phe=c('gender','pathology','subgroup'), +#' main='Activity for Top drivers',scale='row', +#' cluster_rows=TRUE,cluster_columns=TRUE, +#' clustering_distance_rows='pearson', +#' clustering_distance_columns='pearson', +#' row_names_gp = gpar(fontsize = 6), +#' pdf_file=sprintf('%s/heatmap_demo2.pdf', +#' analysis.par$out.dir.PLOT), +#' pre_define=c('WNT'='blue','SHH'='red','G4'='green')) +#'} +#' @export +draw.heatmap <- function(mat=NULL,use_genes=rownames(mat),use_gene_label=use_genes,use_samples=colnames(mat),use_sample_label=use_samples, + phenotype_info=NULL,use_phe=NULL,main="",scale='none',pdf_file=NULL, + cluster_rows=TRUE,cluster_columns=TRUE, + show_row_names=TRUE,show_column_names=TRUE, + clustering_distance_rows='pearson',clustering_distance_columns='pearson', + use_color=NULL,pre_define=NULL, + ...){ + # + all_input_para <- c('mat','use_genes','use_gene_label','use_samples','use_sample_label','main','scale', + 'cluster_rows','cluster_columns','show_row_names','show_column_names','clustering_distance_rows','clustering_distance_columns') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('cluster_rows',c(TRUE,FALSE),envir=environment()), + check_option('cluster_columns',c(TRUE,FALSE),envir=environment()), + check_option('show_row_names',c(TRUE,FALSE),envir=environment()), + check_option('show_column_names',c(TRUE,FALSE),envir=environment()), + check_option('scale',c("none", "row",'column'),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + names(use_gene_label) <- use_genes + names(use_sample_label) <- use_samples + if(is.null(rownames(phenotype_info))==FALSE){ + ori_phenotype_info <- phenotype_info + phenotype_info <- as.data.frame(phenotype_info[colnames(mat),],stringsAsFactors=FALSE) + colnames(phenotype_info) <- colnames(ori_phenotype_info) + } + for(i in colnames(phenotype_info)){ + phenotype_info[,i] <- clean_charVector(phenotype_info[,i]) + } + if(exists('row_names_gp')==FALSE) row_names_gp <- gpar(fontsize = 12) + if(exists('column_names_gp')==FALSE) column_names_gp <- gpar(fontsize = 12) + use_genes <- base::intersect(use_genes,rownames(mat)) + use_samples <- base::intersect(use_samples,colnames(mat)) + use_mat <- mat[use_genes,use_samples] + rownames(use_mat) <- use_gene_label[rownames(use_mat)] + colnames(use_mat) <- use_sample_label[colnames(use_mat)] + row_names_max_width <- base::max(strwidthMod(rownames(use_mat),'inches',cex=row_names_gp[[1]]/7)) + row_names_max_width <- unit(row_names_max_width,'inches') + column_names_max_height <- base::max(strwidthMod(colnames(use_mat),'inches',cex=column_names_gp[[1]]/7)) + column_names_max_height <- unit(column_names_max_height,'inches') + if(scale=='row'){use_mat <- t(apply(use_mat,1,do.std))} + if(scale=='column'){use_mat <- apply(use_mat,2,do.std)} + if(base::length(use_phe)==0){ + if(scale=='none'){ + ht1 <- ComplexHeatmap::Heatmap(use_mat, column_title = main,name='Raw value', + cluster_rows=cluster_rows,cluster_columns=cluster_columns, + clustering_distance_rows=clustering_distance_rows,clustering_distance_columns=clustering_distance_columns, + show_row_names=show_row_names,show_column_names=show_column_names, + row_names_max_width=row_names_max_width,column_names_max_height=column_names_max_height,...) + } + if(scale!='none'){ + ht1 <- ComplexHeatmap::Heatmap(use_mat, column_title = main, name='Z value', + cluster_rows=cluster_rows,cluster_columns=cluster_columns, + clustering_distance_rows=clustering_distance_rows,clustering_distance_columns=clustering_distance_columns, + show_row_names=show_row_names,show_column_names=show_column_names, + row_names_max_width=row_names_max_width,column_names_max_height=column_names_max_height,...) + } + }else{ + if(base::length(use_phe)==1){ + use_phe_info <- as.data.frame(phenotype_info[,use_phe],stringsAsFactors=FALSE) + rownames(use_phe_info) <- rownames(phenotype_info) + colnames(use_phe_info) <- gsub(' ','.',use_phe) + }else{ + use_phe_info <- phenotype_info[,use_phe] + colnames(use_phe_info) <- gsub(' ','.',use_phe) + } + use_phe <- colnames(use_phe_info) + l2c <- get.class.color(base::unique(as.character(as.matrix(use_phe_info))),use_color=use_color,pre_define=pre_define) + use_col <- lapply(use_phe,function(x)l2c[base::unique(use_phe_info[,x])]) + names(use_col) <- use_phe + ha_column <- ComplexHeatmap::HeatmapAnnotation(df = data.frame(use_phe_info),col = use_col) + if(scale=='none'){ + ht1 <- ComplexHeatmap::Heatmap(use_mat, column_title = main, top_annotation = ha_column,name='Raw value', + cluster_rows=cluster_rows,cluster_columns=cluster_columns, + show_row_names=show_row_names,show_column_names=show_column_names, + clustering_distance_rows=clustering_distance_rows,clustering_distance_columns=clustering_distance_columns, + row_names_max_width=row_names_max_width,column_names_max_height=column_names_max_height,...) + } + if(scale!='none'){ + ht1 <- ComplexHeatmap::Heatmap(use_mat, column_title = main, top_annotation = ha_column,name='Z value', + cluster_rows=cluster_rows,cluster_columns=cluster_columns, + clustering_distance_rows=clustering_distance_rows,clustering_distance_columns=clustering_distance_columns, + show_row_names=show_row_names,show_column_names=show_column_names, + row_names_max_width=row_names_max_width,column_names_max_height=column_names_max_height,...) + } + } + ht_list <- ht1 + if(is.null(pdf_file)==FALSE){ + ww <- 1.25*column_names_gp[[1]]/72*ncol(use_mat)+base::max(strwidthMod(rownames(use_mat),'inches',ori=TRUE))+5 + hh <- 1.25*row_names_gp[[1]]/72*nrow(use_mat)+base::max(strwidthMod(colnames(use_mat),'inches',ori=TRUE))+3 + pdf(pdf_file,width=ww,height=hh) + } + ComplexHeatmap::draw(ht_list,heatmap_legend_side='left',annotation_legend_side='right') + if(is.null(pdf_file)==FALSE) {while (!is.null(dev.list())) dev.off();} + return(TRUE) +} + +################################ Function enrichment related functions + +## +#' Merge Selected Major GeneSets from MsigDB +#' +#' \code{merge_gs} combines selected major gene set collections (e.g. "H", "C1" ) together, and return a list object with sub-collections as elements. +#' Each element contains a vector of genes belong to that sub-collection gene set. +#' +#' @param all_gs2gene list, the list returned by \code{gs.preload()}. +#' @param use_gs a vector of characters, names of major gene set collections. Users can call \code{all_gs2gene_info} to see all the available collections. +#' Default is c("H", "CP:BIOCARTA", "CP:REACTOME", "CP:KEGG"). +#' @return Return a list object with sub-collection gene sets as elements. Each element contains a vector of genes. +#' @examples +#' gs.preload(use_spe='Homo sapiens',update=FALSE) +#' use_gs2gene <- merge_gs(all_gs2gene=all_gs2gene, +#' use_gs=c('H','CP:BIOCARTA','CP:REACTOME','CP:KEGG','C5')) +#' +#' @export +merge_gs <- function(all_gs2gene=all_gs2gene,use_gs=c('H','CP:BIOCARTA','CP:REACTOME','CP:KEGG','C5')){ + # + all_input_para <- c('all_gs2gene') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(use_gs)==TRUE | 'all' %in% use_gs){ ## all gene sets + use_gs <- base::unique(all_gs2gene_info$Category) + } + # + use_gs <- base::unique(use_gs) + use_gs <- base::intersect(use_gs,names(all_gs2gene)) + if(base::length(use_gs)==0){ + message('Wrong setting for use_gs, no intersection with names(all_gs2gene), please check and re-try!');return(FALSE); + } + message(sprintf('%s will be merged !',paste(use_gs,collapse=';'))) + nn <- unlist(lapply(all_gs2gene[use_gs],names)) + use_gs2gene <- unlist(all_gs2gene[use_gs],recursive = FALSE) + names(use_gs2gene)<-nn + if(base::length(base::unique(nn))0){ + message(sprintf('Input %s not in all_gs2gene, please check all_gs2gene_info (items in Category or Sub-Category) and re-try!', + base::paste(base::setdiff(use_gs,c(all_gs2gene_info$Category,all_gs2gene_info$`Sub-Category`)),collapse=';'))); + return(FALSE) + } + if(base::length(use_gs)>1){ + gs2gene <- merge_gs(all_gs2gene,use_gs = use_gs) + }else{ + gs2gene <- all_gs2gene[[use_gs]] + } + }else{ + if(is.null(use_gs)==TRUE){ + use_gs <- 'all' + } + if(base::length(use_gs)>1){ + if(class(gs2gene[[1]])=='list') gs2gene <- merge_gs(gs2gene,use_gs = use_gs) + }else{ + if(use_gs == 'all'){ + if(class(gs2gene[[1]])=='list') gs2gene <- merge_gs(gs2gene,use_gs = names(gs2gene)) + }else{ + if(class(gs2gene[[1]])=='list') gs2gene <- gs2gene[[use_gs]] + } + } + } + all_gs <- names(gs2gene) + input_list <- base::unique(input_list) + bg_list <- base::unique(bg_list) + if(!is.null(bg_list)){ + use_gs2gene <- lapply(gs2gene,function(x){base::intersect(x,bg_list)}) + names(use_gs2gene) <- names(gs2gene) + }else{ + use_gs2gene <- gs2gene + } + bg_list <- base::unique(unlist(use_gs2gene)) + ## size selection + s1 <- unlist(lapply(use_gs2gene,length)) + w1 <- which(s1>=min_gs_size & s1<=max_gs_size) + use_gs2gene <- use_gs2gene[w1] + all_gs <- names(use_gs2gene) ## all tested gene set number + ## input filter + input_list <- base::intersect(input_list,bg_list) + bg_or <- base::length(input_list)/base::length(bg_list) + s1 <- unlist(lapply(use_gs2gene,function(x){ + base::length(base::intersect(input_list,x))/base::length(x) + })) + w1 <- which(s1>bg_or) + use_gs2gene <- use_gs2gene[w1] + empty_vec <- as.data.frame(matrix(NA,ncol=9));colnames(empty_vec) <- c('#Name','Total_item','Num_item','Num_list','Num_list_item','Ori_P','Adj_P','Odds_Ratio','Intersected_items') + if(base::length(w1)==0) return(empty_vec) + ## fisher~ + pv <- lapply(use_gs2gene,function(x){ + n11 <- base::length(base::intersect(input_list,x)) + n12 <- base::length(base::intersect(input_list,base::setdiff(bg_list,x))) + n21 <- base::length(base::setdiff(x,input_list)) + n22 <- base::length(base::setdiff(bg_list,base::unique(c(input_list,x)))) + ft <- fisher.test(base::cbind(c(n11,n12),c(n21,n22)))$p.value + or <- n11/n12/(n21/n22) + c(base::length(bg_list),base::length(x),base::length(input_list),n11,ft,or,base::paste(base::intersect(input_list,x),collapse=';')) + }) + pv <- do.call(rbind,pv) + pv <- as.data.frame(pv,stringsAsFactors=FALSE) + colnames(pv) <- c('Total_item','Num_item','Num_list','Num_list_item','Ori_P','Odds_Ratio','Intersected_items') + pv[1:6] <- lapply(pv[1:6],as.numeric) + pv$Adj_p <- p.adjust(pv$Ori_P,method=Pv_adj,n=base::length(all_gs)) + pv$`#Name` <- rownames(pv) + pv <- pv[,c(9,1:5,8,6:7)] + pv <- pv[order(pv$Ori_P),] + use_pv <- pv[which(pv$Adj_p<=Pv_thre),] + return(use_pv) +} + +#' Bar Plot for Gene Set Enrichment Analysis Result +#' +#' \code{draw.funcEnrich.bar} draws a horizontal bar plot to visualize the gene set enrichment analysis. +#' Users can choose to display P-values and the top intersected genes from each gene set. +#' +#' @param funcEnrich_res data.frame, containing the result of functional enrichment analysis. +#' It is highly suggested to use \code{funcEnrich.Fisher} to create this data frame. +#' If users decided to prepare the data.frame on their own, please make sure the column names match the following parameters. +#' @param top_number numeric, the number of top enriched gene sets to be displayed. Default is 30. +#' @param Pv_col character, the name of the column in \code{funcEnrich_res} which contains P-value. Default is "Ori_P". +#' @param name_col character, the name of the column in \code{funcEnrich_res} which contains gene set name. Default is "#Name". +#' @param item_col character, the name of the column in \code{funcEnrich_res} which contains intersected genes collapsed with ";". +#' Default is "Intersected_items". +#' @param Pv_thre numeric, threshold of P-values. Genes or drivers with P-values lower than the threshold will be kept. Default is 0.1. +#' @param display_genes logical, if TRUE, the intersected genes will be displayed. Default is FALSE. +#' @param gs_cex numeric, giving the amount by which the text of gene sets names should be magnified relative to the default. Default is 0.5. +#' @param gene_cex numeric, giving the amount by which the text of gene symbols should be magnified relative to the default. Default is 0.5. +#' @param main character, an overall title for the plot. +#' @param bar_col character, the color code used to plot the bars. Default is brewer.pal(8,'RdBu')[7]. +#' @param eg_num numeric, the number of intersected gene symbols to display on the right side of the bar. Default is 5. +#' @param pdf_file character, the file path to save as PDF file. If NULL, no PDF file will be saved. Default is NULL. +#' +#' @return Return a logical value. If TRUE, the plot has been created successfully. +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' gs.preload(use_spe='Homo sapiens',update=FALSE) +#' res1 <- funcEnrich.Fisher(input_list=ms_tab[rownames(sig_driver),'geneSymbol'], +#' bg_list=ms_tab[,'geneSymbol'], +#' use_gs=c('H','C5'),Pv_thre=0.1, +#' Pv_adj = 'none') +#' draw.funcEnrich.bar(funcEnrich_res=res1,top_number=5, +#' main='Function Enrichment for Top drivers', +#' gs_cex=0.4,gene_cex=0.5) +#' draw.funcEnrich.bar(funcEnrich_res=res1,top_number=3, +#' main='Function Enrichment for Top drivers', +#' display_genes = TRUE,eg_num=3, +#' gs_cex=0.3,gene_cex=0.3) +#' \dontrun{ +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' gs.preload(use_spe='Homo sapiens',update=FALSE) +#' res1 <- funcEnrich.Fisher(input_list=ms_tab[rownames(sig_driver),'geneSymbol'], +#' bg_list=ms_tab[,'geneSymbol'], +#' use_gs=c('H','C5'),Pv_thre=0.1,Pv_adj = 'none') +#' analysis.par$out.dir.PLOT <- getwd() ## directory for saving the pdf files +#' draw.funcEnrich.bar(funcEnrich_res=res1,top_number=30, +#' main='Function Enrichment for Top drivers', +#' pdf_file=sprintf('%s/funcEnrich_bar_nogene.pdf', +#' analysis.par$out.dir.PLOT)) +#' draw.funcEnrich.bar(funcEnrich_res=res1,top_number=30, +#' main='Function Enrichment for Top drivers', +#' display_genes = TRUE,gs_cex=0.6, +#' pdf_file=sprintf('%s/funcEnrich_bar_withgene.pdf', +#' analysis.par$out.dir.PLOT)) +#' } +#' @export +draw.funcEnrich.bar <- function(funcEnrich_res=NULL,top_number=30, + Pv_col='Ori_P',item_col='Intersected_items', + Pv_thre=0.1,display_genes=FALSE,name_col='#Name', + gs_cex=0.5,gene_cex=0.5,main="",bar_col=brewer.pal(8,'RdBu')[7],eg_num=5, + pdf_file=NULL){ + # + all_input_para <- c('funcEnrich_res','Pv_col','item_col','Pv_thre','display_genes','name_col', + 'gs_cex','gene_cex','main','bar_col','eg_num') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('display_genes',c(TRUE,FALSE),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(top_number)==TRUE) top_number <- nrow(funcEnrich_res) + funcEnrich_res <- funcEnrich_res[which(funcEnrich_res[,Pv_col]<=Pv_thre),] + if(nrow(funcEnrich_res)>top_number) funcEnrich_res <- funcEnrich_res[1:top_number,] + pv_val <- funcEnrich_res[,Pv_col] + s1 <- funcEnrich_res[[item_col]] + s1 <- sapply(s1,function(x){ + x1 <- unlist(strsplit(x,';')) + if(base::length(x1)>eg_num){x2 <- sprintf('Total %d items, e.g: %s',base::length(x1),base::paste(x1[1:5],collapse=';'));x<-x2;} + return(x) + }) + ## plot design (inch) + # W: |textWidth|main(4)|textWidth1| + # H: |1|textHeight|1 + plot_part <- function(ori=FALSE,before_off=FALSE){ + #print(strwidth('W',units = 'inch')) + textWidth <- base::max(strwidthMod(funcEnrich_res[,name_col],units='inch',cex=gs_cex,ori=ori))+par.char2inch()[1] + textWidth1 <- base::max(strwidthMod(s1,units='inch',cex=gene_cex,ori=ori))+par.char2inch()[1] + each_textHeight <- base::max(strheightMod(funcEnrich_res[,name_col],units='inch',cex=gs_cex)) + if(display_genes==TRUE) each_textHeight <- base::max(each_textHeight,base::max(strheightMod(s1,units='inch',cex=gene_cex))) + textHeight <- nrow(funcEnrich_res)*1.5*each_textHeight*1.04 # default space=0.2, set 0.5 here + if(before_off==TRUE) dev.off() + if(is.null(pdf_file)==FALSE){ + if(display_genes==TRUE){ + ww <- textWidth+4+textWidth1; hh <- 2+textHeight + }else{ + ww <- textWidth+4+1; hh <- 2+textHeight + } + #print(sprintf('pdf_file with width: %s and height %s',signif(ww,2),signif(hh,2))) + pdf(pdf_file,width=ww,height=hh) + } + if(display_genes==TRUE) par(mai=c(1,textWidth,1,textWidth1)) else par(mai=c(1,textWidth,1,1)) + a<-graphics::barplot(rev(-log10(pv_val)),horiz=TRUE,border = NA,col=bar_col,main=main,xaxt='n', + xlim=c(0,max(-log10(pv_val))),space=0.5,width=each_textHeight*par.inch2pos()[2]) + graphics::mtext(side=1,'P-value',line=0.5/par.lineHeight2inch(),xpd=TRUE) ## middle position + graphics::axis(side=1,at=0:round(par()$usr[2]),labels=10^-(0:round(par()$usr[2])),xpd=TRUE) + graphics::text(par()$usr[1]-0.5*par.char2pos()[1],a,rev(funcEnrich_res[,name_col]),adj=1,xpd=TRUE,cex=gs_cex); + if(display_genes==TRUE) graphics::text(rev(-log10(pv_val))+0.5*par.char2pos()[1],a,rev(s1),adj=0,xpd=TRUE,cex=gene_cex) + } + if(is.null(pdf_file)==FALSE){plot_part(ori=TRUE);plot_part(ori=TRUE,before_off=TRUE);while (!is.null(dev.list())) dev.off()} else {plot_part()} + return(TRUE) +} + +#' Cluster Plot for Gene Set Enrichment Analysis Result +#' +#' \code{draw.funcEnrich.cluster} draws a cluster plot based on binary matrix, to visualize the existence of genes in the enriched gene sets. +#' The P-value of enrichment is also displayed in the plot. +#' +#' @param funcEnrich_res data.frame, containing the result of functional enrichment analysis. +#' It is highly suggested to use \code{funcEnrich.Fisher} to create this data frame. +#' If users decided to prepare the data.frame on their own, please make sure the column names match the following parameters. +#' @param top_number numeric, the number of top enriched gene sets to be displayed. Default is 30. +#' @param Pv_col character, the name of the column in \code{funcEnrich_res} which contains P-value. Default is "Ori_P". +#' @param name_col character, the name of the column in \code{funcEnrich_res} which contains gene set name. Default is "#Name". +#' @param item_col character, the name of the column in \code{funcEnrich_res} which contains intersected genes collapsed with ";". +#' Default is "Intersected_items". +#' @param Pv_thre numeric, threshold of P-values. Genes or drivers with P-values lower than the threshold will be kept. Default is 0.1. +#' @param gs_cex numeric, giving the amount by which the text of gene sets names should be magnified relative to the default. Default is 0.5. +#' @param gene_cex numeric, giving the amount by which the text of gene symbols should be magnified relative to the default. Default is 0.8. +#' @param pv_cex numeric, giving the amount by which the text of P-values should be magnified relative to the default. Default is 0.7. +#' @param main character, an overall title for the plot. +#' @param h numeric, the height where the cluster tree should be cut. The same parameter as \code{cutree}. Default is 0.95. +#' @param cluster_gs logical, if TRUE, gene sets will be clustered. Default is TRUE. +#' @param cluster_gene logical, if TRUE, genes will be clustered. Default is TRUE. +#' @param use_genes a vector of characters, a vector of gene symbols to display. +#' If NULL, all the genes in the top enriched gene sets will be displayed. Default is NULL. +#' @param pdf_file character, the file path to save as PDF file. If NULL, no PDF file will be saved. Default is NULL. +#' @param return_mat logical, if TRUE, return a binary matrix. Rows are gene sets, columns are genes. Default if FALSE. +#' +#' @return If \code{return_mat==FALSE}, return a logical value. If TRUE, plot has been created successfully. +#' If \code{return_mat == TRUE}, return a binary matrix of the cluster. Rows are gene sets, columns are genes. +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' gs.preload(use_spe='Homo sapiens',update=FALSE) +#' res1 <- funcEnrich.Fisher(input_list=ms_tab[rownames(sig_driver),'geneSymbol'], +#' bg_list=ms_tab[,'geneSymbol'], +#' use_gs=c('H','C5'),Pv_thre=0.1,Pv_adj = 'none') +#' draw.funcEnrich.cluster(funcEnrich_res=res1,top_number=30,gs_cex = 0.5, +#' gene_cex=0.9,pv_cex=0.8) +#' draw.funcEnrich.cluster(funcEnrich_res=res1,top_number=10,gs_cex = 0.6, +#' gene_cex=1,pv_cex=1, +#' cluster_gs=TRUE,cluster_gene = TRUE) +#' draw.funcEnrich.cluster(funcEnrich_res=res1,top_number=15,gs_cex = 0.8, +#' gene_cex=0.9,pv_cex=0.8, +#' cluster_gs=TRUE,cluster_gene = FALSE) +#' draw.funcEnrich.cluster(funcEnrich_res=res1,top_number=20,gs_cex = 0.8, +#' gene_cex=0.9,pv_cex=0.8, +#' cluster_gs=FALSE,cluster_gene = TRUE) +#' draw.funcEnrich.cluster(funcEnrich_res=res1,top_number=20,gs_cex = 1, +#' gene_cex=1,pv_cex=0.8, +#' cluster_gs=FALSE,cluster_gene = FALSE) +#' \dontrun{ +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' gs.preload(use_spe='Homo sapiens',update=FALSE) +#' res1 <- funcEnrich.Fisher(input_list=ms_tab[rownames(sig_driver),'geneSymbol'], +#' bg_list=ms_tab[,'geneSymbol'], +#' use_gs=c('H','C5'),Pv_thre=0.1,Pv_adj = 'none') +#' analysis.par$out.dir.PLOT <- getwd() ## directory for saving the pdf files +#' draw.funcEnrich.cluster(funcEnrich_res=res1,top_number=30,gs_cex = 0.8, +#' gene_cex=0.9,pv_cex=0.8, +#' pdf_file = sprintf('%s/funcEnrich_cluster.pdf', +#' analysis.par$out.dir.PLOT)) +#' draw.funcEnrich.cluster(funcEnrich_res=res1,top_number=30,gs_cex = 1.4, +#' gene_cex=1.5,pv_cex=1.2, +#' pdf_file = sprintf('%s/funcEnrich_clusterBOTH.pdf', +#' analysis.par$out.dir.PLOT), +#' cluster_gs=TRUE,cluster_gene = TRUE) +#' draw.funcEnrich.cluster(funcEnrich_res=res1,top_number=30,gs_cex = 0.8, +#' gene_cex=0.9,pv_cex=0.8, +#' pdf_file = sprintf('%s/funcEnrich_clusterGS.pdf', +#' analysis.par$out.dir.PLOT), +#' cluster_gs=TRUE,cluster_gene = FALSE) +#' draw.funcEnrich.cluster(funcEnrich_res=res1,top_number=30,gs_cex = 0.8, +#' gene_cex=0.9,pv_cex=0.8, +#' pdf_file = sprintf('%s/funcEnrich_clusterGENE.pdf', +#' analysis.par$out.dir.PLOT), +#' cluster_gs=FALSE,cluster_gene = TRUE) +#' draw.funcEnrich.cluster(funcEnrich_res=res1,top_number=30,gs_cex = 1.5, +#' gene_cex=1.4,pv_cex=1.2, +#' pdf_file = sprintf('%s/funcEnrich_clusterNO.pdf', +#' analysis.par$out.dir.PLOT), +#' cluster_gs=FALSE,cluster_gene = FALSE) +#' } +#' @export +draw.funcEnrich.cluster <- function(funcEnrich_res=NULL,top_number=30,Pv_col='Ori_P',name_col='#Name',item_col='Intersected_items',Pv_thre=0.1, + gs_cex=0.7,gene_cex=0.8,pv_cex=0.7,main="",h=0.95,cluster_gs=TRUE,cluster_gene=TRUE, + pdf_file=NULL,use_genes=NULL,return_mat=FALSE){ + # + all_input_para <- c('funcEnrich_res','Pv_col','item_col','Pv_thre','name_col', + 'gs_cex','gene_cex','pv_cex','main','h','cluster_gs','cluster_gene','return_mat') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('cluster_gs',c(TRUE,FALSE),envir=environment()), + check_option('cluster_gene',c(TRUE,FALSE),envir=environment()), + check_option('return_mat',c(TRUE,FALSE),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(top_number)==TRUE) top_number <- nrow(funcEnrich_res) + funcEnrich_res <- funcEnrich_res[which(funcEnrich_res[,Pv_col]<=Pv_thre),] + if(nrow(funcEnrich_res)>top_number) funcEnrich_res <- funcEnrich_res[1:top_number,] + pv_val <- funcEnrich_res[,Pv_col]; names(pv_val) <- rownames(funcEnrich_res) + all_g2s <- lapply(funcEnrich_res[,item_col],function(x1)unlist(strsplit(x1,';'))) + names(all_g2s) <- funcEnrich_res[,name_col] + mat1 <- t(list2mat(all_g2s)) + mat1 <- mat1[rev(funcEnrich_res[,name_col]),] + if(is.null(use_genes)==FALSE) mat1 <- mat1[,base::intersect(colnames(mat1),use_genes)] + if(ncol(mat1)==0){message('No genes left, please check and re-try!');return(FALSE)} + #mat1 <- mat1[,order(colnames(mat1))] + h_gs <- hclust(dist(mat1,method='binary')) + h_gene <- hclust(dist(t(mat1),method='binary')) + gs_cluster <- cutree(h_gs,h=h) + gene_cluster <- cutree(h_gene,h=h) + if(cluster_gs==FALSE){gs_cluster <- rep(1,length.out=nrow(mat1));names(gs_cluster)<-rownames(mat1);} + if(cluster_gene==FALSE){gene_cluster <- rep(1,length.out=ncol(mat1));names(gene_cluster)<-colnames(mat1);} + cc1 <- grDevices::colorRampPalette(brewer.pal(8,'Dark2'))(base::length(base::unique(gs_cluster))) + cc2 <- grDevices::colorRampPalette(brewer.pal(9,'Pastel1'))(base::length(base::unique(gene_cluster))) + cc3 <- grDevices::colorRampPalette(brewer.pal(9,'Reds')[3:9])(100) + # get gs order + if(cluster_gs==TRUE) gs_cluster <- gs_cluster[h_gs$order] + tmp2 <- vec2list(gs_cluster,sep=NULL) + tmp2 <- tmp2[rev(order(unlist(lapply(tmp2,function(x)base::min(funcEnrich_res[x,Pv_col])))))] + mat1 <- mat1[unlist(tmp2),] + if(cluster_gene==TRUE) mat1 <- mat1[,h_gene$order] + gs_cluster <- gs_cluster[rownames(mat1)] + gene_cluster <- gene_cluster[colnames(mat1)] + #a <- heatmap(mat1,scale='none',col=c('white','red'),ColSideColors = cc2[gene_cluster[colnames(mat1)]], + # RowSideColors = cc1[gs_cluster[rownames(mat1)]],distfun = function(x){dist(x,method='binary')},margins=c(5,20),Rowv=NA,Colv=NA) + pv <- pv_val[rownames(mat1)] + pv1 <- format(pv,scientific = TRUE,digits = 3) + plot_part <- function(ori=TRUE,before_off=FALSE){ + gsWidth <- base::max(strwidthMod(rownames(mat1),units='inch',cex=gs_cex,ori=ori))+par.char2inch()[1] + gsHeight <- base::max(strheightMod(rownames(mat1),units='inch',cex=gs_cex))*nrow(mat1)*1.75 + geneWidth <- base::max(strheightMod(colnames(mat1),units='inch',cex=gene_cex))*ncol(mat1)*1.5 + geneHeight <- base::max(strwidthMod(colnames(mat1),units='inch',cex=gene_cex,ori=ori))*1.05+par.char2inch()[2]*1.1 ## add bar height + pvWidth <- base::max(strwidthMod(pv1,units='inch',cex=pv_cex,ori=ori))+par.char2inch()[1] + pvHeight <- base::max(strheightMod(pv1,units='inch',cex=pv_cex,ori=ori))*nrow(mat1)*1.75 + gsHeight <- base::max(gsHeight,pvHeight) + ## + ## + mr <- 1/pvWidth + geneWidth1 <- ceiling((geneWidth+0.5)*mr) + pvWidth1 <- 1 + gsWidth1 <- ceiling((gsWidth+0.5)*mr) + if(geneWidth1+pvWidth1+gsWidth1>200){ ## avoid too many graphics::layout values + mr <- 180/(geneWidth1+pvWidth1+gsWidth1) + geneWidth1 <- round(geneWidth1*mr) + pvWidth1 <- round(pvWidth1*mr) + gsWidth1 <- ceiling(gsWidth1*mr) + } + ##### + ww <- (gsWidth+pvWidth+geneWidth)+1 + hh <- geneHeight+gsHeight+0.5 + #print(c(ww,hh)) + ## pdf + if(before_off==TRUE) dev.off() + if(is.null(pdf_file)==FALSE) pdf(file=pdf_file,width=ww,height=hh) + ## graphics::layout: 1|2|3 + graphics::layout(t(matrix(c(rep(1,geneWidth1),rep(2,pvWidth1),rep(3,gsWidth1)),byrow=TRUE))) + #print(t(matrix(c(rep(1,geneWidth1),rep(2,pvWidth1),rep(3,gsWidth1)),byrow=TRUE))) + #print(ww);print(hh) + par(mai=c(0.5,0.5,geneHeight,0)); + graphics::image(t(mat1),col=c('white',cc3[1]),xaxt='n',yaxt='n',bty='n') + pp <- par()$usr; + gs_cs <- cumsum(base::table(gs_cluster)[base::unique(gs_cluster)]) + gene_cs <- cumsum(base::table(gene_cluster)[base::unique(gene_cluster)]) + xx <- (pp[2]-pp[1])/base::length(gene_cluster); + yy <- (pp[4]-pp[3])/base::length(gs_cluster) + graphics::abline(h=gs_cs*yy+pp[3],col='black',lwd=0.25) + graphics::abline(v=gene_cs*xx+pp[1],col='black',lwd=0.25) + graphics::abline(v=pp[1:2],col='black',lwd=0.5); + graphics::abline(h=pp[3:4],col='black',lwd=0.5) + ## draw gene name + yy <- par.char2pos()[2]*0.9 + if(cluster_gene==TRUE){ + graphics::text(c(1:base::length(gene_cluster))*xx+pp[1]-xx/2,pp[4]+yy,colnames(mat1),xpd=TRUE,adj=0,cex=gene_cex,srt=90) + graphics::rect(xleft=c(1:base::length(gene_cluster))*xx+pp[1]-xx,xright=c(1:base::length(gene_cluster))*xx+pp[1],ybottom=pp[4],ytop=pp[4]+yy*0.8, + col=cc2[gene_cluster[colnames(mat1)]],xpd=TRUE,border=NA) + }else{ + graphics::text(c(1:base::length(gene_cluster))*xx+pp[1]-xx/2,pp[4]+0.5*yy,colnames(mat1),xpd=TRUE,adj=0,cex=gene_cex,srt=90) + } + # draw p-value + #pp <- par()$usr; + par(mai=c(0.5,0,geneHeight,0)); + graphics::plot(1,xaxt='n',yaxt='n',bty='n',xlim=c(pp[1],pp[2]),ylim=c(pp[3],pp[4]),col='white',xlab="",ylab="") + pp <- par()$usr; + yy <- (pp[4]-pp[3])/base::length(gs_cluster) + pv_c <- z2col(-qnorm(pv)) + graphics::rect(xleft=pp[1],xright=pp[2],ybottom=c(1:base::length(gs_cluster))*yy+pp[3]-yy, + ytop=c(1:base::length(gs_cluster))*yy+pp[3], + col=pv_c,border = NA) + graphics::text(0.5,c(1:base::length(gs_cluster))*yy+pp[3]-yy/2,pv1,xpd=TRUE,adj=0.5,cex=pv_cex) + graphics::abline(v=pp[1],col='black',lwd=2); + # draw gs name + par(mai=c(0.5,0,geneHeight,0.5)); + zz <- base::min(c(xx,yy)) + graphics::plot(1,xaxt='n',yaxt='n',bty='n',xlim=c(pp[1],pp[2]),ylim=c(pp[3],pp[4]),col='white',xlab="",ylab="") + pp <- par()$usr; + yy <- (pp[4]-pp[3])/base::length(gs_cluster) + graphics::text(pp[1]+zz*0.2,c(1:base::length(gs_cluster))*yy+pp[3]-yy/2,rownames(mat1),xpd=TRUE,adj=0,cex=gs_cex) + # get region for p-value + #graphics::abline(v=pp[1:2],col='black',lwd=0.5); + graphics::abline(h=pp[3:4],col='black',lwd=0.5); + graphics::abline(v=pp[1],col='black',lwd=0.5); + graphics::abline(h=gs_cs*yy+pp[3],col='black',lwd=0.25) + ## + } + if(is.null(pdf_file)==FALSE){plot_part(ori=TRUE);plot_part(ori=TRUE,before_off=TRUE);while (!is.null(dev.list())) dev.off()} else {plot_part()} + if(return_mat==TRUE){ + return(mat1) + }else{ + return(TRUE) + } +} +#' Heat Bubble Matrix Plot for Top Drivers in NetBID2 Analysis +#' +#' \code{draw.bubblePlot} combines the matrix bubble chart and the heat map, using bubble color to compare P-values (performed by Fisher's Exact Test) and bubble size to compare the intersected size for target genes. +#' Rows are enriched gene set, columns are top drivers. Users can also check number of protein-coding genes targetted by each driver. +#' +#' +#' @param driver_list a vector of characters, the names of top drivers. +#' @param show_label a vector of characters, the names of top drivers to be displayed in the plot. +#' If NULL, the names in \code{driver_list} will be displayed. Default is NULL. +#' @param Z_val a vector of numerics, the Z statistics of the \code{driver_list}. +#' It is highly suggested to assign names to this vector. If the vector is nameless, the function will use the names of \code{driver_list} by default. +#' @param driver_type a vector of characters, the biotype or other characteristics of the driver. +#' In the demo, we use \code{"gene_biotype"} column in the master table as input. +#' It is highly suggested to assign names to this vector. If the vector is nameless, the function will use the names of \code{driver_list} by default. +#' Default is NULL. +#' @param target_list list, the driver-to-target list object. The names of the list elements are drivers. +#' Each element is a data frame, usually contains at least three columns. "target", target gene names; "MI", mutual information; "spearman", spearman correlation coefficient. +#' Users can call \code{get_net2target_list} to create this list and follow the suggested pipeline. +#' @param transfer2symbol2type data.frame, the ID-conversion table for converting the original ID into gene symbol and gene biotype (at gene level), +#' or into transcript symbol and transcript biotype (at transcript level). +#' It is highly suggested to use \code{get_IDtransfer2symbol2type} to create this ID-conversion table. +#' @param gs2gene list, a list contains elements of gene sets. The name of the element is gene set, each element contains a vector of genes in that gene set. +#' If NULL, will use \code{all_gs2gene}, which is created by function \code{gs.preload}. Default is NULL. +#' @param use_gs a vector of characters, the names of gene sets. If \code{gs2gene} is NULL, \code{all_gs2gene} will be used. And the \code{use_gs} must be the subset of names(all_gs2gene). +#' Please check \code{all_gs2gene_info} for detailed cateogory description. +#' Default is c("H", "CP:BIOCARTA", "CP:REACTOME", "CP:KEGG"). +#' @param display_gs_list a vector of characters, the names of gene sets to be displayed in the plot. +#' If NULL, all the gene sets will be displayed in descending order of their significance. Default is NULL. +#' @param bg_list a vector of characters, a vector of background gene symbols. If NULL, genes in \code{gs2gene} will be used as background. Default is NULL. +#' @param min_gs_size numeric, the minimum size of gene set to analysis. Default is 5. +#' @param max_gs_size numeric, the maximum size of gene set to analysis, Default is 500. +#' @param Pv_adj character, method to adjust P-value. Default is "none". For details, please check \code{p.adjust.methods}. +#' @param Pv_thre numeric, threshold for the adjusted P-values. Default is 0.1. +#' @param top_geneset_number integer, the number of top enriched gene sets to be displayed in the plot. Default is 30. +#' @param top_driver_number integer, the number of top significant drivers to be displayed in the plot. Default is 30. +#' @param main character, an overall title for the plot. +#' @param pdf_file character, the file path to save as PDF file. If NULL, no PDF file will be save. Default is NULL. +#' @param mark_gene a vector of characters, a vector of gene symbols to be highlighted red in the plot. Default is NULL. +#' @param driver_cex numeric, giving the amount by which the text of driver symbols should be magnified relative to the default. Default is 1. +#' @param gs_cex numeric, giving the amount by which the text of gene set names should be magnified relative to the default. Default is 1. +#' @param only_return_mat logicial, if TRUE, the function will only return the gene set Vs. driver matrix with value representing the Z-statistics of the significance test; +#' and the plot will not be generated. Default is FALSE. +#' +#' @return Return a logical value if only_return_mat=FALSE. If TRUE, the plot has been created successfully. +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' gs.preload(use_spe='Homo sapiens',update=FALSE) +#' db.preload(use_level='gene',use_spe='human',update=FALSE) +#' use_genes <- base::unique(analysis.par$merge.network$network_dat$target.symbol) +#' transfer_tab <- get_IDtransfer2symbol2type(from_type = 'external_gene_name', +#' use_genes=use_genes, +#' dataset='hsapiens_gene_ensembl') +#' ## get transfer table !!! +#' draw.bubblePlot(driver_list=rownames(sig_driver), +#' show_label=ms_tab[rownames(sig_driver),'gene_label'], +#' Z_val=ms_tab[rownames(sig_driver),'Z.G4.Vs.others_DA'], +#' driver_type=ms_tab[rownames(sig_driver),'gene_biotype'], +#' target_list=analysis.par$merge.network$target_list, +#' transfer2symbol2type=transfer_tab, +#' min_gs_size=5, +#' max_gs_size=500,use_gs=c('H'), +#' top_geneset_number=5,top_driver_number=5, +#' main='Bubbleplot for top driver targets', +#' gs_cex = 0.4,driver_cex = 0.5) +#' ## the cex is set just in case of figure margin too large, +#' ## in real case, user could set cex larger or input pdf file name +#' \dontrun{ +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' gs.preload(use_spe='Homo sapiens',update=FALSE) +#' use_genes <- base::unique(analysis.par$merge.network$network_dat$target.symbol) +#' transfer_tab <- get_IDtransfer2symbol2type(from_type = 'external_gene_name', +#' use_genes=use_genes, +#' dataset='hsapiens_gene_ensembl') +#' ## get transfer table !!! +#' analysis.par$out.dir.PLOT <- getwd() ## directory for saving the pdf files +#' mark_gene <- c('KCNA1','EOMES','KHDRBS2','RBM24','UNC5D') ## marker for Group4 +#' draw.bubblePlot(driver_list=rownames(sig_driver), +#' show_label=ms_tab[rownames(sig_driver),'gene_label'], +#' Z_val=ms_tab[rownames(sig_driver),'Z.G4.Vs.others_DA'], +#' driver_type=ms_tab[rownames(sig_driver),'gene_biotype'], +#' target_list=analysis.par$merge.network$target_list, +#' transfer2symbol2type=transfer_tab, +#' min_gs_size=5,max_gs_size=500, +#' use_gs=use_gs=c('CP:KEGG','CP:BIOCARTA','H'), +#' top_geneset_number=30,top_driver_number=50, +#' pdf_file = sprintf('%s/bubbledraw.pdf', +#' analysis.par$out.dir.PLOT), +#' main='Bubbleplot for top driver targets', +#' mark_gene=ms_tab[which(ms_tab$geneSymbol %in% mark_gene), +#' 'originalID_label']) +#' } +#' @export +draw.bubblePlot <- function(driver_list=NULL,show_label=driver_list,Z_val=NULL,driver_type=NULL, + target_list=NULL,transfer2symbol2type=NULL, + bg_list=NULL,min_gs_size=5,max_gs_size=500, + gs2gene=NULL,use_gs=NULL, + display_gs_list=NULL, + Pv_adj='none',Pv_thre=0.1, + top_geneset_number=30,top_driver_number=30, + pdf_file=NULL,main="",mark_gene=NULL,driver_cex=1,gs_cex=1,only_return_mat=FALSE){ + # + all_input_para <- c('driver_list','show_label','Z_val','target_list','transfer2symbol2type', + 'min_gs_size','max_gs_size','Pv_adj','Pv_thre','top_geneset_number','top_driver_number', + 'driver_cex','gs_cex','only_return_mat') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('only_return_mat',c(TRUE,FALSE),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(is.null(names(show_label))==TRUE){names(show_label) <- driver_list} + if(is.null(names(Z_val))==TRUE){names(Z_val) <- driver_list} + if(is.null(driver_type)==FALSE){ + if(is.null(names(driver_type))==TRUE){names(driver_type) <- driver_list} + } + driver_list <- driver_list[order(abs(Z_val),decreasing = TRUE)] + if(base::length(driver_list)>top_driver_number){ + driver_list <- driver_list[1:top_driver_number] + } + driver_list <- driver_list[order(Z_val[driver_list],decreasing=TRUE)] + ## get target gene for driver_list + transfer_tab <- transfer2symbol2type + if(base::length(base::intersect(c('gene_biotype'),colnames(transfer_tab)))==0 & base::length(base::intersect(c('transcript_biotype'),colnames(transfer_tab)))==0){ + message('Input transfer table must contain the biotype information, please use get_IDtransfer2symbol2type() function to generate it!');return(FALSE); + } + #rownames(transfer_tab) <- transfer_tab[,1] + target_gene <- lapply(driver_list,function(x){ + x1 <- target_list[[x]]$target + w1 <- which.max(unlist(lapply(transfer_tab,function(x)length(intersect(x,x1))))) + w2 <- which(colnames(transfer_tab) %in% c('gene_biotype','transcript_biotype'))[1] + x1 <- x1[which(x1 %in% transfer_tab[,w1])] + x2 <- transfer_tab[which(transfer_tab[,w1] %in% x1),] + x3 <- x2[which(x2[,w2]=='protein_coding'),] + target <- base::unique(x2[,'external_gene_name']) + target_pc <- base::unique(x3[,'external_gene_name']) + return(list(target,target_pc)) + }) + names(target_gene) <- driver_list + ## + f_res <- lapply(target_gene,function(x){ + funcEnrich.Fisher(input_list=x[[1]],bg_list=bg_list,gs2gene=gs2gene,use_gs=use_gs, + min_gs_size=min_gs_size,max_gs_size=max_gs_size, + Pv_adj='none',Pv_thre=Pv_thre) + }) + names(f_res) <- names(target_gene) + ## get display matrix + all_path <- base::unique(unlist(lapply(f_res,function(x){x[[1]]}))) + all_path <- all_path[which(is.na(all_path)==FALSE)] ## get all sig path + if(is.null(display_gs_list)==FALSE){ + all_path <- base::intersect(display_gs_list,all_path) + if(base::length(all_path)<3){ + message('The number for passed gene sets is smaller than 3, please check the display_gs_list and re-try!') + return(FALSE) + } + } + f_mat <- lapply(f_res,function(x){ + as.data.frame(x)[all_path,5:6] + }) + f_mat2 <- do.call(rbind,lapply(f_mat,function(x)-qnorm(x[[2]]))) + # print(do.call(rbind,lapply(f_mat,function(x)x[[2]]))) + f_mat2[which(is.na(f_mat2)==TRUE | f_mat2==-Inf)] <- 0 + f_mat1 <- do.call(rbind,lapply(f_mat,function(x)x[,1])) + colnames(f_mat1) <- all_path + colnames(f_mat2) <- all_path + f_mat3 <- t(apply(f_mat2,1,function(x){ + o1 <- order(abs(x),decreasing = TRUE)[1:3]; + x1<-x;x1[base::setdiff(1:base::length(x1),o1)] <- 0;x1 + })) + ## use top + min_path_num <- 5 + max_path_num <- top_geneset_number + all_path_order <- apply(f_mat3,2,max) + all_path_order <- sort(all_path_order,decreasing = TRUE) + if(base::length(all_path_order) > max_path_num){ + w1 <- 1:base::length(all_path_order) + if(base::length(w1)>=min_path_num & base::length(w1)<=max_path_num){ + all_path <- names(sort(all_path_order[w1],decreasing=TRUE)) + }else{ + if(base::length(w1)0) graphics::segments(y0=1,y1=0.5,x0=use_pos_P,x1=use_pos_P,col=pos_col) + if(length(use_pos_N)>0) graphics::segments(y0=0,y1=0.5,x0=use_pos_N,x1=use_pos_N,col=neg_col) + graphics::abline(h=0.5,col='light grey') + graphics::text(0,0.75,pos=2,sprintf('Pos_Size:%s',base::length(use_pos_P)),xpd=TRUE) + graphics::text(0,0.25,pos=2,sprintf('Neg_Size:%s',base::length(use_pos_N)),xpd=TRUE) + }else{ + graphics::abline(v=use_pos) + } + ## GSEA ES, 4 + par(mar=c(0,6,5,2)) + # get ES score + es <- '' + if(is.null(use_direction)==FALSE){ + es_res_pos <- get_ES(rank_profile,pos_genes) + es_res_neg <- get_ES(rank_profile,neg_genes) + es_res <- es_res_pos + #print(es_res_pos$RES);print(es_res_neg$RES) + y2 <- base::seq(base::min(c(es_res_pos$RES,es_res_neg$RES),na.rm=T),base::max(c(es_res_pos$RES,es_res_neg$RES),na.rm=T),length.out=7); y2 <- round(y2,1) + graphics::plot(es_res_pos$RES,col=pos_col,xaxt='n',xlab="",ylab="",bty='n', + xlim=c(1,r_len),type='l',lwd=3,ylim=c(base::min(c(es_res_pos$RES,es_res_neg$RES),na.rm=T),base::max(y2)),main=main,xpd=TRUE,xaxs='i',las=1,cex.main=annotation_cex) + lines(es_res_neg$RES,col=neg_col,lwd=3,xpd=TRUE) + w1 <- base::which.max(abs(es_res_pos$RES)); + if(length(w1)==1) graphics::segments(x0=w1,x1=w1,y0=0,y1=es_res_pos$RES[w1],lty=2,col='grey') + w1 <- base::which.max(abs(es_res_neg$RES)); + if(length(w1)==1) graphics::segments(x0=w1,x1=w1,y0=0,y1=es_res_neg$RES[w1],lty=2,col='grey') + }else{ + es_res <- get_ES(rank_profile,use_genes) + y2 <- base::seq(base::min(es_res$RES),base::max(es_res$RES),length.out=7); y2 <- round(y2,1) + graphics::plot(es_res$RES,col='green',xaxt='n',xlab="",ylab="",bty='n', + xlim=c(1,r_len),type='l',lwd=3,ylim=c(base::min(es_res$RES),base::max(y2)),main=main,xpd=TRUE,xaxs='i',las=1) + w1 <- base::which.max(abs(es_res$RES)); + graphics::segments(x0=w1,x1=w1,y0=0,y1=es_res$RES[w1],lty=2,col='grey') + es <- sprintf('ES: %s',format(es_res$RES[w1],digits=3)) + } + graphics::abline(h=0,lty=2,col='dark grey') + pp <- par()$usr + graphics::mtext(side=2,line = 3,'Enrichment score (ES)',cex=annotation_cex) + # add annotation + if(is.null(annotation)==TRUE){ + if(is.null(use_direction)==FALSE){ + pv <- ks.test(new_rank_profile,new_rank_profile[new_use_genes])$p.value + }else{ + pv <- ks.test(rank_profile,rank_profile[use_genes])$p.value + } + #print(t.test(rank_profile,rank_profile[use_genes])) + if(pv==0){ + pv <- '<2.2e-16' + }else{ + if(pv<0.01) pv <- format(pv,digits = 3,scientific = TRUE) else pv <- signif(pv,3) + } + annotation <- sprintf("%s\nKS test p-value:%s",es,pv) + } + pp <- par()$usr + if(es_res$RES[base::which.max(abs(es_res$RES))]>0) + graphics::text(r_len,pp[4]-2*par.char2pos()[2],annotation,pos=2,cex=annotation_cex,xpd=TRUE) + else + graphics::text(0,pp[3]+2*par.char2pos()[2],annotation,pos=4,cex=annotation_cex,xpd=TRUE) + } + if(is.null(pdf_file)==FALSE){plot_part(ori=TRUE);dev.off()} else {plot_part()} + return(TRUE) +} + +## get enrichment score +get_ES <- function(rank_profile=NULL,use_genes=NULL,weighted.score.type=1){ + gene.list <- names(rank_profile) + correl.vector <- rank_profile + tag.indicator <- sign(match(gene.list, use_genes, nomatch=0)) # notice that the sign is 0 (no tag) or 1 (tag) + no.tag.indicator <- 1 - tag.indicator + N <- base::length(gene.list) + Nh <- base::length(use_genes) + Nm <- N - Nh + if (weighted.score.type == 0) { + correl.vector <- rep(1, N) + } + alpha <- weighted.score.type + correl.vector <- abs(correl.vector**alpha) + sum.correl.tag <- sum(correl.vector[tag.indicator == 1]) + norm.tag <- 1.0/sum.correl.tag + norm.no.tag <- 1.0/Nm + RES <- cumsum(tag.indicator * correl.vector * norm.tag - no.tag.indicator * norm.no.tag) + max.ES <- base::max(RES) + min.ES <- base::min(RES) + if(is.na(max.ES)==TRUE) max.ES <- 0 + if(is.na(min.ES)==TRUE) min.ES <- 0 + if (max.ES > - min.ES) { + # ES <- max.ES + ES <- signif(max.ES, digits = 5) + arg.ES <- base::which.max(RES) + } else { + # ES <- min.ES + ES <- signif(min.ES, digits=5) + arg.ES <- base::which.min(RES) + } + return(list(ES = ES, arg.ES = arg.ES, RES = RES, indicator = tag.indicator)) +} +## +get_z2p <- function(x,use_star=FALSE,digit_num=2){ + x[which(is.na(x)==TRUE)] <- 0 + #if(is.na(x[1])==TRUE) return('NA') + x <- abs(x) + x[which(is.na(x)==TRUE)] <- 0 ## + if(base::max(x)<5){ + use_pv <- 1-pnorm(x) + use_p <- format(use_pv,digits=digit_num,scientific = TRUE) + }else{ + low_p <- .Machine$double.xmin + low_z <- sapply(10^(-(1:(1+-log10(low_p)))),combinePvalVector) + use_pv <- sapply(x,function(x1){ + low_z[2,which(low_z[1,]>=x1)[1]]} + ) + use_p <- format(use_pv, digits=3,scientific = TRUE) + use_p[which(use_p=='NA')] <- '<1e-308' + use_p <- as.character(use_p) + } + x_star <- rep('',length.out=base::length(use_pv)) + x_star[which(use_pv<0.05)] <-'*' + x_star[which(use_pv<0.01)] <-'**' + x_star[which(use_pv<0.001)] <-'***' + if(use_star==TRUE) use_p<-paste0(use_p,x_star) + return(use_p) +} + +#' Draw GSEA (gene set enrichment analysis) Plot with NetBID Analysis of Drivers +#' +#' \code{draw.GSEA.NetBID} creates a GSEA plot for drivers with more NetBID analysis information. Such as number of target genes, ranking of target genes in +#' differential expressed file, differential expression (DE) and differential activity (DA) values. +#' +#' @param DE data.frame, a data.frame created either by function \code{getDE.limma.2G} or \code{getDE.BID.2G}. Row names are gene/driver names, +#' columns must include gene/driver name and calculated differencial values (e.g. "ID", "logFC", "AveExpr", "P.Value" etc.). +#' @param name_col character, the name of the column in \code{DE} contains gene names. If NULL, will use the row names of \code{DE}. +#' Default is NULL. +#' @param profile_col character, the name of the column in \code{DE} contains calculated differencial value (e.g. "logFC" or "P.Value"). +#' If \code{DE} is created by \code{getDE.limma.2G} or \code{getDE.BID.2G}, this parameter should be set to "logFC" or "t". +#' @param profile_trend character, users can choose between "pos2neg" and "neg2pos". "pos2neg" means high \code{profile_col} in target group will be shown on the left. +#' "neg2pos" means high \code{profile_col} in control group will be shown on the left. Default is "pos2neg". +#' For details, please check online tutorial. +#' @param driver_list a vector of characters, the names of top drivers. +#' @param show_label a vector of characters, the names of top drivers. +#' If NULL, will display the names in \code{driver_list}. Default is NULL. +#' @param driver_DA_Z a vector of numerics, the Z statistics of differential activity (DA) value of the \code{driver_list}. +#' It is highly suggested to give names to the vector, otherwise the names of \code{driver_list} will be used. +#' @param driver_DE_Z a vector of numerics, the Z statistics of differential expressed (DE) value of the \code{driver_list}. +#' It is highly suggested to give names to the vector, otherwise the names of \code{driver_list} will be used. +#' @param target_list list, the driver-to-target list object. The names of the list elements are drivers. +#' Each element is a data frame, usually contains at least three columns. "target", target gene names; "MI", mutual information; "spearman", spearman correlation coef- ficient. +#' Users can call \code{get_net2target_list} to create this list. +#' @param top_driver_number numeric, number for the top significant drivers to be displayed in the plot. Default is 30. +#' @param target_nrow numeric, users can choose between 1 and 2. Number of panels to mark the ranking of target genes. +#' If 1, the ranking of target genes will be marked in one panel. +#' If 2, the ranking of target genes will be marked in two panels. Upper panel for positively-regulated, lower panel for negatively-regulated. +#' Default is 2. For details, please check online tutorial. +#' @param target_col character, name of the color palette used for display marker line in the panel. Users can choose between "black" and "RdBu". +#' If "black", the marker line in the panel is black. +#' If "RdBu", the marker line in the panel is Red to Blue. +#' If \code{target_col_type} is set as 'PN', the positive regulated genes will be colored in red and negative regulated genes in blue. +#' If \code{target_col_type} is set as 'DE', the color for the target genes is set according to its value in the differentiated expression profile, +#' with significant high set for red and low for blue. The significant threshold is set by \code{profile_sig_thre}. +#' Default is 'RdBu'. +#' @param target_col_type character, name of the color palette used for display target genes. This parameter works only when \code{target_col} is set as "RdBu". +#' Users can choose between "PN" and "DE". +#' If "PN", positively-regulated genes will be colored red and negatively-regulated genes will be colored blue. +#' If "DE", the color shades is decided by its differentiated value. +#' Default is "PN". +#' @param left_annotation character, annotation on the left of profile curve, indicating high in control group or target group. +#' Default is "". +#' @param right_annotation character, annotation on the right of profile curve, indicating high in the opposite group of \code{left_annotation}. +#' Default is "". +#' @param main character, an overall title for the plot. Default is "". +#' @param profile_sig_thre numeric, threshold value for target genes. This parameter works only when \code{target_col_type} is set as "DE" and \code{target_col} is set as "RdBu". +#' Non-significant target genes will be colored grey. Default is 0. +#' @param Z_sig_thre numeric, threshold value of Z statistics from \code{driver_DA_Z} and \code{driver_DE_Z}. Significant values will have background color. +#' Default is 1.64. +#' @param pdf_file character, the file path to save figure as PDF file. If NULL, no PDF file will be saved. Default is NULL. +#' +#' @return Return a logical value. If TRUE, the plot has been created successfully. + +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' comp <- 'G4.Vs.others' +#' DE <- analysis.par$DE[[comp]] +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' driver_list <- rownames(sig_driver) +#' draw.GSEA.NetBID(DE=DE,profile_col='t', +#' name_col='ID', +#' profile_trend='neg2pos', +#' driver_list = driver_list, +#' show_label=ms_tab[driver_list,'gene_label'], +#' driver_DA_Z=ms_tab[driver_list,'Z.G4.Vs.others_DA'], +#' driver_DE_Z=ms_tab[driver_list,'Z.G4.Vs.others_DE'], +#' target_list=analysis.par$merge.network$target_list, +#' top_driver_number=5, +#' target_nrow=2,target_col='RdBu', +#' left_annotation = 'test_left', +#' right_annotation = 'test_right', +#' main='test',target_col_type='DE', +#' Z_sig_thre=1.64, +#' profile_sig_thre = 1.64) +#' \dontrun{ +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' comp <- 'G4.Vs.others' +#' DE <- analysis.par$DE[[comp]] +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' driver_list <- rownames(sig_driver) +#' analysis.par$out.dir.PLOT <- getwd() ## directory for saving the pdf files +#' draw.GSEA.NetBID(DE=DE,profile_col='logFC',profile_trend='neg2pos', +#' driver_list = driver_list, +#' show_label=ms_tab[driver_list,'gene_label'], +#' driver_DA_Z=ms_tab[driver_list,'Z.G4.Vs.others_DA'], +#' driver_DE_Z=ms_tab[driver_list,'Z.G4.Vs.others_DE'], +#' target_list=analysis.par$merge.network$target_list, +#' top_driver_number=30, +#' target_nrow=2, +#' target_col='RdBu', +#' left_annotation = 'test_left', +#' right_annotation = 'test_right', +#' main='test', +#' target_col_type='DE', +#' Z_sig_thre=1.64, +#' profile_sig_thre = 1.64, +#' pdf_file=sprintf('%s/NetBID_GSEA_demo1.pdf', +#' analysis.par$out.dir.PLOT)) +#'draw.GSEA.NetBID(DE=DE,profile_col='t',profile_trend='neg2pos', +#' driver_list = driver_list, +#' show_label=ms_tab[driver_list,'gene_label'], +#' driver_DA_Z=ms_tab[driver_list,'Z.G4.Vs.others_DA'], +#' driver_DE_Z=ms_tab[driver_list,'Z.G4.Vs.others_DE'], +#' target_list=analysis.par$merge.network$target_list, +#' top_driver_number=30, +#' target_nrow=1, +#' target_col='RdBu', +#' left_annotation = 'test_left', +#' right_annotation = 'test_right', +#' main='test',target_col_type='PN', +#' Z_sig_thre=1.64,profile_sig_thre = 1.64, +#' pdf_file=sprintf('%s/NetBID_GSEA_demo2.pdf', +#' analysis.par$out.dir.PLOT)) +#'} +#' @export +draw.GSEA.NetBID <- function(DE=NULL,name_col=NULL,profile_col=NULL,profile_trend='pos2neg', + driver_list=NULL,show_label=driver_list,driver_DA_Z=NULL,driver_DE_Z=NULL,target_list=NULL, + top_driver_number=30,target_nrow=2,target_col='RdBu',target_col_type='PN', + left_annotation="",right_annotation="",main="", + profile_sig_thre=0,Z_sig_thre=1.64,pdf_file=NULL){ + # + all_input_para <- c('DE','profile_col','profile_trend','driver_list','show_label', + 'driver_DA_Z','driver_DE_Z','target_list','top_driver_number','target_nrow', + 'target_col','target_col_type','left_annotation','right_annotation','main', + 'profile_sig_thre','Z_sig_thre') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('profile_trend',c('pos2neg','neg2pos'),envir=environment()), + check_option('target_nrow',c(1,2),envir=environment()), + check_option('target_col',c('RdBu','black'),envir=environment()), + check_option('target_col_type',c('PN','DE'),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + pos_col <- brewer.pal(12,'Paired')[8] + neg_col <- brewer.pal(12,'Paired')[4] + if(!profile_col %in% colnames(DE)){ + message(sprintf('%s not in colnames of DE, please check and re-try!',profile_col)) + return(FALSE) + } + if(is.null(names(driver_DA_Z))) names(driver_DA_Z) <- driver_list + if(is.null(names(driver_DE_Z))) names(driver_DE_Z) <- driver_list + if(is.null(names(show_label))) names(show_label) <- driver_list + if(base::length(driver_list)>top_driver_number){ + driver_DA_Z <- driver_DA_Z[driver_list] + driver_DE_Z <- driver_DE_Z[driver_list] + driver_list <- driver_list[order(abs(driver_DA_Z[driver_list]),decreasing = TRUE)][1:top_driver_number] + } + if(profile_trend=='pos2neg') + driver_list <- driver_list[order(driver_DA_Z[driver_list],decreasing = FALSE)] + else + driver_list <- driver_list[order(driver_DA_Z[driver_list],decreasing = TRUE)] + show_label <- show_label[driver_list] + driver_DA_Z <- driver_DA_Z[driver_list] + driver_DE_Z <- driver_DE_Z[driver_list] + ################### + ## calculate graphics::layout + if(is.null(name_col)==TRUE){ + DE <- base::cbind(DE[,base::setdiff(colnames(DE),'name')],name=rownames(DE),stringsAsFactors=FALSE) + name_col <- 'name' + } + w1 <- which(is.na(DE[,profile_col])==FALSE) + DE <- DE[w1,] + if(profile_trend=='pos2neg') DE <- DE[order(DE[,profile_col],decreasing = TRUE),] else DE <- DE[order(DE[,profile_col],decreasing = FALSE),] + DE_profile <- DE[,profile_col] + DE_profile_name <- DE[,name_col] + n_gene <- base::length(DE_profile) + + plot_part <- function(ori=FALSE,before_off=FALSE){ + if(target_nrow==2){ + n_driver <- base::length(driver_list)*2 + ratio1 <- ceiling(n_driver/15) ## profile height to rows + ratio2 <- 4 ## width of profile to DA/DE + rr <- 1 + } else { + n_driver <- base::length(driver_list) + ratio1 <- ceiling(1.2*n_driver/15) ## profile height to rows + ratio2 <- 4 ## width of profile to DA/DE + rr <- 1 + } + # + if(before_off==TRUE) dev.off() + if(is.null(pdf_file)==FALSE){ + pdf(pdf_file,width=(rr*2+ratio2)*1.5,height=(ratio1+rr)*1.5) + } + # get graphics::layout + graphics::layout(matrix(c(rep(0,length.out=rr),rep(1,length.out=ratio2),rep(0,length.out=rr*1), + rep(c(rep(4,length.out=rr),rep(2,length.out=ratio2),rep(3,length.out=rr*1)), + length.out=ratio1*(ratio2+rr*2))), + ncol=c(ratio2+rr*2),byrow=TRUE)) + ## graphics::layout + # |0|1|0 + # |4|2|3 + ## plot 1 + par(mar=c(1.5,1.5,4,0)) + mm <- quantile(DE_profile,probs=c(0.0001,0.9999)); + mm <- base::max(abs(mm)); mm <- c(-mm,mm) + y1 <- base::seq(mm[1],mm[2],length.out=5); y1 <- round(y1,1) + unit <- n_gene/10; unit <- round(unit/100)*100 + x1 <- base::seq(0,n_gene,by=unit);x1 <- base::unique(x1); x1 <- c(x1[1:(base::length(x1)-1)],n_gene) + par(usr=c(0,base::length(DE_profile),mm[1],mm[2])) + graphics::plot(DE_profile,col='white',pch=16,xaxt='n',yaxt='n',xlab="",ylab="",bty='n',type='n',ylim=c(mm[1],mm[2]),main=main,cex.main=1.8) + pp <- par()$usr; rr <- (pp[2]-pp[1])/n_gene + graphics::polygon(x=c(pp[1],c(1:n_gene)*rr+pp[1],pp[2]),y=c(0,DE_profile,0),col='grey',border='grey',xpd=TRUE,lwd=0.3) + if(profile_trend=='pos2neg'){ + if(is.null(left_annotation)==FALSE) graphics::text(pp[1]+(pp[2]-pp[1])/100,mm[2]*0.8,adj=0,left_annotation,col=brewer.pal(9,'Reds')[6],xpd=TRUE,cex=1.2) + if(is.null(right_annotation)==FALSE) graphics::text(pp[2]-(pp[2]-pp[1])/100,mm[1]*0.8,adj=1,right_annotation,col=brewer.pal(9,'Blues')[6],xpd=TRUE,cex=1.2) + }else{ + if(is.null(left_annotation)==FALSE) graphics::text(pp[1]+(pp[2]-pp[1])/100,mm[1]*0.8,adj=0,left_annotation,col=brewer.pal(9,'Blues')[6],xpd=TRUE,cex=1.2) + if(is.null(right_annotation)==FALSE) graphics::text(pp[2]-(pp[2]-pp[1])/100,mm[2]*0.8,adj=1,right_annotation,col=brewer.pal(9,'Reds')[6],xpd=TRUE,cex=1.2) + } + graphics::axis(side=2,at=y1,labels=y1) + graphics::mtext(side=2,line = 2.5,profile_col,cex=1) + graphics::segments(pp[1],mm[1],pp[2],mm[1],xpd=TRUE) + graphics::segments(x1*rr+pp[1],mm[1]-(mm[2]-mm[1])/30,x1*rr+pp[1],mm[1],xpd=TRUE) + graphics::text(x1*rr+pp[1],mm[1]-(mm[2]-mm[1])/10,get_label_manual(x1),adj=1,xpd=TRUE) + ## plot2 + par(mar=c(2,1.5,2,0)) + graphics::plot(1,col='white',xlab="",ylab="",xlim=c(0,n_gene),xaxt='n',yaxt='n') + pp <- par()$usr;rr <- (pp[2]-pp[1])/n_gene + yy1 <- base::seq(from=pp[3],to=pp[4],length.out=n_driver+1) + yy2 <- base::seq(from=pp[3],to=pp[4],length.out=base::length(driver_list)+1) + #graphics::segments(x0=pp[1],x1=pp[2],y0=yy1,y1=yy1,lwd=0.2,col='light grey') + #graphics::segments(x0=pp[1],x1=pp[2],y0=yy2,y1=yy2,lwd=2,col='white') + #graphics::segments(x0=pp[1],x1=pp[2],y0=yy2,y1=yy2,lwd=1.2,col='dark grey') + # add columns + use_target_list <- target_list[driver_list] + if(target_col_type=='DE'){ + cc <- z2col(DE_profile,sig_thre=profile_sig_thre,n_len=100,red_col = brewer.pal(9,'Reds')[5:9],blue_col=brewer.pal(9,'Blues')[5:9], + col_max_thre=base::max(abs(DE_profile))) + #names(cc) <- DE_profile_name + cc[which(cc=='white')] <- 'light grey' + } + if(target_nrow==1){ + for(i in 1:base::length(driver_list)){ + t1 <- use_target_list[[driver_list[[i]]]] + w0 <- which(DE_profile_name %in% t1$target) + w1 <- w0*rr+pp[1] + if(target_col=='black'){ + graphics::segments(x0=w1,x1=w1,y0=yy1[i],y1=yy1[i+1],col='black',lwd=1) + }else{ + if(target_col_type=='DE'){ + graphics::segments(x0=w1,x1=w1,y0=yy1[i],y1=yy1[i+1],lwd=1.5, + col=cc[w0]) + }else{ + graphics::segments(x0=w1,x1=w1,y0=yy1[i],y1=yy1[i+1],lwd=1.5, + col=z2col(t1$spearman,sig_thre=0,col_max_thre=1,col_min_thre=0.01, + red_col = pos_col,blue_col=neg_col)) + } + } + } + } + if(target_nrow==2){ + for(i in 1:base::length(driver_list)){ + t1 <- use_target_list[[driver_list[[i]]]] + t11 <- t1[which(t1$spearman>=0),]$target + t12 <- t1[which(t1$spearman<0),]$target + w0 <- which(DE_profile_name %in% t11) + w1 <- w0*rr+pp[1] + if(base::length(w1)>0){ + if(target_col=='black'){ + graphics::segments(x0=w1,x1=w1,y0=yy1[2*i],y1=yy1[2*i+1],col='black',lwd=1) + }else{ + if(target_col_type=='DE'){ + graphics::segments(x0=w1,x1=w1,y0=yy1[2*i],y1=yy1[2*i+1],col=cc[w0],lwd=1.5) + }else{ + graphics::segments(x0=w1,x1=w1,y0=yy1[2*i],y1=yy1[2*i+1],col=pos_col,lwd=1.5) + } + } + } + w0 <- which(DE_profile_name %in% t12) + w1 <- w0*rr+pp[1] + if(base::length(w1)>0){ + if(target_col=='black'){ + graphics::segments(x0=w1,x1=w1,y0=yy1[2*i-1],y1=yy1[2*i],col='black',lwd=1) + }else{ + if(target_col_type=='DE'){ + graphics::segments(x0=w1,x1=w1,y0=yy1[2*i-1],y1=yy1[2*i],col=cc[w0],lwd=1.5) + }else{ + graphics::segments(x0=w1,x1=w1,y0=yy1[2*i-1],y1=yy1[2*i],col=neg_col,lwd=1.5) + } + } + } + } + } + graphics::segments(x0=pp[1],x1=pp[2],y0=yy1,y1=yy1,lwd=0.2,col='light grey') + graphics::segments(x0=pp[1],x1=pp[2],y0=yy2,y1=yy2,lwd=2,col='white') + graphics::segments(x0=pp[1],x1=pp[2],y0=yy2,y1=yy2,lwd=1.2,col='dark grey') + ## plot 3 + par(mar=c(2,0.5,2,2)) + graphics::plot(1,col='white',xlab="",ylab="",xlim=c(0,2),xaxt='n',yaxt='n',bty='n') + pp <- par()$usr + graphics::rect(xleft=pp[1],xright=pp[2],ybottom=pp[3],ytop=pp[4]) + yy2 <- base::seq(from=pp[3],to=pp[4],length.out=base::length(driver_list)+1) + graphics::segments(x0=pp[1],x1=pp[2],y0=yy2,y1=yy2,lwd=2,col='white',xpd=TRUE) + graphics::segments(x0=pp[1],x1=pp[2],y0=yy2,y1=yy2,lwd=1.2,col='dark grey',xpd=TRUE) + graphics::abline(v=c(pp[1],(pp[1]+pp[2])/2,pp[2])) + ## add text + mm_min <- base::min(base::min(abs(driver_DA_Z[driver_list]),na.rm=TRUE)*0.9,base::min(abs(driver_DE_Z[driver_list]),na.rm=TRUE)*0.9) + mm_min <- base::max(mm_min,Z_sig_thre) + mm_max <- base::max(base::max(abs(driver_DA_Z[driver_list]),na.rm=TRUE)*1.1,base::max(abs(driver_DE_Z[driver_list]),na.rm=TRUE)*1.1) + c1 <- z2col(driver_DA_Z[driver_list],sig_thre=Z_sig_thre,n_len=100,red_col = brewer.pal(9,'Reds')[7],blue_col=brewer.pal(9,'Blues')[7], + col_min_thre=mm_min,col_max_thre=mm_max) + c2 <- z2col(driver_DE_Z[driver_list],sig_thre=Z_sig_thre,n_len=100,red_col = brewer.pal(9,'Reds')[7],blue_col=brewer.pal(9,'Blues')[7], + col_min_thre=mm_min,col_max_thre=mm_max) + for(i in 1:base::length(driver_list)){ + z1 <- driver_DA_Z[driver_list[i]] + z2 <- driver_DE_Z[driver_list[i]] + p1 <- get_z2p(z1) + p2 <- get_z2p(z2) + graphics::rect(xleft=pp[1],xright=(pp[1]+pp[2])/2,ybottom=yy2[i],ytop=yy2[i+1],col=c1[i],border='dark grey',xpd=TRUE) + graphics::rect(xright=pp[2],xleft=(pp[1]+pp[2])/2,ybottom=yy2[i],ytop=yy2[i+1],col=c2[i],border='dark grey',xpd=TRUE) + graphics::text(x=(pp[1]+(pp[1]+pp[2])/2)/2,y=(yy2[i]+yy2[i+1])/2,p1,adj=0.5) + graphics::text(x=(pp[2]+(pp[1]+pp[2])/2)/2,y=(yy2[i]+yy2[i+1])/2,p2,adj=0.5) + } + text((pp[1]+(pp[1]+pp[2])/2)/2,pp[4],'DA',xpd=TRUE,cex=1.5,pos=3) + text((pp[2]+(pp[1]+pp[2])/2)/2,pp[4],'DE',xpd=TRUE,cex=1.5,pos=3) + ## plot 4 + par(mar=c(2,6,2,0.2)) + graphics::plot(1,col='white',xlab="",ylab="",xlim=c(0,2),xaxt='n',yaxt='n',bty='n') + pp <- par()$usr + #yy1 <- base::seq(from=pp[3],to=pp[4],length.out=n_driver+1) + #yy11 <- (yy1[1:(base::length(yy1)-1)]+yy1[2:base::length(yy1)])/2 + yy2 <- base::seq(from=pp[3],to=pp[4],length.out=base::length(driver_list)+1) + yy22 <- (yy2[1:(base::length(yy2)-1)]+yy2[2:base::length(yy2)])/2 + dyy <- yy2[2]-yy2[1] + graphics::text(show_label,x=(pp[1]+pp[2])/2,y=yy22,xpd=TRUE,adj=1) + # add target size + target_size <- do.call(rbind,lapply(use_target_list,function(x){ + x1 <- base::length(which(x$spearman>=0)) + x2 <- base::length(which(x$spearman<0)) + c(x1,x2) + })) + if(target_nrow==2){ + mm <- base::max(target_size) + tt <- pp[2]-(pp[1]+pp[2])*0.55 + for(i in 1:base::length(driver_list)){ + graphics::rect(xleft=(pp[1]+pp[2])*0.55,xright=(pp[1]+pp[2])*0.55+target_size[i,1]/mm*tt, + ybottom=yy22[i],ytop=yy22[i]+dyy*0.35,col=pos_col,border=NA) + graphics::rect(xleft=(pp[1]+pp[2])*0.55,xright=(pp[1]+pp[2])*0.55+target_size[i,2]/mm*tt, + ytop=yy22[i],ybottom=yy22[i]-dyy*0.35,col=neg_col,border=NA) + } + graphics::segments(x0=(pp[1]+pp[2])*0.55,x1=pp[2],y0=pp[4],y1=pp[4],xpd=TRUE) + sst <- round(base::seq(0,mm,length.out=3)) + ss <- sst*tt/mm+(pp[1]+pp[2])*0.55 + graphics::segments(x0=ss,x1=ss,y0=pp[4],y1=pp[4]+(pp[4]-pp[3])/150,xpd=TRUE) + graphics::text(x=ss,y=pp[4]+(pp[4]-pp[3])/100,srt=90,sst,xpd=TRUE,adj=0,cex=0.8) + text('Target Size',x=(pp[1]+pp[2])*0.45,y=pp[4]+(pp[4]-pp[3])/50,adj=1,xpd=TRUE,cex=0.8) + graphics::legend(2,pp[3],c('Positively-regulated','Negatively-regulated'),fill=c(pos_col,neg_col),border = NA,bty='n',cex=0.6,xpd=TRUE,xjust=1,yjust=1) + } + # + if(target_nrow==1){ + target_size <- base::rowSums(target_size) + mm <- base::max(target_size) + tt <- pp[2]-(pp[1]+pp[2])*0.55 + for(i in 1:base::length(driver_list)){ + graphics::rect(xleft=(pp[1]+pp[2])*0.55,xright=(pp[1]+pp[2])*0.55+target_size[i]/mm*tt, + ybottom=yy22[i]-dyy*0.2,ytop=yy22[i]+dyy*0.2,col='dark grey',border=NA) + } + graphics::segments(x0=(pp[1]+pp[2])*0.55,x1=pp[2],y0=pp[4],y1=pp[4],xpd=TRUE) + sst <- round(base::seq(0,mm,length.out=3)) + ss <- sst*tt/mm+(pp[1]+pp[2])*0.55 + graphics::segments(x0=ss,x1=ss,y0=pp[4],y1=pp[4]+(pp[4]-pp[3])/150,xpd=TRUE) + graphics::text(x=ss,y=pp[4]+(pp[4]-pp[3])/100,srt=90,sst,xpd=TRUE,adj=0,cex=0.8) + text('Target Size',x=(pp[1]+pp[2])*0.45,y=pp[4]+(pp[4]-pp[3])/50,adj=1,xpd=TRUE,cex=0.8) + } + } + ## + if(is.null(pdf_file)==FALSE){plot_part(ori=TRUE);dev.off()} else {plot_part()} + return(TRUE) +} +### +#' Draw GSEA (gene set enrichment analysis) Plot with NetBID Analysis of Gene Sets +#' +#' \code{draw.GSEA.NetBID.GS} creates a GSEA plot for gene sets with more NetBID analysis information. +#' Such as, number of genes in each gene set, marking the rank of annotated genes in the differential expression profile and differential activity (DA) values. +#' +#' @param DE data.frame, a data.frame created either by function \code{getDE.limma.2G} or \code{getDE.BID.2G}. +#' Row names are gene names, columns must include the calculated differencial values (e.g. "ID", "logFC", "AveExpr", "P.Value" etc.). +#' @param name_col character, the name of the column in \code{DE} contains gene names. If NULL, will use the row names of \code{DE}. Default is NULL. +#' @param profile_col character, the name of the column in \code{DE} contains calculated differencial value (e.g. "logFC" or "P.Value"). +#' If \code{DE} is created by \code{getDE.limma.2G} or \code{getDE.BID.2G}, this parameter should be set to "logFC" or "t". +#' @param profile_trend character, users can choose between "pos2neg" and "neg2pos". "pos2neg" means high \code{profile_col} in target group will be shown on the left. +#' "neg2pos" means high \code{profile_col} in control group will be shown on the left. Default is "pos2neg". +#' @param use_gs2gene list, contains elements of gene sets. Element name is gene set name, each element contains a vector of genes belong to that gene set. +#' This list can be created by calling one element from \code{all_gs2gene}, or merge several gene sets into one by using \code{merge_gs}. +#' @param sig_gs_list a vector of characters, the names of top gene sets. +#' @param gs_DA_Z a vector of numerics, the Z-statistics of differentail activity (DA) values for the \code{sig_gs_list}. +#' It is highly suggested to assign name to the vector, otherwise will use name of \code{sig_gs_list}. +#' @param top_gs_number integer, the number of top significant gene sets to be displayed. Default is 30. +#' @param target_col character, name of the color palette used for display marker line in the panel. +#' Users can choose between "black" and "RdBu". If "black", the marker line in the panel is black. If "RdBu", the marker line in the panel is Red to Blue. +#' The color shade of the marker line is decided by each gene's significance of differentiation. High in red, low in blue. +#' Default is "RdBu". +#' @param left_annotation character, annotation on the left of profile curve, indicating high in control group or target group. Default is "". +#' @param right_annotation character, annotation on the right of profile curve, indicating high in the opposite group of \code{left_annotation}. Default is "". +#' @param main character, an overall title for the plot. Default is "". +#' @param profile_sig_thre numeric, threshold value for target genes. This parameter works only when \code{target_col_type} is set as "DE" and \code{target_col} is set as "RdBu". +#' Non-significant target genes will be colored grey. Default is 0. +#' @param Z_sig_thre numeric, threshold value of Z-statistics from \code{driver_DA_Z} and \code{driver_DE_Z}. Significant values will have background color. Default is 1.64. +#' @param pdf_file character, the file path to save figure as PDF file. If NULL, no PDF file will be saved. Default is NULL. +#' +#' @return Return a logical value. If TRUE, the plot has been created successfully. +#' @examples +#' \dontrun{ +#' db.preload(use_level='transcript',use_spe='human',update=FALSE) +#' +#' ## get all_gs2gene +#' +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' +#' ms_tab <- analysis.par$final_ms_tab +#' comp <- 'G4.Vs.others' +#' DE <- analysis.par$DE[[comp]] +#' analysis.par$out.dir.PLOT <- getwd() +#' +#' ## directory for saving the pdf files +#' exp_mat_gene <- Biobase::exprs(analysis.par$cal.eset) +#' +#' ## calculate activity for all genesets +#' use_gs2gene <- merge_gs(all_gs2gene=all_gs2gene, +#' use_gs=c('H','CP:BIOCARTA','CP:REACTOME','CP:KEGG','C5')) +#' ac_gs <- cal.Activity.GS(use_gs2gene = use_gs2gene,cal_mat = exp_mat_gene) +#' +#' ## get DA for the gene set +#' phe_info <- Biobase::pData(analysis.par$cal.eset) +#' G0 <- rownames(phe_info)[which(phe_info$`subgroup`!='G4')] +#' # get sample list for G0 +#' G1 <- rownames(phe_info)[which(phe_info$`subgroup`=='G4')] +#' # get sample list for G1 +#' DA_gs <- getDE.limma.2G(eset=generate.eset(ac_gs),G1=G1,G0=G0, +#' G1_name='G4',G0_name='others') +#' ## or use: DA_gs <- getDE.BID.2G(eset=generate.eset(ac_gs),G1=G1,G0=G0, +#' G1_name='G4',G0_name='others') +#' ## draw vocalno plot for top sig-GS +#' sig_gs <- draw.volcanoPlot(dat=DA_gs, +#' label_col='ID', +#' logFC_col='logFC', +#' Pv_col='P.Value', +#' logFC_thre=0.25, +#' Pv_thre=1e-4, +#' main='Volcano Plot for gene sets', +#' show_label=TRUE, +#' label_type = 'distribute', +#' label_cex = 0.5, +#' pdf_file=sprintf('%s/vocalno_GS_DA.pdf', +#' analysis.par$out.dir.PLOT)) +#' ## GSEA plot for the significant gene sets +#' draw.GSEA.NetBID.GS(DE=DE,name_col='ID', +#' profile_col='t',profile_trend='pos2neg', +#' sig_gs_list = sig_gs$ID, +#' gs_DA_Z=DA_gs[sig_gs$ID,'Z-statistics'], +#' use_gs2gene = use_gs2gene, +#' top_gs_number=5,target_col='RdBu', +#' left_annotation = 'test_left', +#' right_annotation = 'test_right', +#' main='test',Z_sig_thre=1.64,profile_sig_thre = 0, +#' pdf_file=sprintf('%s/NetBID_GSEA_GS_demo1.pdf', +#' analysis.par$out.dir.PLOT)) +#'} +#' @export +draw.GSEA.NetBID.GS <- function(DE=NULL,name_col=NULL,profile_col=NULL,profile_trend='pos2neg', + sig_gs_list=NULL,gs_DA_Z=NULL,use_gs2gene=NULL, + top_gs_number=30,target_col='RdBu', + left_annotation="",right_annotation="",main="", + profile_sig_thre=0,Z_sig_thre=1.64,pdf_file=NULL){ + # + all_input_para <- c('DE','name_col','profile_col','profile_trend','sig_gs_list', + 'gs_DA_Z','top_gs_number', + 'target_col','left_annotation','right_annotation','main', + 'profile_sig_thre','Z_sig_thre') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('profile_trend',c('pos2neg','neg2pos'),envir=environment()), + check_option('target_col',c('RdBu','black'),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if(!profile_col %in% colnames(DE)){ + message(sprintf('%s not in colnames of DE, please check and re-try!',profile_col)) + return(FALSE) + } + while(class(use_gs2gene[[1]])=='list'){ + nn <- unlist(lapply(use_gs2gene,names)) + use_gs2gene <- unlist(use_gs2gene,recursive = FALSE) + names(use_gs2gene)<-nn + } + use_gs2gene <- use_gs2gene[sig_gs_list] + if(is.null(name_col)==TRUE){ + DE <- base::cbind(DE[,base::setdiff(colnames(DE),'name')],name=rownames(DE),stringsAsFactors=FALSE) + name_col <- 'name' + } + if(is.null(names(gs_DA_Z))) names(gs_DA_Z) <- sig_gs_list + if(base::length(sig_gs_list)>top_gs_number){ + gs_DA_Z <- gs_DA_Z[sig_gs_list] + sig_gs_list <- sig_gs_list[order(abs(gs_DA_Z[sig_gs_list]),decreasing = TRUE)][1:top_gs_number] + } + if(profile_trend=='pos2neg') + sig_gs_list <- sig_gs_list[order(gs_DA_Z[sig_gs_list],decreasing = FALSE)] + else + sig_gs_list <- sig_gs_list[order(gs_DA_Z[sig_gs_list],decreasing = TRUE)] + show_label <- sig_gs_list + gs_DA_Z <- gs_DA_Z[sig_gs_list] + ################### + ## calculate graphics::layout + w1 <- which(is.na(DE[,profile_col])==FALSE) + DE <- DE[w1,] + if(profile_trend=='pos2neg') DE <- DE[order(DE[,profile_col],decreasing = TRUE),] else DE <- DE[order(DE[,profile_col],decreasing = FALSE),] + DE_profile <- DE[,profile_col] + DE_profile_name <- DE[,name_col] + use_gs2gene <- lapply(use_gs2gene,function(x)base::intersect(x,DE_profile_name)) + use_gs2gene <- use_gs2gene[sig_gs_list] + #names(DE_profile) <- rownames(DE) + n_gene <- base::length(DE_profile) + n_driver <- base::length(sig_gs_list) + plot_part <- function(ori=FALSE,before_off=FALSE){ + gswidth <- base::max(strwidthMod(sig_gs_list,units='inches',cex=1)) + gsheight <- base::max(strheightMod(sig_gs_list,units='inches',cex=1))*length(sig_gs_list)*2 + gswidth <- ceiling(gswidth) + ratio1 <- ceiling(gsheight/1.5) ## profile height to rows + ratio2 <- 6 ## width of main profile + rr1 <- ceiling(gswidth/0.5)+1 + rr2 <- 2 + # width: 0.2|rr1|0.2|ratio2|0.1|rr2|0.2 + # 4: gswidth,0.5 + # height: 0.2|ratio1|0.3|1|0.2 + if(before_off==TRUE) dev.off() + if(is.null(pdf_file)==FALSE){ + pdf(pdf_file,width=0.7+(gswidth+0.5)/rr1*(rr1+rr2+ratio2),height=0.5+(gsheight/ratio1)*(ratio1+1)) + } + # get graphics::layout + graphics::layout(matrix(c(rep(0,length.out=rr1),rep(1,length.out=ratio2),rep(0,length.out=rr2*1), + rep(c(rep(4,length.out=rr1),rep(2,length.out=ratio2),rep(3,length.out=rr2*1)), + length.out=ratio1*(ratio2+rr1+rr2))), + ncol=c(ratio2+rr1+rr2),byrow=TRUE)) + #print(matrix(c(rep(0,length.out=rr1),rep(1,length.out=ratio2),rep(0,length.out=rr2*1), + # rep(c(rep(4,length.out=rr1),rep(2,length.out=ratio2),rep(3,length.out=rr2*1)), + # length.out=ratio1*(ratio2+rr1+rr2))), + # ncol=c(ratio2+rr1+rr2),byrow=TRUE)) + # 0|1|0 + # 4|2|3 + ## plot 1 + par(mai=c(0.1,0.1,0.2,0.1)) + mm <- quantile(DE_profile,probs=c(0.0001,0.9999)); + mm <- base::max(abs(mm)); mm <- c(-mm,mm) + y1 <- base::seq(mm[1],mm[2],length.out=5); y1 <- round(y1,1) + unit <- n_gene/10; unit <- round(unit/100)*100 + x1 <- base::seq(0,n_gene,by=unit);x1 <- base::unique(x1); x1 <- c(x1[1:(base::length(x1)-1)],n_gene) + par(usr=c(0,base::length(DE_profile),mm[1],mm[2])) + graphics::plot(DE_profile,col='white',pch=16,xaxt='n',yaxt='n',xlab="",ylab="",bty='n',type='n',ylim=c(mm[1],mm[2]),main=main,cex.main=1.8) + pp <- par()$usr; rr <- (pp[2]-pp[1])/n_gene + graphics::polygon(x=c(pp[1],c(1:n_gene)*rr+pp[1],pp[2]),y=c(0,DE_profile,0),col='grey',border='grey',xpd=TRUE,lwd=0.3) + if(profile_trend=='pos2neg'){ + if(is.null(left_annotation)==FALSE) graphics::text(pp[1]+(pp[2]-pp[1])/100,mm[2]*0.8,adj=0,left_annotation,col=brewer.pal(9,'Reds')[6],xpd=TRUE,cex=1.2) + if(is.null(right_annotation)==FALSE) graphics::text(pp[2]-(pp[2]-pp[1])/100,mm[1]*0.8,adj=1,right_annotation,col=brewer.pal(9,'Blues')[6],xpd=TRUE,cex=1.2) + }else{ + if(is.null(left_annotation)==FALSE) graphics::text(pp[1]+(pp[2]-pp[1])/100,mm[1]*0.8,adj=0,left_annotation,col=brewer.pal(9,'Blues')[6],xpd=TRUE,cex=1.2) + if(is.null(right_annotation)==FALSE) graphics::text(pp[2]-(pp[2]-pp[1])/100,mm[2]*0.8,adj=1,right_annotation,col=brewer.pal(9,'Reds')[6],xpd=TRUE,cex=1.2) + } + graphics::axis(side=2,at=y1,labels=y1) + graphics::mtext(side=2,line = 2.5,profile_col,cex=1) + graphics::segments(pp[1],mm[1],pp[2],mm[1],xpd=TRUE) + graphics::segments(x1*rr+pp[1],mm[1]-(mm[2]-mm[1])/30,x1*rr+pp[1],mm[1],xpd=TRUE) + graphics::text(x1*rr+pp[1],mm[1]-(mm[2]-mm[1])/10,get_label_manual(x1),adj=1,xpd=TRUE,cex=0.6) + ## plot2 + par(mai=c(0.2,0.1,0.2,0.1)) + graphics::plot(1,col='white',xlab="",ylab="",xlim=c(0,n_gene),xaxt='n',yaxt='n') + pp <- par()$usr;rr <- (pp[2]-pp[1])/n_gene + yy1 <- base::seq(from=pp[3],to=pp[4],length.out=n_driver+1) + yy2 <- base::seq(from=pp[3],to=pp[4],length.out=base::length(sig_gs_list)+1) + # add columns + use_target_list <- use_gs2gene[sig_gs_list] + cc <- z2col(DE_profile,sig_thre=profile_sig_thre,n_len=100,red_col = brewer.pal(9,'Reds')[5:9],blue_col=brewer.pal(9,'Blues')[5:9], + col_max_thre=base::max(abs(DE_profile))) + cc[which(cc=='white')] <- 'light grey' + for(i in 1:base::length(sig_gs_list)){ + t1 <- use_target_list[[sig_gs_list[i]]] + w0 <- which(DE_profile_name %in% t1) + w1 <- w0*rr+pp[1] + if(target_col=='black'){ + graphics::segments(x0=w1,x1=w1,y0=yy1[i],y1=yy1[i+1],col='black',lwd=1) + }else{ + graphics::segments(x0=w1,x1=w1,y0=yy1[i],y1=yy1[i+1],lwd=1.5,col=cc[w0]) + } + } + graphics::segments(x0=pp[1],x1=pp[2],y0=yy1,y1=yy1,lwd=0.2,col='light grey') + graphics::segments(x0=pp[1],x1=pp[2],y0=yy2,y1=yy2,lwd=2,col='white') + graphics::segments(x0=pp[1],x1=pp[2],y0=yy2,y1=yy2,lwd=1.2,col='dark grey') + ## plot 3 + par(mai=c(0.2,0,0.2,0.2)) + graphics::plot(1,col='white',xlab="",ylab="",xlim=c(0,1),xaxt='n',yaxt='n',bty='n') + pp <- par()$usr + graphics::rect(xleft=pp[1],xright=pp[2],ybottom=pp[3],ytop=pp[4]) + yy2 <- base::seq(from=pp[3],to=pp[4],length.out=base::length(sig_gs_list)+1) + graphics::segments(x0=pp[1],x1=pp[2],y0=yy2,y1=yy2,lwd=2,col='white',xpd=TRUE) + graphics::segments(x0=pp[1],x1=pp[2],y0=yy2,y1=yy2,lwd=1.2,col='dark grey',xpd=TRUE) + ## add text + mm_min <- base::min(abs(gs_DA_Z[sig_gs_list]),na.rm=TRUE)*0.9 + mm_min <- base::max(mm_min,Z_sig_thre) + mm_max <- base::max(abs(gs_DA_Z[sig_gs_list]),na.rm=TRUE)*1.1 + c1 <- z2col(gs_DA_Z[sig_gs_list],sig_thre=Z_sig_thre,n_len=100,red_col = brewer.pal(9,'Reds')[7],blue_col=brewer.pal(9,'Blues')[7], + col_min_thre=mm_min,col_max_thre=mm_max) + for(i in 1:base::length(sig_gs_list)){ + z1 <- gs_DA_Z[sig_gs_list[i]] + p1 <- get_z2p(z1) + graphics::rect(xleft=pp[1],xright=pp[2],ybottom=yy2[i],ytop=yy2[i+1],col=c1[i],border='dark grey',xpd=TRUE) + graphics::text(x=(pp[1]+pp[2])/2,y=(yy2[i]+yy2[i+1])/2,p1,adj=0.5) + } + text((pp[1]+pp[2])/2,pp[4],'DA',xpd=TRUE,cex=1.5,pos=3) + ## plot 4 + par(mai=c(0.2,0.2,0.2,0.1)) + graphics::plot(1,col='white',xlab="",ylab="",xlim=c(0,2),xaxt='n',yaxt='n',bty='n') + pp <- par()$usr + yy2 <- base::seq(from=pp[3],to=pp[4],length.out=base::length(sig_gs_list)+1) + yy22 <- (yy2[1:(base::length(yy2)-1)]+yy2[2:base::length(yy2)])/2 + dyy <- yy22[2]-yy22[1] + # add target size + target_size <- unlist(lapply(use_gs2gene,length)) + mm <- base::max(target_size) + rr <- ceiling(gswidth)*1.5 + tt_left <- pp[2]-(pp[2]-pp[1])/(1+rr) + tt <- (pp[2]-pp[1])/(1+rr) + graphics::text(show_label,x=tt_left-tt/25,y=yy22,xpd=TRUE,adj=1) + for(i in 1:base::length(sig_gs_list)){ + graphics::rect(xleft=tt_left,xright=tt_left+target_size[i]/mm*tt, + ybottom=yy22[i]-dyy*0.2,ytop=yy22[i]+dyy*0.2,col='dark grey',border=NA) + } + graphics::segments(x0=tt_left,x1=pp[2],y0=pp[4],y1=pp[4],xpd=TRUE) + sst <- round(base::seq(0,mm,length.out=3)) + ss <- sst*tt/mm+tt_left + graphics::segments(x0=ss,x1=ss,y0=pp[4],y1=pp[4]+(pp[4]-pp[3])/150,xpd=TRUE) + graphics::text(x=ss,y=pp[4]+(pp[4]-pp[3])/100,srt=90,sst,xpd=TRUE,adj=0,cex=0.8) + text('Size',x=tt_left-tt/10,y=pp[4]+(pp[4]-pp[3])/50,adj=1,xpd=TRUE,cex=0.8) + ## + } + if(is.null(pdf_file)==FALSE){plot_part(ori=TRUE);plot_part(ori=TRUE,before_off=TRUE);while (!is.null(dev.list())) dev.off();} else {plot_part()} + graphics::layout(1); + return(TRUE) +} + + + +#' Merge Target Gene List for Two Drivers +#' +#' \code{merge_target_list} merges target gene list for two drivers together. Shared target genes with high "MI (mutual information)" statistics will be kept in the final target list. +#' +#' @param driver1 character, the name of the first driver. +#' @param driver2 character, the name of the second driver. +#' @param target_list list, the driver-to-target list object. The names of the list elements are drivers (e.g. driver1 and driver2). +#' Each element is a data frame, usually contains at least three columns. "target", target gene names; "MI", mutual information; "spearman", spearman correlation coefficient. +#' Users can call \code{get_net2target_list} to create this list. +#' @return Return a data.frame with rows of target genes, column of "target", "MI", "spearman". +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' driver1 <- ms_tab[1,'originalID_label'] +#' driver2 <- ms_tab[2,'originalID_label'] +#' m1 <- merge_target_list(driver1=driver1,driver2=driver2, +#' target_list=analysis.par$merge.network$target_list) +#' @export +merge_target_list <- function(driver1=NULL,driver2=NULL,target_list=NULL){ + # + all_input_para <- c('driver1','driver2','target_list') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + t1 <- target_list[driver1][[1]] + t2 <- target_list[driver2][[1]] + ov <- base::intersect(t1$target,t2$target) + rownames(t1) <- t1$target + rownames(t2) <- t2$target + if(base::length(ov)==0){ + t_out <- base::rbind(t1,t2) + }else{ + t_out1 <- base::rbind(t1[base::setdiff(rownames(t1),ov),],t2[base::setdiff(rownames(t2),ov),]) + t_out2 <- list() + for(each_ov in ov){ + x1 <- t1[each_ov,2:3] + x2 <- t2[each_ov,2:3] + if(x1[1]>x2[1]) t_out2[[each_ov]] <- t1[each_ov,] else t_out2[[each_ov]] <- t2[each_ov,] + } + t_out2 <- do.call(rbind,t_out2) + t_out <- base::rbind(t_out1,t_out2) + } + return(t_out) +} +# +#' Scatter Box Plot of Driver's Expression Values and Activity Values across Subgroup of Samples +#' +#' \code{draw.categoryValue} draws a scatter box plot to visualize one selected driver's expression value and activity value across different phenotype subgroups of samples. +#' Two side-by-side scatter box plots will be created. The left plot shows driver's activity values in different phenotype subgroups, each point is a sample. +#' The right plot shows driver's expression value in different phenotype subgroups, each point is a sample. +#' +#' @param ac_val a vector of numerics, the activity values of the selected driver across all samples. +#' @param exp_val a vector of numerics, the expression values of the selected driver across all samples. +#' @param use_obs_class a vector of characters, the category of sample. The order of samples here must match the order in \code{ac_val} and \code{exp_val}. +#' Users can call \code{get_obs_label} to create this vector. +#' @param class_order a vector of characters, the order of catefory (subgroup). +#' If NULL, will use the alphabetical order of the category (subgroup). Default is NULL. +#' @param category_color a vector of characters, a vector of color codes for each category in \code{class_order}. +#' If NULL, will call \code{get.class.color} to create the vector. Default is NULL. +#' @param stripchart_color character, the color of the scatter of points. Default is "black" with transparency alpha equals 0.7. +#' @param strip_cex numeric, giving the amount by which the size of scattered points should be magnified relative to the default. Default is 1. +#' @param class_srt numeric, rotation angle of the column labels (subgroup labels) at the bottom of the box plot. Default is 90. +#' @param class_cex numeric, giving the amount by which the text of category (subgroup) labels should be magnified relative to the default. Default is 1. +#' @param pdf_file character, the file path to save as PDF file. If NULL, no PDF file will be save. Default is NULL. +#' @param main_ac character, the main title of the plot to show activity values. Default is "". +#' @param main_exp character, the main title of the plot to show expression values. Default is "". +#' @param main_cex numeric, giving the amount by which the text of the main title should be magnified relative to the default. Default is 1. +#' @param use_color a vector of color codes, colors to be assigned to each member of display label. Default is brewer.pal(9, 'Set1'). +#' @param pre_define a vector of characters, pre-defined color codes for a certain input (e.g. c("blue", "red") with names c("A", "B")). Default is NULL. +#' +#' @return Return a logical value. If TRUE, the plot has been created successfully. +#' +#' @examples +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' driver_list <- rownames(sig_driver) +#' use_driver <- driver_list[3] +#' exp_mat <- Biobase::exprs(analysis.par$cal.eset) +#' ## expression,the rownames could match originalID +#' ac_mat <- Biobase::exprs(analysis.par$merge.ac.eset) +#' ## activity,the rownames could match originalID_label +#' phe_info <- Biobase::pData(analysis.par$cal.eset) +#' use_obs_class <- get_obs_label(phe_info = phe_info,'subgroup') +#' draw.categoryValue(ac_val=ac_mat[use_driver,], +#' exp_val=exp_mat[ms_tab[use_driver,'originalID'],], +#' use_obs_class=use_obs_class, +#' class_order=c('WNT','SHH','G4'), +#' class_srt=30, +#' main_ac = ms_tab[use_driver,'gene_label'], +#' main_exp=ms_tab[use_driver,'geneSymbol'], +#' pre_define=c('WNT'='blue','SHH'='red','G4'='green')) +#' \dontrun{ +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' ms_tab <- analysis.par$final_ms_tab +#' sig_driver <- draw.volcanoPlot(dat=ms_tab,label_col='gene_label', +#' logFC_col='logFC.G4.Vs.others_DA', +#' Pv_col='P.Value.G4.Vs.others_DA', +#' logFC_thre=0.4, +#' Pv_thre=1e-7, +#' main='Volcano Plot for G4.Vs.others_DA', +#' show_label=FALSE, +#' label_type = 'origin', +#' label_cex = 0.5) +#' driver_list <- rownames(sig_driver) +#' use_driver <- driver_list[3] +#' exp_mat <- Biobase::exprs(analysis.par$cal.eset) +#' ## rownames could match originalID +#' ac_mat <- Biobase::exprs(analysis.par$merge.ac.eset) +#' ## rownames could match originalID_label +#' phe_info <- Biobase::pData(analysis.par$cal.eset) +#' use_obs_class <- get_obs_label(phe_info = phe_info,'subgroup') +#' analysis.par$out.dir.PLOT <- getwd() ## directory for saving the pdf files +#' draw.categoryValue(ac_val=ac_mat[use_driver,], +#' exp_val=exp_mat[ms_tab[use_driver,'originalID'],], +#' use_obs_class=use_obs_class, +#' class_order=c('WNT','SHH','G4'), +#' class_srt=30, +#' main_ac = ms_tab[use_driver,'gene_label'], +#' main_exp=ms_tab[use_driver,'geneSymbol'], +#' pdf_file=sprintf('%s/categoryValue_demo1.pdf', +#' analysis.par$out.dir.PLOT)) +#'} +#' @export +draw.categoryValue <- function(ac_val=NULL,exp_val=NULL,use_obs_class=NULL,category_color=NULL, + stripchart_color=get_transparent('black',0.7),strip_cex=1,class_order=NULL,class_srt=90,class_cex=1,pdf_file=NULL, + main_ac="",main_exp="",main_cex=1, + use_color=NULL,pre_define=NULL){ + # + all_input_para <- c('ac_val','use_obs_class','strip_cex', + 'class_srt','class_cex','main_ac','main_exp','main_cex') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + use_obs_class <- clean_charVector(use_obs_class) + # + if(is.null(names(use_obs_class))==FALSE){ + if(is.null(ac_val)==FALSE) use_obs_class <- use_obs_class[names(ac_val)] + if(is.null(exp_val)==FALSE) use_obs_class <- use_obs_class[names(exp_val)] + } + if(is.null(class_order)){ + class_order <- sort(base::unique(use_obs_class)) + } + if(is.null(category_color)==TRUE){ + class_col <- get.class.color(class_order,use_color=use_color,pre_define=pre_define) + class_col1 <- get_transparent(class_col,0.5) + }else{ + class_col1 <- category_color + } + c1 <- 0 + if(is.null(ac_val)==FALSE){c1 <- c1+1} + if(is.null(exp_val)==FALSE){c1 <- c1+1} + labelWidth <- base::max(strwidthMod(class_order,'inches',cex=class_cex)*sin(class_srt*pi/180)) + if(is.null(pdf_file)==FALSE){ + hh <- 5+1.5+labelWidth + if(c1==1) pdf(pdf_file,width=1.5+3,height=hh) + if(c1==2) pdf(pdf_file,width=1.5+3*2,height=hh) + } + if(c1>1) graphics::layout(t(matrix(1:c1))) + par(mai=c(labelWidth+0.5,1,1,0.5)) + if(is.null(ac_val)==FALSE){ + ddf <- data.frame(data=ac_val,class=factor(use_obs_class,levels=class_order)) + a <- boxplot(data~class,data=ddf,ylab='Activity Value',col=class_col1,outline=FALSE,border='dark grey',cex.lab=1.2,names=NA,bty='n', + ylim=c(base::min(ddf$data),base::max(ddf$data)),main=main_ac,cex.main=main_cex,xlab='') + graphics::text(1:base::length(class_order),par()$usr[3]-(par()$usr[4]-par()$usr[3])/20,adj=0.5+class_srt/180,class_order,srt=class_srt,xpd=TRUE,cex=class_cex) + graphics::stripchart(data~class,data=ddf,add=TRUE,pch=16,method='jitter',vertical=TRUE,col=stripchart_color,cex=strip_cex) + } + if(is.null(exp_val)==FALSE){ + ddf <- data.frame(data=exp_val,class=factor(use_obs_class,levels=class_order)) + a <- boxplot(data~class,data=ddf,col=class_col1,ylab='Expression Value',outline=FALSE,border='dark grey',cex.lab=1.2,names=NA,bty='n', + ylim=c(base::min(ddf$data),base::max(ddf$data)),main=main_exp,cex.main=main_cex,xlab='') + graphics::text(1:base::length(class_order),par()$usr[3]-(par()$usr[4]-par()$usr[3])/20,adj=0.5+class_srt/180,class_order,srt=class_srt,xpd=TRUE,cex=class_cex) + graphics::stripchart(data~class,data=ddf,add=TRUE,pch=16,method='jitter',vertical=TRUE,col=stripchart_color,cex=strip_cex) + } + graphics::layout(1) + if(is.null(pdf_file)==FALSE) dev.off() + return(TRUE) +} + +### simple functions +get_transparent <- function(x,alpha=0.1){ + grDevices::rgb(t(grDevices::col2rgb(x)/255),alpha=alpha) +} + +get_label_manual <- function(x){ + x1 <- sapply(x,function(x2){ + x3 <- unlist(strsplit(as.character(x2),"")) + x4 <- base::length(x3)%/%3 ## add number + if(x4>0){ + pp <- base::length(x3)-base::seq(1,x4)*3; x3[pp] <- paste0(x3[pp],','); base::paste(x3,collapse="") + }else{ + x2 + } + }) + unlist(x1) +} + +#' Target Network Structure Plot for One Driver +#' +#' \code{draw.targetNet} draws a network structure to display the target genes of one selected driver. Edges of positively-regulated target genes are orange, +#' edges of negatively-regulated target genes are green. The width of the edges shows the strength of regulation. +#' +#' @param source_label character, the label of selected one driver. +#' @param source_z numeric, the Z-statistic of the selected driver. The color shade of driver's node in the network is decided by this Z-statistic. +#' If NULL, the driver node will be colored grey. Default is NULL. +#' @param edge_score a named vector of numerics, indicating the correlation between the driver and its target genes. The range of the numeric value is from -1 to 1. +#' Positive value means it is positively-regulated by driver and vice versa. The names of the vector are gene names. +#' @param label_cex numeric, giving the amount by which the text of target gene names should be magnified relative to the default. Default is 0.7. +#' @param source_cex numeric, giving the amount by which the text of driver name should be magnified relative to the default. Default is 1. +#' @param arrow_direction character, users can choose between "in" and "out". Default is "out". +#' @param pdf_file character, the file path to save as PDF file. If NULL, no PDF file will be saved. Default is NULL. +#' @param n_layer integer, number of circle layers to display. Default is 1. +#' @param alphabetical_order logical, if TRUE, the targe gene names will be sorted alphabetically. If FALSE, will be sorted by statistics. Default is FALSE. +#' +#' @return Return a logical value. If TRUE, the plot has been created successfully. +#' +#' @examples +#' source_label <- 'test1' +#' source_z <- 1.96 +#' edge_score <- (sample(1:200,size=100,replace=TRUE)-100)/100 +#' names(edge_score) <- paste0('G',1:100) +#' draw.targetNet(source_label=source_label,source_z=source_z, +#' edge_score=edge_score) +#' draw.targetNet(source_label=source_label,source_z=source_z, +#' edge_score=edge_score,n_layer=2) +#' draw.targetNet(source_label=source_label,source_z=source_z, +#' edge_score=edge_score, +#' arrow_direction='in', +#' source_cex=2) +#' \dontrun{ +#' source_label <- 'test1' +#' source_z <- 1.96 +#' edge_score <- (sample(1:200,size=100,replace=TRUE)-100)/100 +#' names(edge_score) <- paste0('G',1:100) +#' analysis.par <- list() +#' analysis.par$out.dir.PLOT <- getwd() +#' draw.targetNet(source_label=source_label,source_z=source_z, +#' edge_score=edge_score, +#' pdf_file=sprintf('%s/targetNet.pdf', +#' analysis.par$out.dir.PLOT)) +#'} +#' @export +draw.targetNet <- function(source_label="",source_z=NULL,edge_score=NULL, + label_cex=0.7,source_cex=1, + pdf_file=NULL,arrow_direction='out',n_layer=1,alphabetical_order=FALSE){ + # + all_input_para <- c('source_label','edge_score','label_cex','source_cex', + 'arrow_direction','n_layer','alphabetical_order') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('alphabetical_order',c(TRUE,FALSE),envir=environment()), + check_option('arrow_direction',c('in','out'),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + pos_col <- brewer.pal(12,'Paired')[8] + neg_col <- brewer.pal(12,'Paired')[4] + edge_score<- sort(edge_score) + tmp1 <- sapply(base::unique(names(edge_score)),function(x){ + x1 <- edge_score[which(names(edge_score)==x)] + x1[base::which.max(abs(x1))] + }) + names(tmp1) <- base::unique(names(edge_score)) + edge_score <- tmp1 + edge_score<- edge_score[order(edge_score,decreasing = TRUE)] + g1 <- names(edge_score) + ec <- z2col(edge_score*100,sig_thre=0,n_len=base::length(edge_score),red_col=pos_col,blue_col=neg_col);names(ec) <- names(edge_score) + ec <- get_transparent(ec,alpha=0.8); names(ec) <- names(edge_score) + ew <- 2*label_cex*(abs(edge_score)-base::min(abs(edge_score)))/(base::max(abs(edge_score))-base::min(abs(edge_score)))+label_cex/2; names(ew) <- names(edge_score) + if(base::max(abs(edge_score))-base::min(abs(edge_score))==0){ew <- rep(label_cex/2,length.out=length(edge_score)); names(ew) <- names(edge_score)} + t2xy <- function(tt,radius=1) { + t2p <- pi*2 * tt + list(x = radius * cos(t2p), y = radius * sin(t2p)) + } + if(alphabetical_order==TRUE) g1 <- sort(g1) + plot_part <- function(ori=FALSE,before_off=FALSE){ + geneWidth <- base::max(strwidthMod(g1,'inches',cex=label_cex)) + if(before_off==TRUE) dev.off() + if(is.null(pdf_file)==FALSE) pdf(pdf_file,width=6+2*geneWidth*n_layer,height=6+2*geneWidth*n_layer) + par(mai=c(1,1,1,1)) + graphics::plot(1,xlim=c(-1,1),ylim=c(-1,1),bty='n',col='white',xlab="",ylab="",xaxt='n',yaxt='n') + pp <- par()$usr + ## add target label + #tt <- seq(-0.5,0.5,length.out=base::length(g1)+1)[-1]; ## g1 + rad_v <- base::seq(from=0.5,to=1,length.out=n_layer) + if(n_layer==1) rad_v=0.8 + #tmp1 <- ceiling(base::length(g1)/sum(rad_v)*rad_v) + uu <- ceiling(base::length(g1)/base::length(rad_v)) + tmp1 <- rep(uu,length.out=base::length(rad_v)) + if(n_layer>1) tmp1[base::length(tmp1)] <- base::length(g1)-sum(tmp1[1:(base::length(tmp1)-1)]) else tmp1 <- base::length(g1) + tmp1<-cumsum(tmp1) + all_g1 <- list() + for(i in 1:n_layer){ + if(i==1) all_g1[[i]] <- g1[1:tmp1[i]] else all_g1[[i]] <- g1[(tmp1[i-1]+1):tmp1[i]] + } + if(alphabetical_order==FALSE) all_g1 <- lapply(all_g1,function(x)x[order(edge_score[x])]) + #all_tt <- lapply(1:n_layer,function(i)seq(-0.5,0.5,length.out=base::length(all_g1[[i]])+1)[-1]) + all_tt <- lapply(1:n_layer,function(i){ + if(i==1) return(seq(-0.5,0.5,length.out=uu+1)[-1][1:base::length(all_g1[[i]])]) + if(i>1) return(seq(-0.5-(i-1)/(n_layer*uu),0.5-(i-1)/(n_layer*uu),length.out=uu+1)[-1][1:base::length(all_g1[[i]])]) + }) + all_p <- lapply(1:n_layer,function(i)t2xy(all_tt[[i]],radius=rad_v[i])) + # add line + for(i in 1:n_layer){ + each_v <- rad_v[i] + p1 <- all_p[[i]] + g1_use <- all_g1[[i]] + tt <- all_tt[[i]] + geneWidth <- strwidthMod(source_label,'user',cex=source_cex) + if(arrow_direction=='out'){ + p2<-t2xy(tt,radius=each_v-label_cex/36); + p3<-t2xy(tt,radius=each_v-label_cex/48); + graphics::arrows(x0=0,y0=0,x1=p2$x,y1=p2$y,col=ec[g1_use],lwd=ew[g1_use],angle=10,length=0.1*label_cex,xpd=TRUE); + }else{ + p2<-t2xy(tt,radius=each_v-label_cex/36); + p3<-t2xy(tt,radius=each_v-label_cex/36); + p4<-t2xy(tt,radius=geneWidth/2); + graphics::arrows(x0=p2$x,y0=p2$y,x1=p4$x,y1=p4$y,col=ec[g1_use],lwd=ew[g1_use],angle=5,length=0.1*label_cex,xpd=TRUE); + } + graphics::points(p3$x,p3$y,pch=16,col='dark grey',cex=label_cex) + } + # add label + for(j in 1:n_layer){ + p1 <- all_p[[j]] + g1_use <- all_g1[[j]] + for(i in 1:base::length(p1$x)) graphics::text(p1$x[i],p1$y[i],g1_use[i],cex=label_cex,srt=180*atan(p1$y[i]/p1$x[i])/pi,adj=ifelse(p1$x[i]>0,0,1),xpd=TRUE) + } + ## add source label + if(is.null(source_z)==TRUE){ + draw.ellipse(0,0,a=1.05*geneWidth/2,b=1.05*geneWidth/2,col='light grey',border=NA) + }else{ + draw.ellipse(0,0,a=1.05*geneWidth/2,b=1.05*geneWidth/2,col=z2col(source_z),border=NA) + } + graphics::text(0,0,source_label,adj=0.5,xpd=TRUE,cex=source_cex) + graphics::legend(x=pp[1],y=pp[3],fill=c(pos_col,neg_col),c('Positively-regulated','Negatively-regulated'),bty='n',xpd=T,border=NA,cex=label_cex,horiz = FALSE) + } + if(is.null(pdf_file)==FALSE){plot_part(ori=TRUE);plot_part(ori=TRUE,before_off=TRUE);while (!is.null(dev.list())) dev.off()} else {plot_part()} + return(TRUE) +} + + +#' Target Ntwork Structure Plot for Two Drivers +#' +#' \code{draw.targetNet.TWO} draws a network structure to display the target genes of two selected drivers. Edges of positively-regulated target genes are orange, +#' edges of negatively-regulated target genes are green. The width of the edges shows the strength of regulation. +#' It will also print out the number of shared and unique targe genes for each driver, with P-value and odds ratio. +#' +#' @param source1_label character, the label of the first selected driver (to be displayed on the left). +#' @param source2_label character, the label of the second selected driver (to be displayed on the right). +#' @param source1_z numeric, the Z-statistic of the first driver. The color shade of driver’s node in the network is decided by this Z-statistic. +#' If NULL, the driver will be colored grey. Default is NULL. +#' @param source2_z numeric, the Z-statistic of the second driver.The color shade of driver’s node in the network is decided by this Z-statistic. +#' If NULL, the driver will be colored grey. Default is NULL. +#' @param edge_score1 a named vector of numerics, indicating the correlation between the first driver and its target genes. +#' The range of the numeric value is from -1 to 1. Positive value means it is positively-regulated by driver and vice versa. The names of the vector are gene names. +#' @param edge_score2 a named vector of numerics, indicating the correlation between the seconde driver and its target genes. +#' The range of the numeric value is from -1 to 1. Positive value means it is positively-regulated by driver and vice versa. The names of the vector are gene names. +#' @param arrow_direction1 character, the arrow direction for first driver. Users can choose between "in" and "out". Default is "out". +#' @param arrow_direction2 character, the arrow direction for second driver. Users can choose between "in" and "out". Default is "out". +#' @param label_cex numeric, giving the amount by which the text of target gene names should be magnified relative to the default. Default is 0.7. +#' @param source_cex numeric, giving the amount by which the text of driver name should be magnified relative to the default. Default is 1. +#' @param pdf_file character, the file path to save as PDF file. If NULL, no PDF file will be saved. Default is NULL. +#' @param total_possible_target numeric or a vector of characters. If input is numeric, it is the total number of possible target genes. +#' If input is a vector of characters, it is the background list of all possible target genes. +#' This parameter will be passed to function \code{test.targetNet.overlap} to test whether the target genes of the two drivers are significantly intersected. +#' If NULL, will do not perform this test. Default is NULL. +#' @param show_test logical, if TRUE, the test result will be printed and returned. Default is FALSE. +#' @param n_layer integer, number of circle layers to display. Default is 1. +#' @param alphabetical_order logical, if TRUE, the targe gene names will be sorted alphabetically. If FALSE, will be sorted by statistics. Default is FALSE. +#' @return If \code{show_test}==FALSE, will return a logical value indicating whether the plot has been successfully generated, +#' otherwise will return the statistics of testing when total_possible_target is not NULL. +#' +#' @examples +#' source1_label <- 'test1' +#' source1_z <- 1.96 +#' edge_score1 <- (sample(1:160,size=80,replace=TRUE)-80)/80 +#' names(edge_score1) <- sample(paste0('G',1:1000),size=80) +#' source2_label <- 'test2' +#' source2_z <- -2.36 +#' edge_score2 <- (sample(1:240,size=120,replace=TRUE)-120)/120 +#' names(edge_score2) <- sample(paste0('G',1:1000),size=120) +#' draw.targetNet.TWO(source1_label=source1_label, +#' source2_label=source2_label, +#' source1_z=source1_z,source2_z=source2_z, +#' edge_score1=edge_score1,edge_score2=edge_score2, +#' total_possible_target=paste0('G',1:1000), +#' show_test=TRUE,label_cex=0.6) +#' draw.targetNet.TWO(source1_label=source1_label, +#' source2_label=source2_label, +#' source1_z=source1_z,source2_z=source2_z, +#' edge_score1=edge_score1,edge_score2=edge_score2, +#' total_possible_target=paste0('G',1:1000), +#' show_test=TRUE,label_cex=0.6,n_layer=2) +#' +#' \dontrun{ +#' source1_label <- 'test1' +#' source1_z <- 1.96 +#' edge_score1 <- (sample(1:160,size=100,replace=TRUE)-80)/80 +#' names(edge_score1) <- sample(paste0('G',1:1000),size=100) +#' source2_label <- 'test2' +#' source2_z <- -2.36 +#' edge_score2 <- (sample(1:240,size=100,replace=TRUE)-120)/120 +#' names(edge_score2) <- sample(paste0('G',1:1000),size=100) +#' analysis.par <- list() +#' analysis.par$out.dir.PLOT <- getwd() +#' draw.targetNet.TWO(source1_label=source1_label, +#' source2_label=source2_label, +#' source1_z=source1_z,source2_z=source2_z, +#' edge_score1=edge_score1,edge_score2=edge_score2, +#' total_possible_target=paste0('G',1:1000),show_test=TRUE, +#' pdf_file=sprintf('%s/targetNetTWO.pdf', +#' analysis.par$out.dir.PLOT)) +#' } +#' @export +draw.targetNet.TWO <- function(source1_label="",source2_label="", + source1_z=NULL,source2_z=NULL, + edge_score1=NULL,edge_score2=NULL, + arrow_direction1='out',arrow_direction2='out', + label_cex=0.7,source_cex=1,pdf_file=NULL, + total_possible_target=NULL,show_test=FALSE,n_layer=1,alphabetical_order=FALSE){ + # + all_input_para <- c('source1_label','source2_label','source1_z','source2_z', + 'edge_score1','edge_score2','label_cex','source_cex', + 'arrow_direction1','arrow_direction2', + 'n_layer','alphabetical_order','show_test') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('alphabetical_order',c(TRUE,FALSE),envir=environment()), + check_option('show_test',c(TRUE,FALSE),envir=environment()), + check_option('arrow_direction1',c('in','out'),envir=environment()), + check_option('arrow_direction2',c('in','out'),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + pos_col <- brewer.pal(12,'Paired')[8] + neg_col <- brewer.pal(12,'Paired')[4] + tmp1 <- sapply(base::unique(names(edge_score1)),function(x){ + x1 <- edge_score1[which(names(edge_score1)==x)];x1[base::which.max(abs(x1))] + }) + names(tmp1) <- base::unique(names(edge_score1));edge_score1 <- tmp1 + tmp1 <- sapply(base::unique(names(edge_score2)),function(x){ + x1 <- edge_score2[which(names(edge_score2)==x)];x1[base::which.max(abs(x1))] + }) + names(tmp1) <- base::unique(names(edge_score2));edge_score2 <- tmp1 + edge_score1<- sort(edge_score1,decreasing = FALSE) + edge_score2<- sort(edge_score2,decreasing = TRUE) + g12 <- base::intersect(names(edge_score1),names(edge_score2)) + g1 <- base::setdiff(names(edge_score1),names(edge_score2)) + g2 <- base::setdiff(names(edge_score2),names(edge_score1)) + ec1 <- z2col(edge_score1*100,sig_thre=0,n_len=base::length(edge_score1),red_col=pos_col,blue_col=neg_col);names(ec1) <- names(edge_score1) + ec2 <- z2col(edge_score2*100,sig_thre=0,n_len=base::length(edge_score2),red_col=pos_col,blue_col=neg_col);names(ec2) <- names(edge_score2) + ew1 <- 2*label_cex*(abs(edge_score1)-base::min(abs(edge_score1)))/(base::max(abs(edge_score1))-base::min(abs(edge_score1)))+label_cex/2; names(ew1) <- names(edge_score1) + ew2 <- 2*label_cex*(abs(edge_score2)-base::min(abs(edge_score2)))/(base::max(abs(edge_score2))-base::min(abs(edge_score2)))+label_cex/2; names(ew2) <- names(edge_score2) + t2xy <- function(tt,radius=1,init.angle=0) { + t2p <- pi*2 * tt + init.angle * pi/180 + list(x = radius * cos(t2p), y = radius * sin(t2p)) + } + #g1|g12|g2 + if(alphabetical_order==TRUE){ + g1 <- sort(g1); + g2 <- sort(g2); + g12 <- sort(g12); + } + plot_part <- function(ori=FALSE,before_off=FALSE){ + geneWidth <- base::max(strwidthMod(g1,'inches',cex=label_cex)) + if(before_off==TRUE) dev.off() + if(is.null(pdf_file)==FALSE) pdf(pdf_file,width=10+4*geneWidth,height=8+2*geneWidth) + graphics::plot(1,xlim=c(-1.4,1.4),ylim=c(-1,1),bty='n',col='white',xlab="",ylab="",xaxt='n',yaxt='n') + par(mai=c(1,1,1,1)) + + geneWidth1 <- strwidthMod(source1_label,'user',cex=source_cex) + geneWidth2 <- strwidthMod(source2_label,'user',cex=source_cex) + geneWidth <- base::max(geneWidth1,geneWidth2) + + ag <- 0.245-0.01*n_layer + lp <- 0.4 + rad_v <- base::seq(from=0.6,to=1,length.out=n_layer) + if(n_layer==1) rad_v=0.8 + + if(base::length(g1)>0){ + # get info + uu <- ceiling(base::length(g1)/base::length(rad_v)) + tmp1 <- rep(uu,length.out=base::length(rad_v)) + if(n_layer>1) tmp1[base::length(tmp1)] <- base::length(g1)-sum(tmp1[1:(base::length(tmp1)-1)]) else tmp1 <- base::length(g1) + tmp1<-cumsum(tmp1) + all_g1 <- list() + for(i in 1:n_layer){ + if(i==1) all_g1[[i]] <- g1[1:tmp1[i]] else all_g1[[i]] <- g1[(tmp1[i-1]+1):tmp1[i]] + } + #all_g1 <- lapply(all_g1,function(x)x[order(edge_score[x])]) + all_tt <- lapply(1:n_layer,function(i){ + if(i==1) return(seq(-ag,ag,length.out=uu)[1:base::length(all_g1[[i]])]) + if(i>1) return(seq(-ag-2*ag*(i-1)/(n_layer*uu),ag-2*ag*(i-1)/(n_layer*uu),length.out=uu)[1:base::length(all_g1[[i]])]) + }) + all_p <- lapply(1:n_layer,function(i)t2xy(all_tt[[i]],radius=rad_v[i],init.angle= -180)) + + # add line + for(i in 1:n_layer){ + each_v <- rad_v[i] + p1 <- all_p[[i]] + g1_use <- all_g1[[i]] + tt <- all_tt[[i]] + if(arrow_direction1=='out'){ + p2<-t2xy(tt,radius=each_v-label_cex/36,init.angle= -180); + p3<-t2xy(tt,radius=each_v-label_cex/48,init.angle= -180); + graphics::arrows(x0=-lp,y0=0,x1=p2$x-lp,y1=p2$y,col=ec1[g1_use],lwd=ew1[g1_use],angle=10,length=0.1*label_cex,xpd=TRUE); + }else{ + p2<-t2xy(tt,radius=each_v-label_cex/36,init.angle= -180); + p3<-t2xy(tt,radius=each_v-label_cex/36,init.angle= -180); + p4<-t2xy(tt,radius=geneWidth/2,init.angle= -180); + graphics::arrows(x0=p2$x-lp,y0=p2$y,x1=p4$x-lp,y1=p4$y,col=ec1[g1_use],lwd=ew1[g1_use],angle=5,length=0.1*label_cex,xpd=TRUE); + } + graphics::points(p3$x-lp,p3$y,pch=16,col='dark grey',cex=label_cex) + } + # add label + for(j in 1:n_layer){ + p1 <- all_p[[j]] + g1_use <- all_g1[[j]] + for(i in 1:base::length(p1$x)) graphics::text(p1$x[i]-lp,p1$y[i],g1_use[i],cex=label_cex,srt=180*atan(p1$y[i]/p1$x[i])/pi,adj=ifelse(p1$x[i]>0,0,1),xpd=TRUE) + } + } + ## + if(base::length(g2)>0){ + # get info + uu <- ceiling(base::length(g2)/base::length(rad_v)) + tmp1 <- rep(uu,length.out=base::length(rad_v)) + if(n_layer>1) tmp1[base::length(tmp1)] <- base::length(g2)-sum(tmp1[1:(base::length(tmp1)-1)]) else tmp1 <- base::length(g2) + tmp1<-cumsum(tmp1) + all_g1 <- list() + for(i in 1:n_layer){ + if(i==1) all_g1[[i]] <- g2[1:tmp1[i]] else all_g1[[i]] <- g2[(tmp1[i-1]+1):tmp1[i]] + } + #all_g1 <- lapply(all_g1,function(x)x[order(edge_score[x])]) + all_tt <- lapply(1:n_layer,function(i){ + if(i==1) return(seq(-ag,ag,length.out=uu)[1:base::length(all_g1[[i]])]) + if(i>1) return(seq(-ag-2*ag*(i-1)/(n_layer*uu),ag-2*ag*(i-1)/(n_layer*uu),length.out=uu)[1:base::length(all_g1[[i]])]) + }) + all_p <- lapply(1:n_layer,function(i)t2xy(all_tt[[i]],radius=rad_v[i],init.angle=0)) + + # add line + for(i in 1:n_layer){ + each_v <- rad_v[i] + p1 <- all_p[[i]] + g1_use <- all_g1[[i]] + tt <- all_tt[[i]] + if(arrow_direction2=='out'){ + p2<-t2xy(tt,radius=each_v-label_cex/36); + p3<-t2xy(tt,radius=each_v-label_cex/48); + graphics::arrows(x0=lp,y0=0,x1=p2$x+lp,y1=p2$y,col=ec2[g1_use],lwd=ew2[g1_use],angle=10,length=0.1*label_cex,xpd=TRUE); + }else{ + p2<-t2xy(tt,radius=each_v-label_cex/36); + p3<-t2xy(tt,radius=each_v-label_cex/36); + p4<-t2xy(tt,radius=geneWidth/2); + graphics::arrows(x0=p2$x+lp,y0=p2$y,x1=p4$x+lp,y1=p4$y,col=ec2[g1_use],lwd=ew2[g1_use],angle=5,length=0.1*label_cex,xpd=TRUE); + } + graphics::points(p3$x+lp,p3$y,pch=16,col='dark grey',cex=label_cex) + } + # add label + for(j in 1:n_layer){ + p1 <- all_p[[j]] + g1_use <- all_g1[[j]] + for(i in 1:base::length(p1$x)) graphics::text(p1$x[i]+0.4,p1$y[i],g1_use[i],cex=label_cex,srt=180*atan(p1$y[i]/p1$x[i])/pi,adj=ifelse(p1$x[i]>0,0,1),xpd=TRUE) + } + } + ## + if(base::length(g12)>0){ + rm <- base::min(0.1*base::length(g12),1) + dd <- par.char2pos()[2]/2; nr <-ceiling(base::length(g12)/(rm*2/dd));each_col_n<-ceiling(base::length(g12)/nr) + tt <- base::seq(rm,-rm,length.out=each_col_n); + xx <- seq(-lp+geneWidth,lp-geneWidth,length.out=nr) + if(nr==1) xx<-0 + tt <- unlist(lapply(tt,function(x)rep(x,length.out=nr)))[1:base::length(g12)] + xx <- rep(xx,length.out=base::length(xx)*each_col_n)[1:base::length(g12)] + + if(arrow_direction1=='out'){ + graphics::arrows(x0=-lp,y0=0,x1=xx,y1=tt,col=ec1[g12],lwd=ew1[g12],angle=10,length=0.1*label_cex,xpd=TRUE) + }else{ + p4<-t2xy(tt,radius=geneWidth/2); + graphics::arrows(x0=xx,y0=tt,x1=-lp,y1=0,col=ec1[g12],lwd=ew1[g12],angle=5,length=0.1*label_cex,xpd=TRUE); + } + if(arrow_direction2=='out'){ + graphics::arrows(x0=lp,y0=0,x1=xx,y1=tt,col=ec2[g12],lwd=ew2[g12],angle=10,length=0.1*label_cex,xpd=TRUE) + }else{ + p4<-t2xy(tt,radius=geneWidth/2); + graphics::arrows(x0=xx,y0=tt,x1=lp,y1=0,col=ec1[g12],lwd=ew1[g12],angle=5,length=0.1*label_cex,xpd=TRUE); + } + boxtext(xx,tt,labels=g12,col.bg=get_transparent('light grey',0.3),cex=label_cex) + } + if(is.null(source2_z)==TRUE) + draw.ellipse(lp,0,a=geneWidth/2,b=geneWidth/2,col='light grey',border=NA) + else + draw.ellipse(lp,0,a=geneWidth/2,b=geneWidth/2,col=z2col(source2_z),border=NA) + + graphics::text(lp,0,source2_label,adj=0.5,cex=source_cex) + + if(is.null(source1_z)==TRUE) + draw.ellipse(-lp,0,a=geneWidth/2,b=geneWidth/2,col='light grey',border=NA) + else + draw.ellipse(-lp,0,a=geneWidth/2,b=geneWidth/2,col=z2col(source1_z),border=NA) + text(-lp,0,source1_label,adj=0.5,cex=source_cex) + pp <- par()$usr + graphics::legend(x=pp[1],y=pp[3],fill=c(pos_col,neg_col),c('Positively-regulated','Negatively-regulated'),bty='n',xpd=T,border=NA,cex=label_cex,horiz = TRUE) + } + # fisher test for target + if(is.null(pdf_file)==FALSE){plot_part(ori=TRUE);plot_part(ori=TRUE,before_off=TRUE);while (!is.null(dev.list())) dev.off()} else {plot_part()} + + if(is.null(total_possible_target)==FALSE & show_test==TRUE){ + res <- test.targetNet.overlap(source1_label,source2_label,names(edge_score1),names(edge_score2),total_possible_target) + return(res) + } + return(TRUE) +} + +#' Test for Intersection of Target Genes between Two Drivers +#' +#' \code{test.targetNet.overlap} performs Fisher's exact test to see whether the target genes from two drivers are significantly intersected. +#' +#' +#' @param source1_label character, the label of the first selected driver. +#' @param source2_label character, the label of the second selected driver. +#' @param target1 a vector of characters, the list of target genes from the first driver. +#' @param target2 a vector of characters, the list of target genes from the second driver. +#' @param total_possible_target numeric or a vector of characters. If input is numeric, it is the total number of possible target genes. +#' If input is a vector of characters, it is the background list of all possible target genes. +#' +#' @return Return statistics of the testing, including the \code{P.Value}, \code{Odds_Ratio} and \code{Intersected_Number}. +#' +#' @examples +#' source1_label <- 'test1' +#' target1 <- sample(paste0('G',1:1000),size=80) +#' source2_label <- 'test2' +#' target2 <- sample(paste0('G',1:1000),size=120) +#' test.targetNet.overlap(source1_label=source1_label,source2_label=source2_label, +#' target1=target1,target2=target2, +#' total_possible_target=paste0('G',1:1000)) +#' \dontrun{ +#' } +#' @export +test.targetNet.overlap <- function(source1_label=NULL,source2_label=NULL, + target1=NULL,target2=NULL, + total_possible_target=NULL){ + # + all_input_para <- c('source1_label','source2_label','target1','target2') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + t1 <- base::unique(target1) + t2 <- base::unique(target2) + print(sprintf('%s has %d unique targets !',source1_label,base::length(t1))) + print(sprintf('%s has %d unique targets !',source2_label,base::length(t2))) + n11 <- base::length(base::intersect(t1,t2)) + n12 <- base::length(base::setdiff(t1,t2)) + n21 <- base::length(base::setdiff(t2,t1)) + if(class(total_possible_target) %in% c('integer','numeric')){ + n22 <- total_possible_target-base::length(union(t1,t2)) + }else{ + n22 <- base::length(base::setdiff(total_possible_target,c(t1,t2))) + } + mm <- base::cbind(c(n11,n21),c(n12,n22)) + ft <- fisher.test(mm)$p.value + or <- n11/n12/(n21/n22) + rownames(mm) <- c(sprintf('In %s target',source1_label),sprintf('Not in %s target',source1_label)) + colnames(mm) <- c(sprintf('In %s target',source2_label),sprintf('Not in %s target',source2_label)) + print(mm) + res <- c('P.Value'=ft,'Odds_Ratio'=or,'Intersected_Number'=mm[1,1]) + return(res) +} + +#' Convert Pairwise Network Data Frame to Driver-to-Target List +#' +#' \code{get_net2target_list} is a helper function in the \code{get.SJAracne.network}. +#' But if users have their own pairwise gene network files, they can convert it to driver-to-target list object. +#' +#' @param net_dat data.frame, must contain two columns with column names "source" (driver) and "target" (target genes). +#' "MI" (mutual information) and "spearman" (spearman correlation coefficient) columns are optional, but strongly suggested to use. +#' If "MI" and "spearman" columns are missing, errors may occur in some following steps (e.g. es.method='weightedmean' in \code{cal.Activity}). +#' +#' @return Return a list. The names of the list elements are drivers. +#' Each element is a data frame, contains three columns. "target", target gene names; +#' "MI", mutual information; "spearman", spearman correlation coefficient. +#' +#' @examples +#' tf.network.file <- sprintf('%s/demo1/network/SJAR/project_2019-02-14/%s/%s', +#' system.file(package = "NetBID2"), +#' 'output_tf_sjaracne_project_2019-02-14_out_.final', +#' 'consensus_network_ncol_.txt') +#' net_dat <- read.delim(file=tf.network.file,stringsAsFactors = FALSE) +#' target_list <- get_net2target_list(net_dat) +#' @export +get_net2target_list <- function(net_dat=NULL) { + all_input_para <- c('net_dat') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + all_source <- base::unique(net_dat$source) + all_target <- lapply(all_source, function(x) { + n1 <- net_dat[which(net_dat$source == x), base::intersect(c('target', 'MI', 'spearman'),colnames(net_dat))] + if(class(n1)=='character') n1 <- data.frame('target'=n1,'MI'=1,'spearman'=1,stringsAsFactors=F) + n1 <- unique(n1) + if(length(unique(n1$target))!=length(n1$target)){ ## multiple + t1 <- table(n1$target) + w1 <- names(which(t1==1)); w2 <- names(which(t1>1)) + w21 <- n1[which(n1$target %in% w1),] + w22 <- do.call(rbind,lapply(w2,function(x){ + x1 <- n1[which(n1$target==x),] + x1 <- x1[which.max(x1$MI),] + })) + n1 <- rbind(w21,w22) + } + rownames(n1) <- n1$target + return(n1) + }) + names(all_target) <- all_source + return(all_target) +} + +#' Read SJARACNe Network Result and Return it as List Object +#' +#' \code{get.SJAracne.network} reads SJARACNe network construction result and returns a list object +#' with network data frame, driver-to-target list and igraph object wrapped inside. +#' +#' In the demo, "consensus_network_ncol_.txt" file will be read and convert into a list object. +#' This list contains three elements, \code{network_data}, \code{target_list} and \code{igraph_obj}. +#' \code{network_dat} is a data.frame, contains all the information of the network SJARACNe constructed. +#' \code{target_list} is a driver-to-target list object. Please check details in \code{get_net2target_list}. +#' \code{igraph_obj} is an igraph object used to save this directed and weighted network. +#' Each edge of the network has two attributes, \code{weight} and \code{sign}. +#' \code{weight} is the "MI (mutual information)" value and \code{sign} is the sign of the spearman +#' correlation coefficient (1, positive regulation; -1, negative regulation). +#' +#' @param network_file character, the path for storing network file. For the output of SJAracne, the name of the network file will be "consensus_network_ncol_.txt" under the output directory. +#' +#' @return Return a list containing three elements, \code{network_dat}, \code{target_list} and \code{igraph_obj}. +#' +#' @examples +#' if(exists('analysis.par')==TRUE) rm(analysis.par) +#' network.dir <- sprintf('%s/demo1/network/',system.file(package = "NetBID2")) # use demo +#' network.project.name <- 'project_2019-02-14' # demo project name +#' project_main_dir <- 'test/' +#' project_name <- 'test_driver' +#' analysis.par <- NetBID.analysis.dir.create(project_main_dir=project_main_dir, +#' project_name=project_name, +#' network_dir=network.dir, +#' network_project_name=network.project.name) +#' analysis.par$tf.network <- get.SJAracne.network(network_file=analysis.par$tf.network.file) +#' analysis.par$sig.network <- get.SJAracne.network(network_file=analysis.par$sig.network.file) +#' +#' \dontrun{ +#' } +#' @export +get.SJAracne.network <- function(network_file=NULL){ + all_input_para <- c('network_file') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + net_dat <- read.delim(file=network_file,stringsAsFactors = FALSE) + target_list <- get_net2target_list(net_dat) + igraph_obj <- graph_from_data_frame(net_dat[,c('source','target')],directed=TRUE) ## add edge weight ??? + if('MI' %in% colnames(net_dat)) igraph_obj <- set_edge_attr(igraph_obj,'weight',index=E(igraph_obj),value=net_dat[,'MI']) + if('spearman' %in% colnames(net_dat)) igraph_obj <- set_edge_attr(igraph_obj,'sign',index=E(igraph_obj),value=sign(net_dat[,'spearman'])) + return(list(network_dat=net_dat,target_list=target_list,igraph_obj=igraph_obj)) +} + +#' Update Network List Object Using Constraints +#' +#' \code{update_SJAracne.network} updates the network object created by \code{get.SJAracne.network}, using constraints like statistical thresholds and interested gene list. +#' +#' @param network_list list, the network list object created by \code{get.SJAracne.network}. +#' The list contains three elements, \code{network_dat}, \code{target_list} and \code{igraph_obj}. For details, please check \code{get.SJAracne.network}. +#' @param all_possible_drivers a vector of characters, all possible drivers used to filter the network. +#' If NULL, will use drivers from \code{network_list}. Default is NULL. +#' @param all_possible_targets a vector of characters, all possible target genes used to filter the network. +#' If NULL, will use targets from \code{network_list}. Default is NULL. +#' @param force_all_drivers logical, if TRUE, will include all drivers from \code{all_possible_drivers} into the final network. +#' For \code{network_dat} and \code{target_list} in the network list object, all genes in \code{all_possible_drivers} will not be filtered using the following statistical thresholds. +#' For \code{igraph_obj} in the network list object, all genes in \code{all_possible_drivers} that don't exist in the original network, will be kept as vertices. +#' Default is TRUE. +#' @param force_all_targets logical, if TRUE, will include all genes from \code{all_possible_targets} into the final network. +#' For \code{network_dat} and \code{target_list} in the network list object, all genes in \code{all_possible_drivers} will not be filtered using the following statistical thresholds. +#' For \code{igraph_obj} in the network list object, all genes in \code{all_possible_drivers} that don't exist in the original network, will be kept as vertices. +#' Default is TRUE. +#' @param min_MI numeric, minimum threshold for "MI (mutual information)". Default is 0. +#' @param max_p.value numeric, maximum threshold for P-value. Default is 1. +#' @param min_spearman_value numeric, minimum threshold for spearman absolute value. Default is 0. +#' @param min_pearson_value numeric, minimum threshold for pearson absolute value. Default is 0. +#' @param spearman_sign_use a vector of numerics, users can choose from 1, -1 and c(1, -1). 1 means only positve spearman values will be used. +#' -1 means only negative spearman values will be used. Default is c(1,-1). +#' @param pearson_sign_use a vector of numerics, users can choose from 1, -1 and c(1, -1). 1 means only positve pearson values will be used. +#' -1 means only negative pearson values will be used. Default is c(1,-1). +#' @param directed logical, if TRUE, the network is a directed graph. Default is TRUE. +#' @param weighted logical, if TRUE, the network is weighted. Default is TRUE. +#' +#' @return Return a list containing three elements, \code{network_dat}, \code{target_list} and \code{igraph_obj}. +#' +#' @examples +#' if(exists('analysis.par')==TRUE) rm(analysis.par) +#' network.dir <- sprintf('%s/demo1/network/',system.file(package = "NetBID2")) # use demo +#' network.project.name <- 'project_2019-02-14' # demo project name +#' project_main_dir <- 'test/' +#' project_name <- 'test_driver' +#' analysis.par <- NetBID.analysis.dir.create(project_main_dir=project_main_dir, +#' project_name=project_name, +#' network_dir=network.dir, +#' network_project_name=network.project.name) +#' analysis.par$tf.network <- get.SJAracne.network(network_file=analysis.par$tf.network.file) +#' analysis.par$sig.network <- get.SJAracne.network(network_file=analysis.par$sig.network.file) +#' all_possible_drivers <- c(names(analysis.par$tf.network$target_list)[1:1000], +#' 'addition_driver_1','addition_driver_2') +#' tf.network.update <- update_SJAracne.network(network_list=analysis.par$tf.network, +#' all_possible_drivers=all_possible_drivers, +#' force_all_drivers=TRUE, +#' force_all_targets=FALSE, +#' pearson_sign_use=1) +#' print(base::intersect(c('addition_driver_1','addition_driver_2'), +#' names(V(tf.network.update$igraph_obj)))) ## check +#' \dontrun{ +#' } +#' @export update_SJAracne.network +update_SJAracne.network <- function(network_list=NULL, + all_possible_drivers=NULL, + all_possible_targets=NULL, + force_all_drivers=TRUE, + force_all_targets=TRUE, + min_MI=0,max_p.value=1, + min_spearman_value=0, + min_pearson_value=0, + spearman_sign_use=c(1,-1), + pearson_sign_use=c(1,-1), + directed=TRUE,weighted=TRUE +){ + # + all_input_para <- c('network_list','force_all_drivers','force_all_targets', + 'min_MI','max_p.value','min_spearman_value','min_pearson_value', + 'spearman_sign_use','pearson_sign_use','directed','weighted') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('force_all_drivers',c(TRUE,FALSE),envir=environment()), + check_option('force_all_targets',c(TRUE,FALSE),envir=environment()), + check_option('directed',c(TRUE,FALSE),envir=environment()), + check_option('weighted',c(TRUE,FALSE),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + n1 <- names(network_list) + n2 <- c('network_dat','target_list','igraph_obj') + if(base::length(base::setdiff(n2,n1))>0){ + message(sprintf('%s not included in the network_list, please chech and re-try !',base::paste(base::setdiff(n2,n1),collapse=';'))); + return(FALSE) + } + ori_net_dat <- network_list$network_dat + net_dat <- ori_net_dat + if(is.null(all_possible_drivers)==TRUE) all_possible_drivers <- base::unique(net_dat$source) + if(is.null(all_possible_targets)==TRUE) all_possible_targets <- base::unique(net_dat$target) + # basic statistics filter + w1 <- which(net_dat$MI>=min_MI & net_dat$p.value<=max_p.value + & abs(net_dat$pearson)>=min_pearson_value & + abs(net_dat$spearman)>=min_spearman_value) + all_w1 <- w1 ## use rows + # sign choose + if(!1 %in% spearman_sign_use){ ## do not use the positive ones + w1 <- which(net_dat$spearman<=0) + all_w1 <- base::setdiff(all_w1,w1) + } + if(!-1 %in% spearman_sign_use){ ## do not use the negative ones + w1 <- which(net_dat$spearman>=0) + all_w1 <- base::setdiff(all_w1,w1) + } + if(!1 %in% pearson_sign_use){ ## do not use the positive ones + w1 <- which(net_dat$pearson<=0) + all_w1 <- base::setdiff(all_w1,w1) + } + if(!-1 %in% pearson_sign_use){ ## do not use the negative ones + w1 <- which(net_dat$pearson>=0) + all_w1 <- base::setdiff(all_w1,w1) + } + # nodes filter + all_possible_nodes <- base::unique(c(base::unique(net_dat$source),base::unique(net_dat$target))) ## original all possible + if(force_all_drivers==TRUE){ + w1 <- which(net_dat$source %in% all_possible_drivers) + all_w1 <- base::unique(c(all_w1,w1)) + all_possible_nodes <- base::unique(c(all_possible_nodes,all_possible_drivers)) + } + if(force_all_targets==TRUE){ + w1 <- which(net_dat$target %in% all_possible_targets) + all_w1 <- base::unique(c(all_w1,w1)) + all_possible_nodes <- base::unique(c(all_possible_nodes,all_possible_targets)) + } + message(sprintf('%d from %d edges are kept in the network !',base::length(all_w1),nrow(ori_net_dat))) + message(sprintf('%d nodes will be used to generate the igraph!',base::length(all_possible_nodes))) + # keep all genes in all* to be in the igraph + net_dat <- net_dat[which(net_dat$source %in% all_possible_drivers & net_dat$target %in% all_possible_targets),] ## filter by all + igraph_obj <- graph_from_data_frame(net_dat[,c('source','target')],directed=directed,vertices = all_possible_nodes) ## + if(weighted==TRUE & 'MI' %in% colnames(net_dat)) igraph_obj <- set_edge_attr(igraph_obj,'weight',index=E(igraph_obj),value=net_dat[,'MI']) + if(directed==TRUE & 'spearman' %in% colnames(net_dat)) igraph_obj <- set_edge_attr(igraph_obj,'sign',index=E(igraph_obj),value=sign(net_dat[,'spearman'])) + target_list <- get_net2target_list(net_dat) + return(list(network_dat=net_dat,target_list=target_list,igraph_obj=igraph_obj)) +} + + +#' QC Tables and Plots for Network Object +#' +#' \code{draw.network.QC} creates tables and plots for showing some basic statistics, +#' driver statistics and scale-free checking of the target network. +#' +#' @param igraph_obj igraph object, created by \code{get.SJAracne.network}. +#' @param outdir character, the output directory for saving QC tables and plots. +#' @param prefix character, the prefix of output QC figures names. Default is "". +#' @param directed logical, if TRUE, this network will be treated as a directed network. Default is TRUE. +#' @param weighted logical, if TRUE, this network will be treated as a weighted network. Default is FALSE. +#' @param generate_html logical, if TRUE, a html file will be created by R Markdown. +#' If FALSE, plots will be save as separated PDFs. +#' Default is TRUE. +#' @param html_info_limit logical, if TRUE, the statistics for network QC html will be limited. Default is TRUE. +#' @return Return a logical value. If TRUE, success in creating QC tables and plots. +#' +#' @examples +#' \dontrun{ +#' if(exists('analysis.par')==TRUE) rm(analysis.par) +#' network.dir <- sprintf('%s/demo1/network/',system.file(package = "NetBID2")) # use demo +#' network.project.name <- 'project_2019-02-14' # demo project name +#' project_main_dir <- 'test/' +#' project_name <- 'test_driver' +#' analysis.par <- NetBID.analysis.dir.create(project_main_dir=project_main_dir, +#' project_name=project_name, +#' network_dir=network.dir, +#' network_project_name=network.project.name) +#' analysis.par$tf.network <- get.SJAracne.network(network_file=analysis.par$tf.network.file) +#' analysis.par$out.dir.PLOT <- getwd() ## directory for saving the pdf files +#' draw.network.QC(analysis.par$tf.network$igraph_obj, +#' outdir=analysis.par$out.dir.QC,prefix='TF_net_') +#' } +#' @export +draw.network.QC <- function(igraph_obj,outdir=NULL,prefix="",directed=TRUE,weighted=FALSE,generate_html=TRUE,html_info_limit=TRUE){ + # + all_input_para <- c('igraph_obj','prefix','directed','weighted','generate_html','html_info_limit') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('directed',c(TRUE,FALSE),envir=environment()), + check_option('weighted',c(TRUE,FALSE),envir=environment()), + check_option('generate_html',c(TRUE,FALSE),envir=environment()), + check_option('html_info_limit',c(TRUE,FALSE),envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + if (!file.exists(outdir)) { + dir.create(outdir, recursive = TRUE) + message(paste0("The output directory: \"", outdir, "\" is created!")) + }else + message(paste0("The output will overwrite the files in directory: \"",outdir,"\"")) + if(class(igraph_obj)!='igraph'){ + message('Should input igraph object ! ');return(FALSE); + } + net <- igraph_obj + if(generate_html==TRUE){ + if(pandoc_available()==FALSE){ + stop('pandoc not available, please set Sys.setenv(RSTUDIO_PANDOC=$pandoc_installed_path), or set generate_html=FALSE') + } + directed <- directed + weighted <- weighted + output_rmd_file <- sprintf('%s/%snetQC.Rmd',outdir,prefix) + file.copy(from=system.file('Rmd/net_QC.Rmd',package = "NetBID2"),to=output_rmd_file) + rmarkdown::render(output_rmd_file, html_document(toc = TRUE)) + return(TRUE) + } + ### + deg <- igraph::degree(net,mode='out') + source_list <- names(deg)[which(deg>0)] + # + res_file <- sprintf('%s/%snetwork_info.pdf',outdir,prefix) + pdf(res_file,width=8,height=8); + # + par(mar=c(6,6,6,8)) + d_out <- igraph::degree(net,mode = 'all') + a <- graphics::hist(d_out,breaks = 20,xlab='Degree',cex.lab=1.2,cex.axis=1.2,cex.main=1.2, + main=sprintf('Density plot of degree distribution for all nodes \n (network node:%d, network edge:%d)',base::length(V(net)),base::length(E(net)))); + d1 <- stats::density(d_out) + mm <- base::max(a$counts)/base::max(d1$y);mm1 <- base::seq(0,base::max(d1$y),length.out = 5);mm2 <- format(mm1,scientific=TRUE,digits=3);mm2[1]<-'0'; + lines(x=d1$x,y=d1$y*mm,col=get_transparent('red',0.5),lwd=1.5) + graphics::axis(side=4,at=mm*mm1,labels=mm2,las=2); + graphics::mtext(side=4,line = 6,'Density',cex=1.2) + d_out <- igraph::degree(net,mode = 'out')[source_list] + a <- graphics::hist(d_out,breaks = 20,xlab='Degree',cex.lab=1.2,cex.axis=1.2,cex.main=1.2, + main=sprintf('Density plot of target size for all %s drivers \n (Size from %s to %s; mean Size: %s, median Size: %s )',base::length(source_list),base::min(d_out),base::max(d_out),format(base::mean(d_out),digits=5),stats::median(d_out))); + d1 <- stats::density(d_out) + mm <- base::max(a$counts)/base::max(d1$y);mm1 <- base::seq(0,base::max(d1$y),length.out = 5);mm2 <- format(mm1,scientific=TRUE,digits=3);mm2[1]<-'0'; + lines(x=d1$x,y=d1$y*mm,col=get_transparent('red',0.5),lwd=1.5) + graphics::axis(side=4,at=mm*mm1,labels=mm2,las=2); + graphics::mtext(side=4,line = 6,'Density',cex=1.2) + # + res1 <- check_scalefree(net) + dev.off() + return(TRUE) +} + +## functions to check the scale free feature of the network +check_scalefree <- function(igraph_obj) { + gr1 <- igraph_obj + fp1 <- igraph::degree_distribution(gr1) + dd <- as.data.frame(base::cbind(k = 1:base::max(igraph::degree(gr1)), pk = fp1[-1])) + dd$pk <- dd$pk + 1 / base::length(V(gr1)) + r2 <- + stats::lm(log10(dd$pk) ~ log10(dd$k)) + r3 <- summary(r2)$adj.r.squared + if(base::length(dd$k)>100) graphics::plot(pk ~ k,data = dd,log = 'xy',main = sprintf('R2:%s', format(r3,digits=4)),pch=16,col=get_transparent('dark grey',0.8),cex.lab=1.4,cex.axis=1.2) + if(base::length(dd$k)<=100) graphics::plot(pk ~ k,data = dd,log = 'xy',main = sprintf('R2:%s', format(r3,digits=4)),pch=16,col=get_transparent('black',0.8),cex.lab=1.4,cex.axis=1.2) + graphics::abline(a=r2$coefficients[1],b=r2$coefficients[2],col=get_transparent('red',0.5),lwd=2) + return(r3) +} + + +#' Prepare Data Files for Running SJARACNe +#' +#' \code{SJAracne.prepare} prepares data files for running SJAracne. SJARACNe is a scalable software tool for gene network reverse engineering from big data. +#' Detailed description and how to run SJARACNe can be found in its GitHub repository. +#' The usage of SJARACNe may be updated and the bash file generated by this function may not fit the version in use (not suit for SJAracne 0.2.0). +#' Please check \url{https://github.com/jyyulab/SJARACNe/} for details. +#' +#' @param eset an ExpressionSet class object, which contains the expression matrix. +#' @param use.samples a vector of characters, the list of sample used to run SJARACNe. +#' @param TF_list a vector of characters, the TF list. +#' @param SIG_list a vector of characters, the SIG list. +#' @param SJAR.main_dir character, the path to save the results generated by SJARACNe. +#' @param SJAR.project_name character, the project name used to label the output directory. +#' @param IQR.thre numeric, the IQR filter threshold to filter all non-driver genes. +#' @param IQR.loose_thre numeric, the IQR filter threshold to filter for all driver(TF/SIG) genes. +#' @param geneSymbol_column character, the column name in fdata(eset) which contains gene symbol. +#' If NULL, will use the main ID in exprs(eset) to fulfill the "geneSymbol" column. Default is NULL. +#' @param add_options additional option for running SJARACNe. +#' @examples +#' \dontrun{ +#' network.par <- list() +#' network.par$out.dir.DATA <- system.file('demo1','network/DATA/',package = "NetBID2") +#' NetBID.loadRData(network.par=network.par,step='exp-QC') +#' db.preload(use_level='gene',use_spe='human',update=FALSE) +#' use_gene_type <- 'external_gene_name' ## this should user-defined !!! +#' use_genes <- rownames(Biobase::fData(network.par$net.eset)) +#' use_list <- get.TF_SIG.list(use_genes,use_gene_type=use_gene_type) +#' #select sample for analysis +#' phe <- Biobase::pData(network.par$net.eset) +#' use.samples <- rownames(phe) ## use all samples, or choose to use some samples +#' prj.name <- network.par$project.name # can use other names, if need to run different use samples +#' network.par$out.dir.SJAR <- 'test' ## set the directory +#' SJAracne.prepare(eset=network.par$net.eset, +#' use.samples=use.samples, +#' TF_list=use_list$tf, +#' SIG_list=use_list$sig, +#' IQR.thre = 0.5,IQR.loose_thre = 0.1, +#' SJAR.project_name=prj.name, +#' SJAR.main_dir=network.par$out.dir.SJAR) +#' } +#' @export +SJAracne.prepare <- + function(eset,use.samples = rownames(Biobase::pData(eset)), + TF_list=NULL,SIG_list=NULL, + SJAR.main_dir='', + SJAR.project_name = "", + IQR.thre=0.5,IQR.loose_thre=0.1,add_options='',geneSymbol_column=NULL) { + # + all_input_para <- c('eset','use.samples','TF_list','SIG_list','SJAR.main_dir','SJAR.project_name','IQR.thre','IQR.loose_thre') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + SJAR.outdir <- file.path(SJAR.main_dir, SJAR.project_name) + if (!file.exists(SJAR.outdir)) { + dir.create(SJAR.outdir, recursive = TRUE) + } + SJAR.expression_matrix <- + file.path(SJAR.main_dir, SJAR.project_name, 'input.exp') + SJAR.hub_genes.tf <- + file.path(SJAR.main_dir, SJAR.project_name, 'tf.txt') + SJAR.hub_genes.sig <- + file.path(SJAR.main_dir, SJAR.project_name, 'sig.txt') + SJAR.bash_file.tf <- + file.path(SJAR.main_dir, SJAR.project_name, 'run_tf.sh') + SJAR.bash_file.sig <- + file.path(SJAR.main_dir, SJAR.project_name, 'run_sig.sh') + ## process exp matrix + d <- Biobase::exprs(eset)[, use.samples] + # filter genes with count=0 + d <- d[!apply(d == 0, 1, all), ] + # filter genes with IQR + choose1 <- IQR.filter(d, rownames(d),thre = IQR.thre,loose_thre=IQR.loose_thre,loose_gene=base::unique(c(TF_list,SIG_list))) + d <- d[choose1, ] + use.genes <- rownames(d) + use.genes <- use.genes[which(is.na(use.genes)==FALSE)] + w1 <- which(rownames(d) %in% use.genes) + d <- d[w1,] + use.genes <- rownames(d) + # write exp data to exp format + use.genes.symbol <- use.genes + if(is.null(geneSymbol_column)==FALSE){ + if(geneSymbol_column %in% colnames(Biobase::fData(eset))){use.genes.symbol <- Biobase::fData(eset)[use.genes,geneSymbol_column]} + } + use.genes <- clean_charVector(use.genes) + use.genes.symbol <- clean_charVector(use.genes.symbol) + expdata <- data.frame(base::cbind(isoformId = use.genes, geneSymbol = use.genes.symbol, d),stringsAsFactors=FALSE) + # + write.table( + expdata, + file = SJAR.expression_matrix, + sep = "\t", + row.names = FALSE, + quote = FALSE + ) + ## + cat(base::intersect(use.genes, TF_list),file = SJAR.hub_genes.tf,sep = '\n') + cat(base::intersect(use.genes, SIG_list),file = SJAR.hub_genes.sig,sep = '\n') + # write scripts to bash file for tf + network_project_name <- SJAR.project_name + tf_out_path <- SJAR.main_dir + tf_pj <- sprintf('%s_TF',network_project_name) + sig_out_path <- SJAR.main_dir + sig_pj <- sprintf('%s_SIG',network_project_name) + cmd_tf <- sprintf('sjaracne %s %s %s %s %s',tf_pj,SJAR.expression_matrix,SJAR.hub_genes.tf,tf_out_path,add_options) + cmd_sig <- sprintf('sjaracne %s %s %s %s %s',sig_pj,SJAR.expression_matrix,SJAR.hub_genes.sig,sig_out_path,add_options) + cat(cmd_tf,file = SJAR.bash_file.tf,sep = '\n') + cat(cmd_sig,file = SJAR.bash_file.sig,sep = '\n') + message(sprintf('The running command is: %s for TF, and the bash file is generated in %s.',cmd_tf,SJAR.bash_file.tf)) + message(sprintf('The running command is: %s for SIG, and the bash file is generated in %s.',cmd_sig,SJAR.bash_file.sig)) + message('The bash files only suit for SJAracne 0.1.0, if you are using other versions (e.g 0.2.0), please check the usage online and prepare the bash scripts by yourself !') + return(TRUE) + } + +#### +#' Calculate Differential Expression (DE) or Differential Activity (DA) by Using Bayesian Inference +#' +#' \code{bid} calculates the differential expression (DE) / differential activity (DA) by using Bayesian Inference method. +#' Users can choose different regression models and pooling strategies. +#' +#' It is a core function inside \code{getDE.BID.2G}. +#' This function allows users to have access to more options when calculating the statistics using Bayesian Inference method. +#' In some cases, the input expression matrix could be at probe/transcript level, but DE/DA calculated at gene level is expected. +#' By setting pooling strategy, users can successfully solve the special cases. +#' The P-value is estimated by the posterior distribution of the coefficient. +#' +#' @param mat matrix, the expression/activity matrix of IDs (gene/transcript/probe) from one gene. Rows are IDs, columns are samples. +#' It is strongly suggested to contain rownames of IDs and column names of samples. Example, geneA has two probes A1 and A2 across all 6 samples (Case-rep1, Case-rep2, Case-rep3, Control-rep1, Control-rep2 and Control-rep3). +#' The \code{mat} of geneA is a 2*6 numeric matrix. Likewise, if geneA has only one probe, the \code{mat} is a one-row matrix. +#' @param use_obs_class a vector of characters, the category of sample. +#' If the vector names are not available, the order of samples in \code{use_obs_class} must be the same as in \code{mat}. +#' Users can call \code{get_obs_label} to create this vector. +#' @param class_order a vector of characters, the order of the sample's category. +#' The first class in this vector will be considered as the control group by default. +#' If NULL, the order will be assigned using alphabetical order. Default is NULL. +#' @param class_ordered logical, if TRUE, the \code{class_order} will be ordered. And the order must be consistent with the phenotypic trend, +#' such as "low", "medium", "high". Default is TRUE. +#' @param method character, users can choose between "MLE" and "Bayesian". +#' "MLE", the maximum likelihood estimation, will call generalized linear model(glm/glmer) to perform data regression. +#' "Bayesian", will call Bayesian generalized linear model (bayesglm) or multivariate generalized linear mixed model (MCMCglmm) to perform data regression. +#' Default is "Bayesian". +#' @param family character or family function or the result of a call to a family function. +#' This parameter is used to define the model's error distribution. See \code{?family} for details. +#' Currently, options are gaussian, poisson, binomial(for two-group sample classes)/category(for multi-group sample classes)/ordinal(for multi-group sample classes with class_ordered=TRUE). +#' If set with gaussian or poission, the response variable in the regression model will be the expression level, and the independent variable will be the sample's phenotype. +#' If set with binomial, the response variable in the regression model will be the sample phenotype, and the independent variable will be the expression level. +#' For binomial, category and ordinal input, the family will be automatically reset, based on the sample's class level and the setting of \code{class_ordered}. +#' Default is gaussian. +#' @param pooling character, users can choose from "full","no" and "partial". +#' "full", use probes as independent observations. +#' "no", use probes as independent variables in the regression model. +#' "partial", use probes as random effect in the regression model. +#' Default is "full". +#' @param prior.V.scale numeric, the V in the parameter "prior" used in \code{MCMCglmm}. +#' It is meaningful to set when one choose "Bayesian" as method and "partial" as pooling. +#' Default is 0.02. +#' @param prior.R.nu numeric, the R-structure in the parameter "prior" used in \code{MCMCglmm}. +#' It is meaningful to set when one choose "Bayesian" as method and "partial" as pooling. +#' Default is 1. +#' @param prior.G.nu numeric, the G-structure in the parameter "prior" used in \code{MCMCglmm}. +#' It is meaningful to set when one choose "Bayesian" as method and "partial" as pooling. +#' Default is 2. +#' @param nitt numeric, the parameter "nitt" used in \code{MCMCglmm}. +#' It is meaningful to set when one choose "Bayesian" as method and "partial" as pooling. +#' Default is 13000. +#' @param burnin numeric, the parameter "burnin" used in \code{MCMCglmm}. +#' It is meaningful to set when one choose "Bayesian" as method and "partial" as pooling. +#' Default is 3000. +#' @param thin numeric, the parameter "thin" used in \code{MCMCglmm}. +#' It is meaningful to set when one choose "Bayesian" as method and "partial" as pooling. +#' Default is 10. +#' @param std logical, if TRUE, the expression matrix will be normalized by column. Default is TRUE. +#' @param logTransformed logical, if TRUE, log transformation will be performed. Default is TRUE. +#' @param log.base numeric, the base of log transformation when \code{do.logtransform} is set to TRUE. Default is 2. +#' @param average.method character, the method applied to calculate FC (fold change). Users can choose between "geometric" and "arithmetic". +#' Default is "geometric". +#' @param pseudoCount integer, the integer added to avoid "-Inf" showing up during log transformation in the FC (fold change) calculation. +#' @param return_model logical, if TRUE, the regression model will be returned; Otherwise, just return basic statistics from the model. Default is FALSE. +#' @param use_seed integer, the random seed. Default is 999. +#' @param verbose logical, if TRUE, print out additional information during calculation. Default is FALSE. +#' +#' @return Return a one-row data frame with calculated statistics for one gene/gene set if \code{return_model} is FALSE. +#' Otherwise, the regression model will be returned. +#' +#' @examples +#' mat <- matrix(c(0.50099,1.2108,1.0524,-0.34881,-0.13441,-0.87112, +#' 1.84579,2.0356,2.6025,1.62954,1.88281,1.29604), +#' nrow=2,byrow=TRUE) +#' rownames(mat) <- c('A1','A2') +#' colnames(mat) <- c('Case-rep1','Case-rep2','Case-rep3', +#' 'Control-rep1','Control-rep2','Control-rep3') +#' res1 <- bid(mat=mat, +#' use_obs_class = c(rep('Case',3),rep('Control',3)), +#' class_order = c('Control','Case')) +#' \dontrun{ +#' } +#' @export +bid <- function(mat=NULL,use_obs_class=NULL,class_order=NULL,class_ordered=TRUE, + method='Bayesian',family=gaussian,pooling='full', + prior.V.scale=0.02,prior.R.nu=1,prior.G.nu=2,nitt = 13000,burnin =3000,thin=10, + std=TRUE,logTransformed=TRUE,log.base=2, + average.method='geometric',pseudoCount=0,return_model=FALSE,use_seed=999,verbose=FALSE){ + #check input + # + all_input_para <- c('mat','use_obs_class','class_order','class_ordered', + 'method','family','pooling', + 'prior.V.scale','prior.R.nu','prior.G.nu', + 'nitt','burnin','thin','std','log.base', + 'logTransformed','average.method','pseudoCount', + 'return_model','use_seed','verbose') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + check_res <- c(check_option('class_ordered',c(TRUE,FALSE),envir=environment()), + check_option('std',c(TRUE,FALSE),envir=environment()), + check_option('logTransformed',c(TRUE,FALSE),envir=environment()), + check_option('return_model',c(TRUE,FALSE),envir=environment()), + check_option('verbose',c(TRUE,FALSE),envir=environment()), + check_option('method',c('Bayesian','MLE'),envir=environment()), + check_option('pooling',c('full','no','partial'),envir=environment()), + check_option('average.method',c('arithmetic','geometric'),envir=environment()) + ) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + # + use_obs_class <- clean_charVector(use_obs_class) + # + if (is.character(family)){ + if(family %in% c('category','ordinal')) family <- 'binomial' ## use automatically judging + if (!family %in% c('gaussian','binomial', 'poisson')) { + message("Only gaussian,poisson, and binomial/category/ordinal are supported, please check and re-try!");return(FALSE); + } + family <- get(family, mode = "function", envir = parent.frame()) + } + if (is.function(family)) + family <- family() + if (is.character(family)) + family <- get(family, mode = "function", envir = parent.frame()) + if (is.function(family)) + family <- family() + if (!family$family %in% c('gaussian','binomial', 'poisson')) { + message("Only gaussian,poisson, and binomial/category/ordinal are supported, please check and re-try!");return(FALSE); + } + method<-match.arg(method) + #sample name + if(is.null(colnames(mat))==TRUE & is.null(names(use_obs_class))==TRUE){ + colnames(mat) <- paste0('Sample',1:ncol(mat)) + names(use_obs_class) <- paste0('Sample',1:ncol(mat)) + } + if(is.null(colnames(mat))==TRUE & is.null(names(use_obs_class))==FALSE){ + colnames(mat) <- names(use_obs_class) + } + if(is.null(colnames(mat))==FALSE & is.null(names(use_obs_class))==TRUE){ + names(use_obs_class) <- colnames(mat) + } + use_obs_class <- use_obs_class[colnames(mat)] + ##check sample class + class_order <- base::intersect(class_order,use_obs_class) + if(base::length(class_order)<=1){message('Sample class order is smaller than two classes, please check and re-try!');return(FALSE);} + if(verbose==TRUE) message(sprintf('%d sample classes will be used in calculation and %s will be treated as control',base::length(class_order),class_order[1])) + ##generate comp + comp <- factor(use_obs_class,levels=class_order) + ##check input matrix + #remove NAs + d <- mat + nna<-apply(!is.na(d),2,all) + if(dim(d)[1]==1){ d <- t(as.matrix(d[,nna])); rownames(d) <- rownames(mat) }else{ d <- d[,nna]} + ##generate data frame + d <- reshape::melt(t(d)) + dat <- data.frame(response=d$value,treatment=rep(comp,base::length(base::unique(d$X2))),probe=d$X2) + ##calculate FC + n.levels<-base::length(base::unique(dat$probe)) ## + n.treatments<-base::length(base::unique(dat$treatment)) ## 2grp or mgrp + AveExpr<-base::mean(dat$response) + if(n.treatments==2){ + FC.val<-FC(dat$response,dat$treatment, + logTransformed=logTransformed,log.base=log.base, + average.method=average.method,pseudoCount=pseudoCount) + res<-c(FC=FC.val,AveExpr=AveExpr,n.levels=as.integer(n.levels)) + }else{ + res<-c(AveExpr=AveExpr,n.levels=as.integer(n.levels)) + } + ##re-strandarize the input data + if(std==TRUE & family$family=='Poisson'){ + message('negative values not allowed for the Poisson family, please set std=FALSE and re-try!');return(FALSE) + } + if(std==TRUE & sd(dat$response)>0) + dat$response<-0.5*(dat$response-base::mean(dat$response))/sd(dat$response) + + if(class_ordered==TRUE) dat$treatment <- as.ordered(dat$treatment) ## for multi-groups + if(n.treatments==2) class_ordered=FALSE + ## main part !!! + # inner functions + get_info_model<-function(M,dat,method=NULL,df_sta=NULL){ + sum.tmp<-summary(M) + if('summary.glm' %in% class(sum.tmp) | 'summary.clm' %in% class(sum.tmp)){ + if('summary.glm' %in% class(sum.tmp) ) w1 <- grep('treatment|response',rownames(sum.tmp$coef)) + if('summary.clm' %in% class(sum.tmp) ) w1 <- grep('\\||response',rownames(sum.tmp$coef)) + if(is.null(df_sta)==TRUE){ + if(method=='MLE') df_sta<-sum.tmp$df.residual else df_sta <- summary(M)$df.residual-summary(M)$df[1] ## ??? + } + if('z value' %in% colnames(sum.tmp$coef)){ ### + z_sta <- sum.tmp$coef[w1,'z value'] + p_sta <- sum.tmp$coef[w1,4] + t_sta <- NA + } else{ + t_sta <- sum.tmp$coef[w1,'t value'] + if(base::length(grep('^Pr',colnames(sum.tmp$coef)))>0){ + p_sta <- sum.tmp$coef[w1,grep('^Pr',colnames(sum.tmp$coef))] + }else{ + p_sta<-2*pt(abs(t_sta),lower.tail=FALSE,df=df_sta) + } + z_sta<-sign(t_sta)*abs(qnorm(p_sta/2)) + } + if(verbose==TRUE) print(sum.tmp$coef) + rs<-c(t=t_sta,'P.Value'=p_sta,'Z-statistics'=z_sta) + } + if('summary.merMod' %in% class(sum.tmp)){ + if(verbose==TRUE) print(sum.tmp$coef) + w1 <- grep('treatment|response',rownames(sum.tmp$coef)) + t_sta<-sum.tmp$coef[w1,3] + if(is.null(df_sta)==TRUE) df_sta<-as.numeric(sum.tmp$devcomp$dims['n']-sum.tmp$devcomp$dims['p']-sum.tmp$devcomp$dims['q']) + p_sta<-2*pt(abs(t_sta),lower.tail=FALSE,df=df_sta) + z_sta<-sign(t_sta)*abs(qnorm(p_sta/2)) + rs<-c(t=t_sta,'P.Value'=p_sta,'Z-statistics'=z_sta) + } + if('summary.MCMCglmm' %in% class(sum.tmp)){ + if(verbose==TRUE) print(sum.tmp$sol) + t_sta<-sum.tmp$sol[2,1]/sd(M$Sol[,2]) ## t + p_sta<-2*pt(-abs(t_sta),df=df_sta) + if(is.na(p_sta)) p_sta<-sum.tmp$sol[2,5] + z_sta<-sign(t_sta)*abs(qnorm(p_sta/2)) + rs<-c(t=t_sta,'P.Value'=p_sta,'Z-statistics'=z_sta) + } + return(rs) + } + ## check sd + rs_NA <- c(t=0,'P.Value'=1,'Z-statistics'=0) + if(sd(dat$response)==0){rs <- rs_NA; return(rs);} + df_sta <- NULL + ################### in total: 2(MLE/Bayesian)*3(family)*3(pooling)*2(n.levels)=36 *2(random_effect)=72 + ################### MLE, 6 conditions + #### gaussian/poisson + if(method=='MLE' & family$family %in% c('gaussian','poisson') & (n.levels==1 | pooling=='full')){ ## n.levels==1/full + if(class_ordered==TRUE){message('If need to get p-value between each group, try to set family to "ordinial"!');} + M <- stats::glm(response ~ treatment, data=dat, family=family) + } + if(method=='MLE' & family$family %in% c('gaussian','poisson') & n.levels>1 & pooling=='no'){ + if(class_ordered==TRUE){message('If need to get p-value between each group, try to set family to "ordinial"!');} + M <- stats::glm(response ~ treatment + probe, data=dat,family=family) + } + if(method=='MLE' & family$family %in% c('gaussian','poisson') & n.levels>1 & pooling=='partial'){ + if(class_ordered==TRUE){message('If need to get p-value between each group, try to set family to "ordinial"!');} + M <- lme4::lmer(response ~ treatment + (treatment + 1 | probe), data=dat) + } + #### binomial + if(method=='MLE' & family$family=='binomial' & (n.levels==1 | pooling=='full')){ ## n.levels==1 + if(class_ordered==FALSE) M <- stats::glm(treatment ~ response, data=dat, family=family) + if(class_ordered==TRUE) M <- ordinal::clm(treatment ~ response, data=dat) + } + if(method=='MLE' & family$family=='binomial' & n.levels>1 & pooling=='no'){ + if(class_ordered==FALSE) M <- stats::glm(treatment ~ response + probe, data=dat,family=family) + if(class_ordered==TRUE) M <- ordinal::clm(treatment ~ response + probe, data=dat) + } + if(method=='MLE' & family$family=='binomial' & n.levels>1 & pooling=='partial'){ + if(class_ordered==FALSE) M <- lme4::lmer(treatment ~ response + (response + 1 | probe), data=dat,family=family) + if(class_ordered==TRUE){ + if(base::length(levels(dat$probe))<3){message('Random-effect terms has less than three levels, treat as fix effect!');M <- ordinal::clm(treatment ~ response + probe, data=dat)} + if(base::length(levels(dat$probe))>=3) M <- ordinal::clmm(treatment ~ response + (response+1|probe), data=dat) ## clmm must have grouping factor larger than 3 + } + } + ################### Bayesian, 8 conditions + set.seed(use_seed) + if(family$link=='logit'){prior.scale<-2.5;glmm.family<-'categorial';} + if(family$link=='probit'){prior.scale<-2.5*1.6;glmm.family<-'ordinal';} + #### gaussian/poisson + if(method=='Bayesian' & family$family %in% c('gaussian','poisson') & (pooling=='full' | pooling=='no' & n.levels==1)){ + if(class_ordered==TRUE){message('If need to get p-value between each group, try to set family to "ordinial"!');} + M <- arm::bayesglm(response ~ treatment, data=dat, family=family) + } + if(method=='Bayesian' & family$family %in% c('gaussian','poisson') & n.levels>1 & pooling=='no'){ + if(class_ordered==TRUE){message('If need to get p-value between each group, try to set family to "ordinial"!');} + M <- arm::bayesglm(response ~ treatment + probe, data=dat,family=family) + } + # + prior<-list(R = list(V = prior.V.scale, nu=prior.R.nu)) ## default prior + if(method=='Bayesian' & family$family %in% c('gaussian','poisson') & n.levels==1 & pooling=='partial'){ + if(class_ordered==TRUE){message('If need to get p-value between each group, try to set family to "ordinial"!');} + M <- MCMCglmm::MCMCglmm(response ~ treatment, data=dat, prior=prior,verbose=FALSE, nitt=nitt, burnin = burnin,thin=thin,family=family$family) + df_sta<-nrow(dat)-n.treatments + } + if(method=='Bayesian' & family$family %in% c('gaussian','poisson') & n.levels>1 & pooling=='partial'){ + prior<-list(R = list(V = prior.V.scale, nu=prior.R.nu), G = list(G1 = list(V = diag(n.treatments)*prior.V.scale, nu = prior.G.nu))) + if(class_ordered==TRUE){message('If need to get p-value between each group, try to set family to "ordinial"!');} + M <- MCMCglmm::MCMCglmm(response ~ treatment, random=~idh(treatment+1):probe, data=dat, prior=prior,verbose=FALSE, nitt=nitt, burnin = burnin,thin=thin,family=family$family) + df_sta <-nrow(dat)-(n.levels+1)*n.treatments + } + #### binomial + if(method=='Bayesian' & family$family=='binomial' & (pooling=='full' | pooling=='no' & n.levels==1)){ ### + if(class_ordered==FALSE) M <- arm::bayesglm(treatment ~ response, data=dat, family=family, prior.scale = prior.scale) + if(class_ordered==TRUE){ + M <- MCMCglmm::MCMCglmm(treatment ~ response, data=dat, family='ordinal',prior=prior,verbose=FALSE, nitt=nitt, burnin = burnin,thin=thin) + df_sta<-nrow(dat)-2 + } + } + if(method=='Bayesian' & family$family=='binomial' & n.levels>1 & pooling=='no'){ ### + if(class_ordered==FALSE) M <- arm::bayesglm(treatment ~ response + probe, data=dat, family=family,prior.scale = prior.scale) + if(class_ordered==TRUE){ + M <- MCMCglmm::MCMCglmm(treatment ~ response + probe, data=dat, family='ordinal',prior=prior,verbose=FALSE, nitt=nitt, burnin = burnin,thin=thin) + df_sta <-nrow(dat)-(n.levels+1)*2 + } + } + # for partial + if(method=='Bayesian' & family$family=='binomial' & n.levels==1 & pooling=='partial'){ + if(family$link=='logit'){prior<-list(R = list(V = prior.V.scale, nu=n.levels))} + if(family$link=='probit'){prior<-list(R = list(V = prior.V.scale, nu=n.levels+1))} + if(!family$link %in% c('probit','logit')){message('For partial pooling with Binomial family and Bayeisan method, only logit and probit model are supported!');return(FALSE)} + if(class_ordered==FALSE) M <- MCMCglmm::MCMCglmm(treatment~response,family=glmm.family, prior=prior, data=dat, verbose=FALSE, nitt = nitt, burnin = burnin,thin=thin) + if(class_ordered==TRUE) M <- MCMCglmm::MCMCglmm(treatment~response,family='ordinal', prior=prior, data=dat, verbose=FALSE, nitt = nitt, burnin = burnin,thin=thin) + df_sta<-nrow(dat)-2 + } + if(method=='Bayesian' & family$family=='binomial' & n.levels>1 & pooling=='partial'){ + if(family$link=='logit'){prior<-list(R = list(V = prior.V.scale, nu=n.levels), G = list(G1 = list(V = diag(2)*prior.V.scale, nu = n.levels+1)))} + if(family$link=='probit'){prior<-list(R = list(V = prior.V.scale, nu=n.levels+1), G = list(G1 = list(V = diag(2)*prior.V.scale, nu = n.levels+1)))} + if(!family$link %in% c('probit','logit')){message('For partial pooling with Binomial family and Bayeisan method, only logit and probit model are supported!');return(FALSE)} + if(class_ordered==FALSE) M <- MCMCglmm::MCMCglmm(treatment~response, random=~idh(1+response):probe,family=glmm.family, prior=prior, data=dat, verbose=FALSE, nitt = nitt, burnin = burnin,thin=thin) + if(class_ordered==TRUE) M <- MCMCglmm::MCMCglmm(treatment~response, random=~idh(1+response):probe,family='ordinal', prior=prior, data=dat, verbose=FALSE, nitt = nitt, burnin = burnin,thin=thin) + df_sta <-nrow(dat)-(n.levels+1)*2 + } + ## + if(return_model==TRUE) return(M) + rs <- get_info_model(M=M,dat=dat,method=method,df_sta=df_sta); rs <- c(res,rs) + return(rs) +} + +## +#fold change function, inner function +#first class is 0. +#positive: class1/class0 +#negative: class0/class1 +FC <- function(x,cl,logTransformed = TRUE, + log.base = 2,average.method = c('geometric', 'arithmetic'), + pseudoCount = 0) { + x.class0 <- x[(cl == levels(cl)[1])] + pseudoCount + x.class1 <- x[(cl == levels(cl)[2])] + pseudoCount + if (missing(average.method)) + average.method <- 'geometric' + if (logTransformed) { + if (is.na(log.base) | log.base < 0) + stop('Please specify log.base !\n') + logFC <- base::mean(x.class1) - base::mean(x.class0) + FC.val <- sign(logFC) * log.base ^ abs(logFC) + } else{ + logFC <- + ifelse(average.method == 'arithmetic', + log(base::mean(x.class1)) - log(base::mean(x.class0)), + base::mean(log(x.class1) - base::mean(log(x.class0)))) + FC.val <- sign(logFC) * exp(abs(logFC)) + } + FC.val[FC.val == 0 | is.na(FC.val)] <- 1 + FC.val +} +################## internal function for graphic +par.pos2inch <- function(){ + user.range <- par("usr")[c(2,4)] - par("usr")[c(1,3)] + region.pin <- par("pin") + return(region.pin/user.range) +} +par.inch2pos <- function(){return(1/par.pos2inch())} +par.char2inch <- function(){return(par()$cin)} ## letter W +par.devSize2xypos <- function(){ + pp <- par()$usr + rr <- par("din")/par.pos2inch() + mm <- c(pp[1]/2+pp[2]/2,pp[3]/2+pp[4]/2) + xyp <- c(mm-(rr/2),mm+(rr/2)) + xyp <- xyp[c(1,3,2,4)] + return(xyp) +} +par.lineHeight2inch <- function(){ + lheight <- par()$lheight + y1 <- par.char2inch()[2]*lheight ## line height in inches + y1 +} +par.char2pos <- function(){par()$cxy} +strheightMod <- function(s, units = "inch", cex = 1,ori=TRUE,mod=FALSE){ + if(ori==TRUE) return(strheight(s=s,units=units,cex=cex)) + if(units=='user') return(par.char2pos()[2]*cex) + if(units=='inch' | units=='inches') return(par.char2inch()[2]*cex) +} +strwidthMod <- function(s, units = "inch", cex = 1,ori=TRUE,mod=FALSE){ + if(ori==TRUE) return(strwidth(s=s,units=units,cex=cex)) + if(mod==TRUE){ + plot.new() + rt <- strwidth(s,units=units)/strwidth('W',units=units); rt <- ceiling(rt) + if(units=='user') r1 <- par.char2pos()[1]*cex*rt + if(units=='inch') r1 <- par.char2inch()[1]*cex*rt + dev.off(); return(r1) + }else{ + if(units=='user') return(par.char2pos()[1]*cex*nchar(s)) + if(units=='inch'| units=='inches') return(par.char2inch()[1]*cex*nchar(s)) + } +} + +## +#' Lazy mode for NetBID2 result visualization +#' +#' \code{NetBID.lazyMode.DriverVisualization} is an integrated function to draw visualization plots for top drivers. +#' +#' User need to strictly follow the NetBID2 pipeline to get the complicated list object analysis.par. +#' "intgroup", "use_comp" should be specified; +#' "transfer_tab" could be set to NULL with "main_id_type" specified, but it is suggested to input by hand if available. +#' +#' @param analysis.par list, stores all related datasets from driver analysis step. +#' @param intgroup character, one interested phenotype group from the \code{analysis.par$cal.eset}. +#' @param use_comp character, the name of the comparison of interest, should be included in the colnames of \code{analysis.par$DA} and \code{analysis.par$DE}. +#' @param main_id_type character, the type of driver's ID. It comes from the attribute name in biomaRt package. +#' Such as "ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id", "ensembl_transcript_id_version" or "refseq_mrna". +#' For details, user can call \code{biomaRt::listAttributes()} to display all available attributes in the selected dataset. +#' @param transfer_tab data.frame, the ID conversion table. Users can call \code{get_IDtransfer} to get this table. +#' @param use_gs a vector of characters, names of major gene set collections. Users can call \code{all_gs2gene_info} to see all the available collections. +#' Default is c("H", "CP:BIOCARTA", "CP:REACTOME", "CP:KEGG"). +#' @param min_Size numeric, minimum size for the target genes. Default is 30. +#' @param max_Size numeric, maximum size for the target genes. Default is 1000. +#' @param top_number number for the top significant genes/drivers in the combine results to be displayed on the plot. +#' Default is 30. +#' @param top_strategy character, choose from "Both", "Up", "Down". +#' If set to "Both", top drivers with highest absolute Z statistics will be displayed. +#' If set to "Up", only top up-regulated drivers will be displayed. +#' If set to "Down", only top down-regulated drivers will be displayed. +#' Default is "Both". +#' @param logFC_thre numeric, the threshold of logFC. Genes or drivers with absolute logFC value higher than the threshold will be kept. +#' Default is 0.05. +#' @param Pv_thre numeric, the threshold of P-values. Genes or drivers with P-values lower than the threshold will be kept. +#' Default is 0.05. +#' +#' @examples +#' \dontrun{ +#' analysis.par <- list() +#' analysis.par$out.dir.DATA <- system.file('demo1','driver/DATA/',package = "NetBID2") +#' NetBID.loadRData(analysis.par=analysis.par,step='ms-tab') +#' analysis.par$out.dir.PLOT <- 'test/' +#' NetBID.lazyMode.DriverVisualization(analysis.par=analysis.par, +#' intgroup='subgroup',use_comp='G4.Vs.others', +#' transfer_tab=analysis.par$transfer_tab, +#' logFC_thre=0.2,Pv_thre=1e-4) +#' } +#' @export +NetBID.lazyMode.DriverVisualization <- function(analysis.par=NULL,intgroup=NULL,use_comp=NULL, + main_id_type='external_gene_name', + transfer_tab=NULL,use_gs=c("H", "CP:BIOCARTA", "CP:REACTOME", "CP:KEGG"), + min_Size=30,max_Size=1000,top_number=30,top_strategy='Both', + logFC_thre=0.05,Pv_thre=0.05){ + all_input_para <- c('analysis.par','intgroup','use_comp','main_id_type', + 'min_Size','max_Size','top_number','top_strategy','logFC_thre','Pv_thre') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + if('final_ms_tab' %in% names(analysis.par)){ + ms_tab <- analysis.par$final_ms_tab + }else{ + message('final_ms_tab not included in the analysis.par list, please check and re-try!');return(FALSE) + } + if('cal.eset' %in% names(analysis.par)){ + exp_mat <- Biobase::exprs(analysis.par$cal.eset) + }else{ + message('cal.eset not included in the analysis.par list, please check and re-try!');return(FALSE) + } + if('merge.ac.eset' %in% names(analysis.par)){ + ac_mat <- Biobase::exprs(analysis.par$merge.ac.eset) + }else{ + message('merge.ac.eset not included in the analysis.par list, please check and re-try!');return(FALSE) + } + if(!'DE' %in% names(analysis.par)){ + message('DE not included in the analysis.par list, please check and re-try!');return(FALSE) + } + if(!'DA' %in% names(analysis.par)){ + message('DA not included in the analysis.par list, please check and re-try!');return(FALSE) + } + if(!use_comp %in% names(analysis.par$DA)){ + message(sprintf('%s not included in the analysis.par$DA list, please check and re-try!',use_comp));return(FALSE) + } + if(!use_comp %in% names(analysis.par$DE)){ + message(sprintf('%s not included in the analysis.par$DE list, please check and re-try!',use_comp));return(FALSE) + } + if(!intgroup %in% colnames(Biobase::pData(analysis.par$cal.eset))){ + message(sprintf('%s not included in the Biobase::pData(analysis.par$cal.eset), please check and re-try!',intgroup));return(FALSE) + } + if(exists('db_info')==FALSE){ + message('Please run db.preload() first to load in db_info !');return(FALSE) + } + if(is.null(transfer_tab)==TRUE & 'transfer_tab' %in% names(analysis.par)){ + transfer_tab <- analysis.par$transfer_tab + } + use_genes = base::unique(c(analysis.par$final_ms_tab$originalID, + analysis.par$merge.network$network_dat$target)) + if(is.null(transfer_tab)==TRUE){ + message('Begin get transfer table at gene level (for gene function enrichment analysis )') + transfer_tab <- get_IDtransfer2symbol2type(from_type=main_id_type,use_genes = use_genes,use_level = 'gene',ignore_version = TRUE) + }else{ + if(!'external_gene_name' %in% colnames(transfer_tab) & !'external_transcript_name' %in% colnames(transfer_tab)){ + message('Begin get transfer table at gene level (for gene function enrichment analysis )') + transfer_tab <- get_IDtransfer2symbol2type(from_type=main_id_type,use_genes = use_genes,use_level = 'gene',ignore_version = TRUE) + } + if(!'external_gene_name' %in% colnames(transfer_tab) & 'external_transcript_name' %in% colnames(transfer_tab)){ + transfer_tab$external_gene_name <- gsub('(.*)-.*','\\1',transfer_tab$external_transcript_name) + } + } + print(str(transfer_tab)) + if(exists('all_gs2gene')==FALSE){ + message('Please run gs.preload() first to load in all_gs2gene or prepare the same format of this object!');return(FALSE) + } + phe <- Biobase::pData(analysis.par$cal.eset) + DE <- analysis.par$DE[[use_comp]];DA <- analysis.par$DA[[use_comp]] + G1 <- gsub('Ave.(.*)','\\1',colnames(DE)[grep('^Ave\\.',colnames(DE))])[2] + G0 <- gsub('Ave.(.*)','\\1',colnames(DE)[grep('^Ave\\.',colnames(DE))])[1] + # + w1 <- which(ms_tab$Size>=min_Size & ms_tab$Size<=max_Size) + message(sprintf('%s out of %s drivers passed the size filteration!',base::length(w1),nrow(ms_tab))) + if(length(w1)==0){ + message('No drivers passed, will not use size filteration !'); + }else{ + ms_tab <- ms_tab[w1,] + } + print('Begin Volcano plot by draw.volcanoPlot() ') + res1 <- draw.volcanoPlot(dat=ms_tab,label_col = 'gene_label', + logFC_col = sprintf('logFC.%s_DA',use_comp),Pv_col = sprintf('P.Value.%s_DA',use_comp), + logFC_thre = logFC_thre,Pv_thre = Pv_thre, + pdf_file=sprintf('%s/%s_Volcano.pdf',analysis.par$out.dir.PLOT,use_comp)) + print('Finish Volcano plot ') + if(top_strategy=='Up') res1 <- res1[which(res1[,sprintf('logFC.%s_DA',use_comp)]>0),,drop=FALSE] + if(top_strategy=='Down') res1 <- res1[which(res1[,sprintf('logFC.%s_DA',use_comp)]<0),,drop=FALSE] + driver_list <- rownames(res1) + driver_DE_Z <- ms_tab[driver_list,sprintf('Z.%s_DE',use_comp)] + driver_DA_Z <- ms_tab[driver_list,sprintf('Z.%s_DA',use_comp)] + if(base::length(driver_list)>=top_number){ + driver_list <- driver_list[order(abs(driver_DA_Z),decreasing = TRUE)[1:top_number]] + } + driver_DE_Z <- ms_tab[driver_list,sprintf('Z.%s_DE',use_comp)];names(driver_DE_Z) <- driver_list + driver_DA_Z <- ms_tab[driver_list,sprintf('Z.%s_DA',use_comp)];names(driver_DA_Z) <- driver_list + if(nrow(res1)<5){ + message(sprintf('%s drivers remained, too few for draw top drivers, will skip plots for top drivers !',nrow(res1))); + }else{ + message(sprintf('%s drivers (%s) will be displayed!',base::length(driver_list),base::paste(driver_list,collapse=';'))) + print('Begin TopDA_GSEA plot by draw.GSEA.NetBID() ') + draw.GSEA.NetBID(DE=DE,profile_col = 'logFC',name_col = 'ID', + driver_list = driver_list,show_label=ms_tab[driver_list,'gene_label'], + driver_DE_Z = driver_DE_Z, + driver_DA_Z = driver_DA_Z, + target_list = analysis.par$merge.network$target_list, + pdf_file=sprintf('%s/%s_TopDA_GSEA.pdf',analysis.par$out.dir.PLOT,use_comp), + top_driver_number = top_number,main=use_comp) + print(sprintf('Finish TopDA_GSEA plot by draw.GSEA.NetBID(), please check %s',sprintf('%s/%s_TopDA_GSEA.pdf',analysis.par$out.dir.PLOT,use_comp))) + print('Begin TopDA_Heatmap_ac plot by draw.heatmap() ') + draw.heatmap(mat=ac_mat,use_genes = ms_tab[driver_list,'originalID_label'],use_gene_label = ms_tab[driver_list,'gene_label'], + phenotype_info = Biobase::pData(analysis.par$merge.ac.eset),use_phe=intgroup, + pdf_file=sprintf('%s/%s_TopDA_Heatmap_ac.pdf',analysis.par$out.dir.PLOT,use_comp),scale='row') + print(sprintf('Finish TopDA_Heatmap_ac plot by draw.heatmap(), please check %s',sprintf('%s/%s_TopDA_Heatmap_ac.pdf',analysis.par$out.dir.PLOT,use_comp))) + print('Begin TopDA_Heatmap_exp plot by draw.heatmap() ') + draw.heatmap(mat=exp_mat,use_genes = ms_tab[driver_list,'originalID'],use_gene_label = ms_tab[driver_list,'geneSymbol'], + phenotype_info = Biobase::pData(analysis.par$merge.ac.eset),use_phe=intgroup, + pdf_file=sprintf('%s/%s_TopDA_Heatmap_exp.pdf',analysis.par$out.dir.PLOT,use_comp),scale='row') + print(sprintf('Finish TopDA_Heatmap_exp plot by draw.heatmap(), please check %s',sprintf('%s/%s_TopDA_Heatmap_exp.pdf',analysis.par$out.dir.PLOT,use_comp))) + print('Begin TopDA_FuncEnrich plot by funcEnrich.Fisher() and draw.funcEnrich.cluster()') + res1 <- funcEnrich.Fisher(input_list = ms_tab[driver_list,'geneSymbol'],bg_list = ms_tab$geneSymbol, + Pv_thre = 0.1,use_gs=use_gs,Pv_adj = 'none') + out2excel(res1,out.xlsx = sprintf('%s/%s_TopDA_FuncEnrich.xlsx',analysis.par$out.dir.PLOT,use_comp)) + print(sprintf('Finish TopDA_FuncEnrich plot by funcEnrich.Fisher(), please check %s', + sprintf('%s/%s_TopDA_FuncEnrich.xlsx',analysis.par$out.dir.PLOT,use_comp))) + if(nrow(res1)>=3){ + draw.funcEnrich.cluster(res1,pdf_file=sprintf('%s/%s_TopDA_FuncEnrich.pdf',analysis.par$out.dir.PLOT,use_comp),top_number = top_number, + Pv_thre = 0.1) + print(sprintf('Finish TopDA_FuncEnrich plot by draw.funcEnrich.cluster(), please check %s', + sprintf('%s/%s_TopDA_FuncEnrich.pdf',analysis.par$out.dir.PLOT,use_comp))) + }else{ + message('Too few results for Function enrichment analysis for top drivers, will pass the FuncEnrich Plot!') + } + print('Begin TopDA_BubblePlot plot by draw.bubblePlot() ') + draw.bubblePlot(driver_list = driver_list,show_label = ms_tab[driver_list,'gene_label'], + transfer2symbol2type = transfer_tab, + target_list=analysis.par$merge.network$target_list, + Z_val=driver_DA_Z,Pv_thre=0.1,top_geneset_number = top_number, + top_driver_number = top_number,use_gs=use_gs, + pdf_file=sprintf('%s/%s_TopDA_BubblePlot.pdf',analysis.par$out.dir.PLOT,use_comp)) + print(sprintf('Finish TopDA_BubblePlot plot by draw.bubblePlot(), please check %s',sprintf('%s/%s_TopDA_BubblePlot.pdf',analysis.par$out.dir.PLOT,use_comp))) + } + ## detailed for top + pf <- DE$logFC; names(pf)<-DE$ID + print('Begin Each TopDA_GSEA plot by draw.GSEA() ') + pdf(sprintf('%s/%s_EachTopDA_GSEA.pdf',analysis.par$out.dir.PLOT,use_comp),width=8,height=8) + for(each_driver in driver_list){ + print(each_driver) + use_direction <- base::sign(analysis.par$merge.network$target_list[[each_driver]]$spearman) + if(length(unique(use_direction))==1){ + if(unique(use_direction)==1){ + use_direction <- NULL + } + } + draw.GSEA(rank_profile = pf,use_genes = analysis.par$merge.network$target_list[[each_driver]]$target, + use_direction = use_direction, + annotation=sprintf('P.Value:%s',get_z2p(driver_DA_Z[each_driver])), + left_annotation = sprintf('High in %s',G1), + right_annotation = sprintf('High in %s',G0),main=ms_tab[each_driver,'gene_label']) + } + dev.off() + print(sprintf('Finish EachTopDA_GSEA plot by draw.GSEA(), please check %s',sprintf('%s/%s_EachTopDA_GSEA.pdf',analysis.par$out.dir.PLOT,use_comp))) + # + print('Begin EachTopDA_CateBox plot by draw.categoryValue() ') + pdf(sprintf('%s/%s_EachTopDA_CateBox.pdf',analysis.par$out.dir.PLOT,use_comp),width=8,height=8) + for(each_driver in driver_list){ + if(ms_tab[each_driver,'originalID'] %in% rownames(exp_mat)){ + draw.categoryValue(ac_val = ac_mat[ms_tab[each_driver,'originalID_label'],], + exp_val = exp_mat[ms_tab[each_driver,'originalID'],], + main_ac = sprintf("%s\n(P.Value:%s)",ms_tab[each_driver,'gene_label'],get_z2p(driver_DA_Z[each_driver])), + main_exp = sprintf("%s\n(P.Value:%s)",ms_tab[each_driver,'geneSymbol'],get_z2p(driver_DE_Z[each_driver])), + use_obs_class=get_obs_label(phe,intgroup)) + }else{ + draw.categoryValue(ac_val = ac_mat[ms_tab[each_driver,'originalID_label'],], + main_ac = sprintf("%s\n(P.Value:%s)",ms_tab[each_driver,'gene_label'],get_z2p(driver_DA_Z[each_driver])), + use_obs_class=get_obs_label(phe,intgroup)) + } + } + dev.off() + print(sprintf('Finish EachTopDA_CateBox plot by draw.categoryValue(), please check %s',sprintf('%s/%s_EachTopDA_CateBox.pdf',analysis.par$out.dir.PLOT,use_comp))) + # + print('Begin EachTopDA_TargetNet plot by draw.targetNet() ') + pdf(sprintf('%s/%s_EachTopDA_TargetNet.pdf',analysis.par$out.dir.PLOT,use_comp),width=8,height=8) + for(each_driver in driver_list){ + if('spearman' %in% colnames(analysis.par$merge.network$target_list[[each_driver]])){ + es <- analysis.par$merge.network$target_list[[each_driver]]$MI*sign(analysis.par$merge.network$target_list[[each_driver]]$spearman); + }else{ + es <- rep(1,length.out=length(analysis.par$merge.network$target_list[[each_driver]]$target)) + } + names(es) <- analysis.par$merge.network$target_list[[each_driver]]$target + if(main_id_type!='external_gene_name'){ + es <- base::cbind(es,es) + colnames(es) <- c('Sample1','Sample2') + tmp_eset <- generate.eset(es) + if('external_transcript_name' %in% names(transfer_tab)){ + tmp_eset <- update_eset.feature(tmp_eset,use_feature_info = transfer_tab, + from_feature = main_id_type,to_feature = 'external_transcript_name') + }else{ + tmp_eset <- update_eset.feature(tmp_eset,use_feature_info = transfer_tab, + from_feature = main_id_type,to_feature = 'external_gene_name') + } + es <- Biobase::exprs(tmp_eset)[,1];names(es) <- rownames(Biobase::exprs(tmp_eset)) + } + if(base::length(es)<30){n_layer=1;label_cex=1;} + if(base::length(es)<50 & base::length(es)>=30){n_layer=1;label_cex=0.8;} + if(base::length(es)>=50){n_layer <- 1+floor(base::length(es)/50);label_cex=0.7;} + if(n_layer>=4){n_layer <- n_layer/2; label_cex<-label_cex-0.1;} + if(n_layer>=4){n_layer <- n_layer/2; label_cex<-label_cex-0.1;} + draw.targetNet(source_label = ms_tab[each_driver,'gene_label'],source_z = driver_DA_Z[each_driver], + edge_score=es,n_layer = n_layer,label_cex=label_cex) + } + dev.off() + print(sprintf('Finish EachTopDA_TargetNet plot by draw.targetNet(), please check %s',sprintf('%s/%s_EachTopDA_TargetNet.pdf',analysis.par$out.dir.PLOT,use_comp))) + message(sprintf('Finish All !!! Check %s',analysis.par$out.dir.PLOT)) + return(TRUE) +} + +## +#' Lazy mode for NetBID2 driver estimation +#' +#' \code{NetBID.lazyMode.DriverEstimation} is an integrated function for NetBID2 driver estimation. +#' +#' The function will return the complicated list object analysis.par if set return_analysis.par=TRUE. +#' Meanwhile, the master table and the RData containing analysis.par will be automatically saved. +#' +#' @param project_main_dir character, name or absolute path of the main working directory for driver analysis. +#' @param project_name character, name of the project folder. +#' @param tf.network.file character, the path of the TF network file (e.g. "XXX/consensus_network_ncol_.txt"). +#' @param sig.network.file character, the path of the SIG network file (e.g. "XXX/consensus_network_ncol_.txt"). +#' @param cal.eset ExpressionSet class, the ExpressionSet for analysis. +#' @param main_id_type character, the type of driver's ID. It comes from the attribute name in biomaRt package. +#' Such as "ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id", "ensembl_transcript_id_version" or "refseq_mrna". +#' For details, user can call \code{biomaRt::listAttributes()} to display all available attributes in the selected dataset. +#' @param cal.eset_main_id_type character, the type of cal.eset's ID. It comes from the attribute name in biomaRt package. +#' Such as "ensembl_gene_id", "ensembl_gene_id_version", "ensembl_transcript_id", "ensembl_transcript_id_version" or "refseq_mrna". +#' For details, user can call \code{biomaRt::listAttributes()} to display all available attributes in the selected dataset. +#' @param use_level character, users can choose "transcript" or "gene". Default is "gene". +#' @param transfer_tab data.frame, the ID conversion table. Users can call \code{get_IDtransfer} to get this table. +#' Only useful when "cal.eset_main_id_type" does not equal to "main_id_type". +#' This is mainly for converting ID for cal.eset, must include "cal.eset_main_id_type" and "main_id_type". +#' If NULL, will automatically generate it. +#' @param intgroup character, one interested phenotype group from the \code{cal.eset}. +#' @param G1_name character, the name of experimental group (e.g. "Male"), must be the character in \code{intgroup}. +#' @param G0_name character, the name of control group (e.g. "Female"), must be the character in \code{intgroup}. +#' @param comp_name character, the name of the comparison of interest. +#' @param do.QC logical, if TRUE, will perform network QC and activity eSet QC plots. Default is TRUE. +#' @param DE_strategy character, use limma or bid to calculate differentiated expression/activity. Default is 'bid'. +#' @param return_analysis.par logical, if TRUE, will return the complicated list object analysis.par. +#' +#' @examples +#' \dontrun{ +#' network.dir <- sprintf('%s/demo1/network/',system.file(package = "NetBID2")) # use demo +#' network.project.name <- 'project_2019-02-14' # demo project name +#' project_main_dir <- 'test/' +#' project_name <- 'test_driver' +#' tf.network.file <- sprintf('%s/SJAR/%s/output_tf_sjaracne_%s_out_.final/%s', +#' network.dir,network.project.name,network.project.name, +#' 'consensus_network_ncol_.txt') +#' sig.network.file <- sprintf('%s/SJAR/%s/output_sig_sjaracne_%s_out_.final/%s', +#' network.dir,network.project.name,network.project.name, +#' 'consensus_network_ncol_.txt') +#' load(sprintf('%s/DATA/network.par.Step.exp-QC.RData',network.dir)) +#' cal.eset <- network.par$net.eset +#' db.preload(use_level='gene') +#' analysis.par <- NetBID.lazyMode.DriverEstimation(project_main_dir=project_main_dir, +#' project_name=project_name, +#' tf.network.file=tf.network.file, +#' sig.network.file=sig.network.file, +#' cal.eset=cal.eset, +#' main_id_type='external_gene_name', +#' cal.eset_main_id_type='external_gene_name', +#' intgroup='subgroup', +#' G1_name='G4',G0_name='SHH', +#' comp_name='G4.Vs.SHH', +#' do.QC=FALSE,return_analysis.par=TRUE) +#' } +#' @export +NetBID.lazyMode.DriverEstimation <- function(project_main_dir=NULL,project_name=NULL, + tf.network.file=NULL,sig.network.file=NULL, + cal.eset=NULL, + main_id_type=NULL,cal.eset_main_id_type=NULL,use_level='gene', + transfer_tab=NULL, + intgroup=NULL,G1_name=NULL,G0_name=NULL,comp_name=NULL, + do.QC=TRUE,DE_strategy='bid',return_analysis.par=TRUE){ + # + if(exists('analysis.par')==TRUE){ + stop('analysis.par is occupied in the current session,please manually run: rm(analysis.par) and re-try, otherwise will not change !'); + } + all_input_para <- c('project_main_dir','project_name','tf.network.file','sig.network.file','cal.eset', + 'main_id_type','cal.eset_main_id_type','intgroup','G1_name','G0_name','DE_strategy','use_level') + check_res <- sapply(all_input_para,function(x)check_para(x,envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + + check_res <- c(check_option('DE_strategy',c('limma','bid'),envir=environment()), + check_option('use_level',c('gene','transcript'),envir=environment())) + if(base::min(check_res)==0){message('Please check and re-try!');return(FALSE)} + + if(exists('tf_sigs')==FALSE){ + message('Please run db.preload() first to load in db_info !');return(FALSE) + } + if('ensembl_transcript_id' %in% names(tf_sigs$tf)){ + message('You setting is at transcript level! If this setting is not TRUE, please run db.preload() again !') + }else{ + message('You setting is at gene level! If this setting is not TRUE, please run db.preload() again !') + } + print('Current db info:') + print(db_info) + if(!intgroup %in% colnames(Biobase::pData(cal.eset))){ + message(sprintf('%s not included in the Biobase::pData(cal.eset), please check and re-try!',use_comp));return(FALSE) + } + phe <- Biobase::pData(cal.eset) + G1 <- rownames(phe)[which(phe[,intgroup]==G1_name)];G0 <- rownames(phe)[which(phe[,intgroup]==G0_name)]; + if(base::length(G1)==0){message(sprintf('NO Sample annotated by %s in %s, please check and re-try!',G1_name,intgroup));return(FALSE)} + if(base::length(G0)==0){message(sprintf('NO Sample annotated by %s in %s, please check and re-try!',G0_name,intgroup));return(FALSE)} + if(is.null(comp_name)==TRUE) comp_name <- sprintf('%s.Vs.%s',G1_name,G0_name) + if(file.exists(tf.network.file)==FALSE){message('%s not exists, please check and re-try!');return(FALSE)} + if(file.exists(sig.network.file)==FALSE){message('%s not exists, please check and re-try!');return(FALSE)} + # create workspace + print('Begin create workspace by NetBID.analysis.dir.create() ') + analysis.par <- NetBID.analysis.dir.create(project_main_dir=project_main_dir, + project_name=project_name, + tf.network.file=tf.network.file, + sig.network.file=sig.network.file) + print('Finish create workspace') + # read in network + print('Begin read in network from file by get.SJAracne.network() ') + analysis.par$tf.network <- get.SJAracne.network(analysis.par$tf.network.file) + analysis.par$sig.network <- get.SJAracne.network(analysis.par$sig.network.file) + print('Finish read in network from file') + if(do.QC==TRUE & nrow(analysis.par$tf.network$network_dat)>0) draw.network.QC(analysis.par$tf.network$igraph_obj,outdir = analysis.par$out.dir.QC,prefix = 'TF_') + if(do.QC==TRUE & nrow(analysis.par$sig.network$network_dat)>0) draw.network.QC(analysis.par$sig.network$igraph_obj,outdir = analysis.par$out.dir.QC,prefix = 'SIG_') + analysis.par$merge.network <- merge_TF_SIG.network(analysis.par$tf.network,analysis.par$sig.network) + if(cal.eset_main_id_type!=main_id_type){ + message(sprintf('The ID type for the network is %s, and the ID type for the cal.eset is %s, need to transfer the ID type of cal.eset from %s to %s', + main_id_type,cal.eset_main_id_type,cal.eset_main_id_type,main_id_type)) + if(is.null(transfer_tab)==FALSE){ + w1 <- base::setdiff(c(cal.eset_main_id_type,main_id_type),colnames(transfer_tab)) + if(base::length(w1)>0){ + message(sprintf('Wrong ID type for the input transfer_tab, missing %s, try to prepare the correct transfer_tab or set it to NULL',base::paste(w1,collapse=';'))); + return(FALSE) + } + } + if(is.null(transfer_tab)==FALSE){ + transfer_tab1 <- transfer_tab + }else{ + transfer_tab1 <- get_IDtransfer(from_type = cal.eset_main_id_type,to_type=main_id_type,use_genes=rownames(Biobase::exprs(cal.eset)), + ignore_version=TRUE) + } + cal.eset <- update_eset.feature(cal.eset,use_feature_info = transfer_tab1, + from_feature = cal.eset_main_id_type,to_feature = main_id_type) + message('Finish transforming the cal.eset ID ! ') + } + es.method <- 'mean' + if('MI' %in% colnames(analysis.par$merge.network$network_dat) & 'spearman' %in% colnames(analysis.par$merge.network$network_dat)) es.method <- 'weightedmean' + print(sprintf('Begin calculate activity by cal.Activity(), the setting for es.method is %s ',es.method)) + ac_mat <- cal.Activity(igraph_obj = analysis.par$merge.network$igraph_obj, + cal_mat = Biobase::exprs(cal.eset),es.method=es.method) + analysis.par$merge.ac.eset <- generate.eset(exp_mat=ac_mat,phenotype_info = phe) + if(do.QC==TRUE) draw.eset.QC(analysis.par$merge.ac.eset,outdir = analysis.par$out.dir.QC,prefix = 'AC_') + print('Finish calculate activity ') + print(sprintf('Begin prepare ID transfer table to external_%s_name',use_level)) + use_genes = base::unique(c(analysis.par$final_ms_tab$originalID, + analysis.par$merge.network$network_dat$target)) + transfer_tab <- get_IDtransfer2symbol2type(from_type=main_id_type, + use_genes = use_genes, + use_level=use_level,ignore_version=TRUE) + print(str(transfer_tab)) + print('Finish prepare ID transfer table ') + # DE/DA + print(sprintf('Begin get DE/DA for %s by getDE.limma.2G() ',comp_name)) + if(DE_strategy=='limma'){ + de <- getDE.limma.2G(cal.eset,G1 = G1,G0=G0,G1_name=G1_name,G0_name=G0_name) + da <- getDE.limma.2G(analysis.par$merge.ac.eset,G1 = G1,G0=G0,G1_name=G1_name,G0_name=G0_name) + }else{ + de <- getDE.BID.2G(cal.eset,G1 = G1,G0=G0,G1_name=G1_name,G0_name=G0_name) + da <- getDE.BID.2G(analysis.par$merge.ac.eset,G1 = G1,G0=G0,G1_name=G1_name,G0_name=G0_name) + } + DE <- list(de); DA <- list(da); names(DE) <- names(DA) <- comp_name + print(sprintf('Finish get DE/DA for %s',comp_name)) + # + print('Begin generate master table by generate.masterTable() ') + ms_tab <- generate.masterTable(use_comp=comp_name,DE=DE,DA=DA,target_list = analysis.par$merge.network$target_list, + main_id_type = main_id_type,transfer_tab=transfer_tab,tf_sigs = tf_sigs) + print('Finish generate master table') + out_file <- sprintf('%s/%s_ms_tab.xlsx',analysis.par$out.dir.DATA,analysis.par$project.name) + out2excel(ms_tab,out.xlsx = out_file) + message(sprintf('Finish output master table to excel file, please check %s',out_file)) + analysis.par$DE <- DE;analysis.par$DA <- DA; + analysis.par$final_ms_tab <- ms_tab;analysis.par$cal.eset <- cal.eset + analysis.par$transfer_tab <- transfer_tab + NetBID.saveRData(analysis.par = analysis.par,step = 'ms-tab') + message(sprintf('Finish save RData to file, please check %s/analysis.par.Step.ms-tab.RData',analysis.par$out.dir.DATA)) + if(return_analysis.par==TRUE) return(analysis.par) else return(TRUE) +} +