add the src folder and update the documentation

cole-trapnell-lab · Nov 13, 2017 · aeae6cb · aeae6cb
1 parent 9affc08
commit aeae6cb
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 12 deletions.
diff --git a/R/clustering.R b/R/clustering.R
@@ -54,16 +54,15 @@ clusterGenes<-function(expr_matrix, k, method=function(x){as.dist((1 - cor(Matri
 #' @param peaks A numeric vector indicates the index of density peaks used for clustering. This vector should be retrieved from the decision plot with caution. No checking involved.  
 #' will automatically calculated based on the top num_cluster product of rho and sigma. 
 #' @param gaussian A logic flag passed to densityClust function in desnityClust package to determine whether or not Gaussian kernel will be used for calculating the local density
-#' @param frequency_thresh When a CellTypeHierarchy is provided, cluster cells will impute cell types in clusters that are composed of at least this much of exactly one cell type.
-#' @param verbose Verbose parameter for DDRTree
 #' @param cell_type_hierarchy A data structure used for organizing functions that can be used for organizing cells 
+#' @param frequency_thresh When a CellTypeHierarchy is provided, cluster cells will impute cell types in clusters that are composed of at least this much of exactly one cell type.
 #' @param enrichment_thresh fraction to be multipled by each cell type percentage. Only used if frequency_thresh is NULL, both cannot be NULL
 #' @param clustering_genes a vector of feature ids (from the CellDataSet's featureData) used for ordering cells
-#' @param k number of kNN used in creating the k nearest neighbor graph for Louvain clustering. Default to be 50
+#' @param k number of kNN used in creating the k nearest neighbor graph for Louvain clustering. The number of kNN is related to the resolution of the clustering result, bigger number of kNN gives low resolution and vice versa. Default to be 50
 #' @param louvain_iter number of iterations used for Louvain clustering. The clustering result gives the largest modularity score will be used as the final clustering result.  Default to be 1. 
 #' @param weight A logic argument to determine whether or not we will use Jaccard coefficent for two nearest neighbors (based on the overlapping of their kNN) as the weight used for Louvain clustering. Default to be FALSE.
-#' @param method method for clustering cells. By default, we use density peak clustering algorithm for clustering. 
-#' The other method is based on DDRTree. 
+#' @param method method for clustering cells. Three methods are available, including densityPeak, louvian and DDRTree. By default, we use density peak clustering algorithm for clustering. For big datasets (like data with 50 k cells or so), we recommend using the louvain clustering algorithm. 
+#' @param verbose Verbose A logic flag to determine whether or not we should print the running details. 
 #' @param ... Additional arguments passed to \code{\link{densityClust}()}
 #' @return an updated CellDataSet object, in which phenoData contains values for Cluster for each cell
 #' @importFrom densityClust densityClust findClusters
@@ -72,8 +71,8 @@ clusterGenes<-function(expr_matrix, k, method=function(x){as.dist((1 - cor(Matri
 #' @importFrom RANN nn2
 #' @useDynLib monocle
 #' @references Rodriguez, A., & Laio, A. (2014). Clustering by fast search and find of density peaks. Science, 344(6191), 1492-1496. doi:10.1126/science.1242072
+#' @references Vincent D. Blondel, Jean-Loup Guillaume, Renaud Lambiotte, Etienne Lefebvre: Fast unfolding of communities in large networks. J. Stat. Mech. (2008) P10008
 #' @references Jacob H. Levine and et.al. Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like Cells that Correlate with Prognosis. Cell, 2015. 
-#' @references 
 #' @export
 
 clusterCells <- function(cds, 
@@ -98,7 +97,7 @@ clusterCells <- function(cds,
 
   if(ncol(cds) > 500000) {
     if(method %in% c('densityPeak', "DDRTree")) {
-      warning('Number of cells in your data is larger than 50 k, clusterCells with densityPeak or DDRTree may crash!')
+      warning('Number of cells in your data is larger than 50 k, clusterCells with densityPeak or DDRTree may crash. Please try to use the newly added Louvain clustering algorithm!')
     }
   }
 
@@ -248,7 +247,7 @@ clusterCells <- function(cds,
     }
 
     if(verbose) {
-      message("Run Rphenograph starts:","\n", 
+      message("Run phenograph starts:","\n", 
               "  -Input data of ", nrow(data)," rows and ", ncol(data), " columns","\n",
               "  -k is set to ", k)
     }

diff --git a/man/clusterCells.Rd b/man/clusterCells.Rd
diff --git a/src/.gitignore b/src/.gitignore
@@ -0,0 +1,3 @@
+*.o
+*.so
+*.dll
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -0,0 +1,29 @@
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// jaccard_coeff
+NumericMatrix jaccard_coeff(NumericMatrix idx, bool weight);
+RcppExport SEXP _monocle_jaccard_coeff(SEXP idxSEXP, SEXP weightSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< NumericMatrix >::type idx(idxSEXP);
+    Rcpp::traits::input_parameter< bool >::type weight(weightSEXP);
+    rcpp_result_gen = Rcpp::wrap(jaccard_coeff(idx, weight));
+    return rcpp_result_gen;
+END_RCPP
+}
+
+static const R_CallMethodDef CallEntries[] = {
+    {"_monocle_jaccard_coeff", (DL_FUNC) &_monocle_jaccard_coeff, 2},
+    {NULL, NULL, 0}
+};
+
+RcppExport void R_init_monocle(DllInfo *dll) {
+    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+}
diff --git a/src/clustering.cpp b/src/clustering.cpp
@@ -0,0 +1,59 @@
+#include <Rcpp.h>
+using namespace Rcpp;
+
+// Compute jaccard coefficient between nearest-neighbor sets
+//
+// Weights of both i->j and j->i are recorded if they have intersection. In this case
+// w(i->j) should be equal to w(j->i). In some case i->j has weights while j<-i has no
+// intersections, only w(i->j) is recorded. This is determinded in code `if(u>0)`. 
+// In this way, the undirected graph is symmetrized by halfing the weight 
+// in code `weights(r, 2) = u/(2.0*ncol - u)/2`.
+//
+// Author: Chen Hao, Date: 25/09/2015; updated by Xiaojie Qiu Nov. 12, 2017
+
+
+// [[Rcpp::export]]
+NumericMatrix jaccard_coeff(NumericMatrix idx, bool weight) {
+  int nrow = idx.nrow(), ncol = idx.ncol(), r = 0;
+  NumericMatrix weights(nrow*ncol, 3);
+
+  for(int i = 0; i < nrow; i ++) {
+    for(int j = 0; j < ncol; j ++) {
+      int k = idx(i,j) - 1;
+
+      weights(r, 0) = i + 1;
+      weights(r, 1) = k + 1;
+      weights(r, 2) = 1;
+
+      if(weight == TRUE) {
+
+        NumericVector nodei = idx(i, _);
+        NumericVector nodej = idx(k, _);
+
+        int u = intersect(nodei, nodej).size();  // count intersection number
+        int v = 2 * ncol - u;  // count union number
+
+        if(u>0) { 
+          // weights(r, 0) = i + 1;
+          // weights(r, 1) = k + 1;
+          // weights(r, 2) = u / (2.0 * ncol - u) / 2;  // symmetrize the graph
+
+          weights(r, 2) = u / v;  // normalize the values
+        }
+      }
+
+      r ++;
+
+    }
+  }
+
+  weights(_, 2) = weights(_, 2) / max(weights(_, 2));
+
+  return weights;
+}
+
+/***
+ edges$C = jaccard_dist
+ edges = subset(edges, C != 0)
+ edges$C = edges$C/max(edges$C)
+ */
-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+    *.o
+    *.so
+    *.dll