Merge pull request #21 from pkathail/develop

Latest develop changes
KrishnaswamyLab · Mar 31, 2017 · b53d723 · b53d723
2 parents 2054d0e + b7873e9
commit b53d723
Show file tree

Hide file tree

Showing 11 changed files with 1,008 additions and 551 deletions.
diff --git a/README.md b/README.md
@@ -1,25 +1,29 @@
 Markov Affinity-based Graph Imputation of Cells (MAGIC)
 -------------------------------------------------------
 
+MAGIC has been implemented in Python3 and Matlab.
+
+#### Installation and dependencies for the Python version
+1. The Python3 version of MAGIC can be installed using:
 
-#### Installation and dependencies
-1. MAGIC has been implemented in Python3 and can be installed using
-2. 
         $> git clone git://github.com/pkathail/magic.git
         $> cd magic
-        $> sudo pip3 install .
+        $> sudo -H pip3 install .
 
 2. MAGIC depends on a number of `python3` packages available on pypi and these dependencies are listed in `setup.py`
 All the dependencies will be automatically installed using the above commands
 
 #### Usage
 
 ##### Command line
-A tutorial on MAGIC usage and results visualization for single cell RNA-seq data can be found in this notebook: http://nbviewer.jupyter.org/github/pkathail/magic/blob/magic_develop/notebooks/Magic_single_cell_RNAseq.ipynb
+A tutorial on MAGIC usage and results visualization for single cell RNA-seq data can be found in this notebook: http://nbviewer.jupyter.org/github/pkathail/magic/blob/develop/notebooks/Magic_single_cell_RNAseq.ipynb
 
 
 ##### GUI
 A python GUI is now available for MAGIC. After following the installation steps listed below, the GUI can be invoked using
 
         $> magic_gui.py
 
+#### Installation and dependencies for the Matlab version
+1. Matlab implementation of MAGIC uses Mauro Maggioni's Diffusion Geometry code. Download from here: http://www.math.jhu.edu/~mauro/Code/DiffusionGeometry_01.zip or use included DiffusionGeometry_01.zip
+2. test_magic.m shows how to run MAGIC. Also included is a function for loading 10x format data (load_10x.m)
diff --git a/docs/magic_tutorial.pptx b/docs/magic_tutorial.pptx
diff --git a/matlab/DiffusionGeometry_01.zip b/matlab/DiffusionGeometry_01.zip
diff --git a/matlab/load_10x.m b/matlab/load_10x.m
@@ -0,0 +1,60 @@
+function [data, gene_names, gene_ids, cells] = load_10x(data_dir, varargin)
+% [data, gene_names, gene_ids, cells] = load_10x(data_dir, varargin)
+%   loads 10x sparse format data
+%   data_dir is dir that contains matrix.mtx, genes.tsv and barcodes.tsv
+%   varargin
+%       'sparse', true -- returns data matrix in sparse format (default 'false')
+
+return_sparse = false;
+
+if isempty(data_dir)
+    data_dir = './';
+elseif data_dir(end) ~= '/'
+    data_dir = [data_dir '/']; 
+end
+
+for i=1:length(varargin)-1
+    if (strcmp(varargin{i}, 'sparse'))
+        return_sparse = varargin{i+1};
+    end
+end
+
+filename_dataMatrix = [data_dir 'matrix.mtx'];
+filename_genes = [data_dir 'genes.tsv'];
+filename_cells = [data_dir 'barcodes.tsv'];
+
+
+% Read in gene expression matrix (sparse matrix)
+% Rows = genes, columns = cells
+fprintf('LOADING\n')
+dataMatrix = mmread(filename_dataMatrix);
+fprintf('  Data matrix (%i cells x %i genes): %s\n', ...
+        size(dataMatrix'), ['''' filename_dataMatrix '''' ])
+
+% Read in row names (gene names / IDs)
+dataMatrix_genes = table2cell( ...
+                   readtable(filename_genes, ...
+                             'FileType','text','ReadVariableNames',0));
+dataMatrix_cells = table2cell( ...
+                   readtable(filename_cells, ...
+                             'FileType','text','ReadVariableNames',0));
+
+% Remove empty cells
+col_keep = any(dataMatrix,1);
+dataMatrix = dataMatrix(:,col_keep);
+dataMatrix_cells = dataMatrix_cells(col_keep,:);
+fprintf('  Removed %i empty cells\n', full(sum(~col_keep)))
+
+% Remove empty genes
+genes_keep = any(dataMatrix,2);
+dataMatrix = dataMatrix(genes_keep,:);
+dataMatrix_genes = dataMatrix_genes(genes_keep,:);
+fprintf('  Removed %i empty genes\n', full(sum(~genes_keep)))
+
+data = dataMatrix';
+if ~return_sparse
+    data = full(data);
+end
+gene_names = dataMatrix_genes(:,2);
+gene_ids = dataMatrix_genes(:,1);
+cells = dataMatrix_cells;
diff --git a/matlab/run_magic.m b/matlab/run_magic.m
@@ -0,0 +1,91 @@
+function data_imputed = run_magic(data, t, varargin)
+% run MAGIC
+
+% data must have cells on the rows and genes on the columns
+% t is diffusion time
+% varargin:
+%   'npca' (defauklt = 20)
+%       perform fast random PCA before computing distances
+%   'ka' (default = 10)
+%       k of adaptive kernel
+%       0 for non-adaptive (standard gaussian) kernel with bandwidth sigma
+%   'k' (defauklt = 30)
+%       k of kNN graph
+%   'sigma' (default = 1)
+%       sigma of kernel bandwidth
+%       for adaptive kernel (ka>0) the effective bandwidth is sigma * the
+%       distance to the ka-th neighbor
+%   'rescale_to' (default = 0, no rescale)
+%       rescale genes to 'rescale_to' percentile
+%       set to 0 for log scaled data
+
+% set up default parameters
+k = 30;
+ka = 10;
+npca = 20;
+sigma = 1;
+rescale_to = 0;
+
+% get the input parameters
+if ~isempty(varargin)
+    for j = 1:length(varargin)
+        % k nearest neighbor 
+        if strcmp(varargin{j}, 'ka')
+            ka = varargin{j+1};
+        end
+        % for knn-autotune
+        if strcmp(varargin{j}, 'k')
+            k = varargin{j+1};
+        end
+        % npca to project data
+        if strcmp(varargin{j}, 'npca')
+            npca = varargin{j+1};
+        end
+        % sigma of kernel bandwidth
+        if strcmp(varargin{j}, 'sigma')
+            sigma = varargin{j+1};
+        end
+        % sigma of kernel bandwidth
+        if strcmp(varargin{j}, 'rescale_to')
+            rescale_to = varargin{j+1};
+        end
+    end
+end
+
+% Kernel
+disp 'Computing kernel'
+Options.Display = 1;
+Options.Epsilon = sigma;
+Options.kNN = k;
+Options.kNNAutotune = ka;
+Options.NNMaxDim = npca;
+Options.Normalization = 'markov';
+G = GraphDiffusion(data', 0, Options);
+L = full(G.T);
+
+% Diffuse
+disp(['Diffusing for ' num2str(t) ' steps']);
+L_t = L^t;
+
+% Impute
+disp 'Imputing'
+data_imputed = L_t * data;
+
+% Rescale
+if rescale_to > 0
+    if ~any(data(:)<0)
+        disp 'Rescaling'
+        MR = prctile(data, rescale_to);
+        M = max(data);
+        MR(MR == 0) = M(MR == 0);
+        MR_new = prctile(data_imputed, rescale_to);
+        M_new = max(data_imputed);
+        MR_new(MR_new == 0) = M_new(MR_new == 0);
+        max_ratio = MR ./ MR_new;
+        data_imputed = data_imputed .* repmat(max_ratio, size(data,1), 1);
+    else
+        disp('Negative values detected (log scaled?) so no rescale is done.')
+    end
+end
+
+disp 'done'
diff --git a/matlab/test_magic.m b/matlab/test_magic.m
@@ -0,0 +1,87 @@
+%% init
+% Matlab implementation of MAGIC uses Mauro Maggioni's Diffusion Geometry code
+% download from: http://www.math.jhu.edu/~mauro/Code/DiffusionGeometry_01.zip
+addpath(genpath('DiffusionGeometry/'));
+
+%% load data (e.g. 10x data)
+% data should be cells as rows and genes as columns
+sample_dir = 'path_to_data/';
+[data, gene_names, gene_ids, cells] = load_10x(sample_dir);
+
+%% library size normalization
+libsize  = sum(data,2);
+data = bsxfun(@rdivide, data, libsize) * median(libsize);
+
+%% log transform -- some data requires log transform
+%data = log(data + 0.1); % 0.1 is pseudocount
+
+%% MAGIC
+npca = 20; % ususally between 10 and 200
+ka = 10; % can be smaller, eg 3
+k = 30; % can be smaller, eg 9
+t = 6; % usually between 6 and 12, smaller ka/k requitres bigger t
+rescale_to = 99; % 0 (no rescale) if data is log scaled
+data_imputed = run_magic(data, t, 'npca', npca, 'ka', ka, 'k', k, 'rescale_to', rescale_to);
+
+%% plot
+plot_genes = {'Cdh1', 'Vim', 'Fn1', 'Zeb1'};
+ms = 20;
+v = [-45 20];
+% before MAGIC
+x = data(:, ismember(lower(gene_names), lower(plot_genes{1})));
+y = data(:, ismember(lower(gene_names), lower(plot_genes{2})));
+z = data(:, ismember(lower(gene_names), lower(plot_genes{3})));
+c = data(:, ismember(lower(gene_names), lower(plot_genes{4})));
+figure;
+subplot(2,2,1);
+scatter(y, x, ms, c, 'filled');
+colormap(parula);
+axis tight
+xlabel(plot_genes{2});
+ylabel(plot_genes{1});
+%h = colorbar;
+%ylabel(h,plot_genes{4});
+title 'Before MAGIC'
+
+subplot(2,2,2);
+scatter3(x, y, z, ms, c, 'filled');
+colormap(parula);
+axis tight
+xlabel(plot_genes{1});
+ylabel(plot_genes{2});
+zlabel(plot_genes{3});
+h = colorbar;
+ylabel(h,plot_genes{4});
+view(v);
+title 'Before MAGIC'
+
+% plot after MAGIC
+x = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{1})));
+y = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{2})));
+z = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{3})));
+c = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{4})));
+subplot(2,2,3);
+scatter(y, x, ms, c, 'filled');
+colormap(parula);
+axis tight
+xlabel(plot_genes{2});
+ylabel(plot_genes{1});
+%h = colorbar;
+%ylabel(h,plot_genes{4});
+title 'After MAGIC'
+
+subplot(2,2,4);
+scatter3(x, y, z, ms, c, 'filled');
+colormap(parula);
+axis tight
+xlabel(plot_genes{1});
+ylabel(plot_genes{2});
+zlabel(plot_genes{3});
+h = colorbar;
+ylabel(h,plot_genes{4});
+view(v);
+title 'After MAGIC'
+
+
+
+
diff --git a/notebooks/Magic_single_cell_RNAseq.ipynb b/notebooks/Magic_single_cell_RNAseq.ipynb
diff --git a/setup.py b/setup.py
@@ -1,18 +1,13 @@
 import os
 import sys
 import shutil
-from subprocess import call
 from setuptools import setup
 from warnings import warn
 
 if sys.version_info.major != 3:
     raise RuntimeError('Magic requires Python 3')
 
 
-# install phenograph
-call(['pip3', 'install', 'git+https://github.com/jacoblevine/phenograph.git'])
-
-
 setup(name='magic',
       version='0.0',
       description='MAGIC',
@@ -29,17 +24,11 @@
           'sklearn',
           'networkx',
           'fcsparser',
-          'statsmodels'],
+          'statsmodels',
+      ],
       scripts=['src/magic/magic_gui.py'],
       )
 
 
 # get location of setup.py
 setup_dir = os.path.dirname(os.path.realpath(__file__))
-
-# Copy test data
-data_dir = os.path.expanduser('~/.magic/data')
-if os.path.isdir(data_dir):
-    shutil.rmtree(data_dir)
-shutil.copytree(setup_dir + '/data/', data_dir)
-
diff --git a/src/magic/MAGIC.py b/src/magic/MAGIC.py
@@ -7,25 +7,18 @@
 from scipy.spatial.distance import squareform
 from sklearn.neighbors import NearestNeighbors
 
-def magic(data, kernel='gaussian', n_pca_components=20, random_pca=True, 
-          t=6, knn=30, knn_autotune=10, epsilon=1, rescale=99, k_knn=100, perplexity=30):
+def magic(data, n_pca_components=20, random_pca=True, 
+          t=6, knn=30, knn_autotune=10, epsilon=1, rescale=99):
 
-    if kernel not in ['gaussian']:
-        raise RuntimeError('Invalid kernel type. Must be "gaussian".')
-
-    #library size normalization
-    #create data_norm
-
-    #always pass in data_norm
     if n_pca_components != None:
+        print('doing PCA')
         pca_projected_data = run_pca(data, n_components=n_pca_components, random=random_pca)
     else:
         pca_projected_data = data
 
-    if kernel == 'gaussian':
-        #run diffusion maps to get markov matrix
-        L = compute_markov(pca_projected_data, knn=knn, epsilon=epsilon, 
-                                       distance_metric='euclidean', knn_autotune=knn_autotune)
+    #run diffusion maps to get markov matrix
+    L = compute_markov(pca_projected_data, knn=knn, epsilon=epsilon, 
+                       distance_metric='euclidean', knn_autotune=knn_autotune)
 
     #remove tsne kernel for now
     # else:
@@ -37,7 +30,6 @@ def magic(data, kernel='gaussian', n_pca_components=20, random_pca=True,
     #         P = _joint_probabilities(distances, perplexity, 1)
     #     P = squareform(P)
 
-    ## QUESTION -- should this happen for gaussian kernel too??
     #     #markov normalize P
     #     L = np.divide(P, np.sum(P, axis=1))
 
@@ -75,6 +67,11 @@ def impute_fast(data, L, t, rescale_percent=0, L_t=None, tprev=None):
 
     #rescale data
     if rescale_percent != 0:
+        if len(np.where(data_new < 0)[0]) > 0:
+            print('Rescaling should not be performed on log-transformed '
+                  '(or other negative) values. Imputed data returned unscaled.')
+            return data_new, L_t
+
         M99 = np.percentile(data, rescale_percent, axis=0)
         M100 = data.max(axis=0)
         indices = np.where(M99 == 0)[0]
@@ -94,6 +91,7 @@ def compute_markov(data, knn=10, epsilon=1, distance_metric='euclidean', knn_aut
     N = data.shape[0]
 
     # Nearest neighbors
+    print('Computing distances')
     nbrs = NearestNeighbors(n_neighbors=knn, metric=distance_metric).fit(data)
     distances, indices = nbrs.kneighbors(data)
 
@@ -108,6 +106,7 @@ def compute_markov(data, knn=10, epsilon=1, distance_metric='euclidean', knn_aut
                 distances[j] = np.divide(distances[j], temp[lMaxTempIdxs])
 
     # Adjacency matrix
+    print('Computing kernel')
     rows = np.zeros(N * knn, dtype=np.int32)
     cols = np.zeros(N * knn, dtype=np.int32)
     dists = np.zeros(N * knn)