Skip to content

Commit

Permalink
Merge pull request #21 from pkathail/develop
Browse files Browse the repository at this point in the history
Latest develop changes
  • Loading branch information
pkathail authored Mar 31, 2017
2 parents 2054d0e + b7873e9 commit b53d723
Show file tree
Hide file tree
Showing 11 changed files with 1,008 additions and 551 deletions.
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,29 @@
Markov Affinity-based Graph Imputation of Cells (MAGIC)
-------------------------------------------------------

MAGIC has been implemented in Python3 and Matlab.

#### Installation and dependencies for the Python version
1. The Python3 version of MAGIC can be installed using:

#### Installation and dependencies
1. MAGIC has been implemented in Python3 and can be installed using
2.
$> git clone git://github.com/pkathail/magic.git
$> cd magic
$> sudo pip3 install .
$> sudo -H pip3 install .

2. MAGIC depends on a number of `python3` packages available on pypi and these dependencies are listed in `setup.py`
All the dependencies will be automatically installed using the above commands

#### Usage

##### Command line
A tutorial on MAGIC usage and results visualization for single cell RNA-seq data can be found in this notebook: http://nbviewer.jupyter.org/github/pkathail/magic/blob/magic_develop/notebooks/Magic_single_cell_RNAseq.ipynb
A tutorial on MAGIC usage and results visualization for single cell RNA-seq data can be found in this notebook: http://nbviewer.jupyter.org/github/pkathail/magic/blob/develop/notebooks/Magic_single_cell_RNAseq.ipynb


##### GUI
A python GUI is now available for MAGIC. After following the installation steps listed below, the GUI can be invoked using

$> magic_gui.py

#### Installation and dependencies for the Matlab version
1. Matlab implementation of MAGIC uses Mauro Maggioni's Diffusion Geometry code. Download from here: http://www.math.jhu.edu/~mauro/Code/DiffusionGeometry_01.zip or use included DiffusionGeometry_01.zip
2. test_magic.m shows how to run MAGIC. Also included is a function for loading 10x format data (load_10x.m)
Binary file added docs/magic_tutorial.pptx
Binary file not shown.
Binary file added matlab/DiffusionGeometry_01.zip
Binary file not shown.
60 changes: 60 additions & 0 deletions matlab/load_10x.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
function [data, gene_names, gene_ids, cells] = load_10x(data_dir, varargin)
% [data, gene_names, gene_ids, cells] = load_10x(data_dir, varargin)
% loads 10x sparse format data
% data_dir is dir that contains matrix.mtx, genes.tsv and barcodes.tsv
% varargin
% 'sparse', true -- returns data matrix in sparse format (default 'false')

return_sparse = false;

if isempty(data_dir)
data_dir = './';
elseif data_dir(end) ~= '/'
data_dir = [data_dir '/'];
end

for i=1:length(varargin)-1
if (strcmp(varargin{i}, 'sparse'))
return_sparse = varargin{i+1};
end
end

filename_dataMatrix = [data_dir 'matrix.mtx'];
filename_genes = [data_dir 'genes.tsv'];
filename_cells = [data_dir 'barcodes.tsv'];


% Read in gene expression matrix (sparse matrix)
% Rows = genes, columns = cells
fprintf('LOADING\n')
dataMatrix = mmread(filename_dataMatrix);
fprintf(' Data matrix (%i cells x %i genes): %s\n', ...
size(dataMatrix'), ['''' filename_dataMatrix '''' ])

% Read in row names (gene names / IDs)
dataMatrix_genes = table2cell( ...
readtable(filename_genes, ...
'FileType','text','ReadVariableNames',0));
dataMatrix_cells = table2cell( ...
readtable(filename_cells, ...
'FileType','text','ReadVariableNames',0));

% Remove empty cells
col_keep = any(dataMatrix,1);
dataMatrix = dataMatrix(:,col_keep);
dataMatrix_cells = dataMatrix_cells(col_keep,:);
fprintf(' Removed %i empty cells\n', full(sum(~col_keep)))

% Remove empty genes
genes_keep = any(dataMatrix,2);
dataMatrix = dataMatrix(genes_keep,:);
dataMatrix_genes = dataMatrix_genes(genes_keep,:);
fprintf(' Removed %i empty genes\n', full(sum(~genes_keep)))

data = dataMatrix';
if ~return_sparse
data = full(data);
end
gene_names = dataMatrix_genes(:,2);
gene_ids = dataMatrix_genes(:,1);
cells = dataMatrix_cells;
91 changes: 91 additions & 0 deletions matlab/run_magic.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
function data_imputed = run_magic(data, t, varargin)
% run MAGIC

% data must have cells on the rows and genes on the columns
% t is diffusion time
% varargin:
% 'npca' (defauklt = 20)
% perform fast random PCA before computing distances
% 'ka' (default = 10)
% k of adaptive kernel
% 0 for non-adaptive (standard gaussian) kernel with bandwidth sigma
% 'k' (defauklt = 30)
% k of kNN graph
% 'sigma' (default = 1)
% sigma of kernel bandwidth
% for adaptive kernel (ka>0) the effective bandwidth is sigma * the
% distance to the ka-th neighbor
% 'rescale_to' (default = 0, no rescale)
% rescale genes to 'rescale_to' percentile
% set to 0 for log scaled data

% set up default parameters
k = 30;
ka = 10;
npca = 20;
sigma = 1;
rescale_to = 0;

% get the input parameters
if ~isempty(varargin)
for j = 1:length(varargin)
% k nearest neighbor
if strcmp(varargin{j}, 'ka')
ka = varargin{j+1};
end
% for knn-autotune
if strcmp(varargin{j}, 'k')
k = varargin{j+1};
end
% npca to project data
if strcmp(varargin{j}, 'npca')
npca = varargin{j+1};
end
% sigma of kernel bandwidth
if strcmp(varargin{j}, 'sigma')
sigma = varargin{j+1};
end
% sigma of kernel bandwidth
if strcmp(varargin{j}, 'rescale_to')
rescale_to = varargin{j+1};
end
end
end

% Kernel
disp 'Computing kernel'
Options.Display = 1;
Options.Epsilon = sigma;
Options.kNN = k;
Options.kNNAutotune = ka;
Options.NNMaxDim = npca;
Options.Normalization = 'markov';
G = GraphDiffusion(data', 0, Options);
L = full(G.T);

% Diffuse
disp(['Diffusing for ' num2str(t) ' steps']);
L_t = L^t;

% Impute
disp 'Imputing'
data_imputed = L_t * data;

% Rescale
if rescale_to > 0
if ~any(data(:)<0)
disp 'Rescaling'
MR = prctile(data, rescale_to);
M = max(data);
MR(MR == 0) = M(MR == 0);
MR_new = prctile(data_imputed, rescale_to);
M_new = max(data_imputed);
MR_new(MR_new == 0) = M_new(MR_new == 0);
max_ratio = MR ./ MR_new;
data_imputed = data_imputed .* repmat(max_ratio, size(data,1), 1);
else
disp('Negative values detected (log scaled?) so no rescale is done.')
end
end

disp 'done'
87 changes: 87 additions & 0 deletions matlab/test_magic.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
%% init
% Matlab implementation of MAGIC uses Mauro Maggioni's Diffusion Geometry code
% download from: http://www.math.jhu.edu/~mauro/Code/DiffusionGeometry_01.zip
addpath(genpath('DiffusionGeometry/'));

%% load data (e.g. 10x data)
% data should be cells as rows and genes as columns
sample_dir = 'path_to_data/';
[data, gene_names, gene_ids, cells] = load_10x(sample_dir);

%% library size normalization
libsize = sum(data,2);
data = bsxfun(@rdivide, data, libsize) * median(libsize);

%% log transform -- some data requires log transform
%data = log(data + 0.1); % 0.1 is pseudocount

%% MAGIC
npca = 20; % ususally between 10 and 200
ka = 10; % can be smaller, eg 3
k = 30; % can be smaller, eg 9
t = 6; % usually between 6 and 12, smaller ka/k requitres bigger t
rescale_to = 99; % 0 (no rescale) if data is log scaled
data_imputed = run_magic(data, t, 'npca', npca, 'ka', ka, 'k', k, 'rescale_to', rescale_to);

%% plot
plot_genes = {'Cdh1', 'Vim', 'Fn1', 'Zeb1'};
ms = 20;
v = [-45 20];
% before MAGIC
x = data(:, ismember(lower(gene_names), lower(plot_genes{1})));
y = data(:, ismember(lower(gene_names), lower(plot_genes{2})));
z = data(:, ismember(lower(gene_names), lower(plot_genes{3})));
c = data(:, ismember(lower(gene_names), lower(plot_genes{4})));
figure;
subplot(2,2,1);
scatter(y, x, ms, c, 'filled');
colormap(parula);
axis tight
xlabel(plot_genes{2});
ylabel(plot_genes{1});
%h = colorbar;
%ylabel(h,plot_genes{4});
title 'Before MAGIC'

subplot(2,2,2);
scatter3(x, y, z, ms, c, 'filled');
colormap(parula);
axis tight
xlabel(plot_genes{1});
ylabel(plot_genes{2});
zlabel(plot_genes{3});
h = colorbar;
ylabel(h,plot_genes{4});
view(v);
title 'Before MAGIC'

% plot after MAGIC
x = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{1})));
y = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{2})));
z = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{3})));
c = data_imputed(:, ismember(lower(gene_names), lower(plot_genes{4})));
subplot(2,2,3);
scatter(y, x, ms, c, 'filled');
colormap(parula);
axis tight
xlabel(plot_genes{2});
ylabel(plot_genes{1});
%h = colorbar;
%ylabel(h,plot_genes{4});
title 'After MAGIC'

subplot(2,2,4);
scatter3(x, y, z, ms, c, 'filled');
colormap(parula);
axis tight
xlabel(plot_genes{1});
ylabel(plot_genes{2});
zlabel(plot_genes{3});
h = colorbar;
ylabel(h,plot_genes{4});
view(v);
title 'After MAGIC'




239 changes: 151 additions & 88 deletions notebooks/Magic_single_cell_RNAseq.ipynb

Large diffs are not rendered by default.

15 changes: 2 additions & 13 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,13 @@
import os
import sys
import shutil
from subprocess import call
from setuptools import setup
from warnings import warn

if sys.version_info.major != 3:
raise RuntimeError('Magic requires Python 3')


# install phenograph
call(['pip3', 'install', 'git+https://github.com/jacoblevine/phenograph.git'])


setup(name='magic',
version='0.0',
description='MAGIC',
Expand All @@ -29,17 +24,11 @@
'sklearn',
'networkx',
'fcsparser',
'statsmodels'],
'statsmodels',
],
scripts=['src/magic/magic_gui.py'],
)


# get location of setup.py
setup_dir = os.path.dirname(os.path.realpath(__file__))

# Copy test data
data_dir = os.path.expanduser('~/.magic/data')
if os.path.isdir(data_dir):
shutil.rmtree(data_dir)
shutil.copytree(setup_dir + '/data/', data_dir)

27 changes: 13 additions & 14 deletions src/magic/MAGIC.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,18 @@
from scipy.spatial.distance import squareform
from sklearn.neighbors import NearestNeighbors

def magic(data, kernel='gaussian', n_pca_components=20, random_pca=True,
t=6, knn=30, knn_autotune=10, epsilon=1, rescale=99, k_knn=100, perplexity=30):
def magic(data, n_pca_components=20, random_pca=True,
t=6, knn=30, knn_autotune=10, epsilon=1, rescale=99):

if kernel not in ['gaussian']:
raise RuntimeError('Invalid kernel type. Must be "gaussian".')

#library size normalization
#create data_norm

#always pass in data_norm
if n_pca_components != None:
print('doing PCA')
pca_projected_data = run_pca(data, n_components=n_pca_components, random=random_pca)
else:
pca_projected_data = data

if kernel == 'gaussian':
#run diffusion maps to get markov matrix
L = compute_markov(pca_projected_data, knn=knn, epsilon=epsilon,
distance_metric='euclidean', knn_autotune=knn_autotune)
#run diffusion maps to get markov matrix
L = compute_markov(pca_projected_data, knn=knn, epsilon=epsilon,
distance_metric='euclidean', knn_autotune=knn_autotune)

#remove tsne kernel for now
# else:
Expand All @@ -37,7 +30,6 @@ def magic(data, kernel='gaussian', n_pca_components=20, random_pca=True,
# P = _joint_probabilities(distances, perplexity, 1)
# P = squareform(P)

## QUESTION -- should this happen for gaussian kernel too??
# #markov normalize P
# L = np.divide(P, np.sum(P, axis=1))

Expand Down Expand Up @@ -75,6 +67,11 @@ def impute_fast(data, L, t, rescale_percent=0, L_t=None, tprev=None):

#rescale data
if rescale_percent != 0:
if len(np.where(data_new < 0)[0]) > 0:
print('Rescaling should not be performed on log-transformed '
'(or other negative) values. Imputed data returned unscaled.')
return data_new, L_t

M99 = np.percentile(data, rescale_percent, axis=0)
M100 = data.max(axis=0)
indices = np.where(M99 == 0)[0]
Expand All @@ -94,6 +91,7 @@ def compute_markov(data, knn=10, epsilon=1, distance_metric='euclidean', knn_aut
N = data.shape[0]

# Nearest neighbors
print('Computing distances')
nbrs = NearestNeighbors(n_neighbors=knn, metric=distance_metric).fit(data)
distances, indices = nbrs.kneighbors(data)

Expand All @@ -108,6 +106,7 @@ def compute_markov(data, knn=10, epsilon=1, distance_metric='euclidean', knn_aut
distances[j] = np.divide(distances[j], temp[lMaxTempIdxs])

# Adjacency matrix
print('Computing kernel')
rows = np.zeros(N * knn, dtype=np.int32)
cols = np.zeros(N * knn, dtype=np.int32)
dists = np.zeros(N * knn)
Expand Down
Loading

0 comments on commit b53d723

Please sign in to comment.