-
Notifications
You must be signed in to change notification settings - Fork 96
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #114 from KrishnaswamyLab/develop
MATLAB MAGIC2
- Loading branch information
Showing
17 changed files
with
615 additions
and
343 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,3 +20,5 @@ python/*.dll | |
python/*.egg-info | ||
python/magic/__pycache__ | ||
python/magic/*.pyc | ||
.DS_Store | ||
matlab/EMT.csv |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
function K = compute_alpha_kernel_sparse(data, varargin) | ||
% K = computer_alpha_kernel_sparse(data, varargin) | ||
% Computes sparse alpha-decay kernel | ||
% varargin: | ||
% 'npca' (default = [], no PCA) | ||
% Perform fast random PCA before computing distances | ||
% 'k' (default = 5) | ||
% k for the knn distances for the locally adaptive bandwidth | ||
% 'a' (default = 10) | ||
% The alpha exponent in the alpha-decaying kernel | ||
% 'distfun' (default = 'euclidean') | ||
% Input distance function | ||
k = 5; | ||
a = 10; | ||
npca = []; | ||
distfun = 'euclidean'; | ||
% get the input parameters | ||
if ~isempty(varargin) | ||
for j = 1:length(varargin) | ||
% k nearest neighbora | ||
if strcmp(varargin{j}, 'k') | ||
k = varargin{j+1}; | ||
end | ||
% alpha | ||
if strcmp(varargin{j}, 'a') | ||
a = varargin{j+1}; | ||
end | ||
% npca to project data | ||
if strcmp(varargin{j}, 'npca') | ||
npca = varargin{j+1}; | ||
end | ||
% distfun | ||
if strcmp(varargin{j}, 'distfun') | ||
distfun = varargin{j+1}; | ||
end | ||
end | ||
end | ||
|
||
th = 1e-4; | ||
|
||
k_knn = k * 20; | ||
|
||
bth=(-log(th))^(1/a); | ||
|
||
disp 'Computing alpha decay kernel:' | ||
|
||
N = size(data, 1); % number of cells | ||
|
||
if ~isempty(npca) | ||
disp ' PCA' | ||
data_centr = bsxfun(@minus, data, mean(data,1)); | ||
[U,~,~] = randPCA(data_centr', npca); % fast random svd | ||
%[U,~,~] = svds(data', npca); | ||
data_pc = data_centr * U; % PCA project | ||
else | ||
data_pc = data; | ||
end | ||
|
||
disp(['Number of samples = ' num2str(N)]) | ||
|
||
% Initial knn search and set the radius | ||
disp(['First iteration: k = ' num2str(k_knn)]) | ||
[idx, kdist]=knnsearch(data_pc,data_pc,'k',k_knn,'Distance',distfun); | ||
epsilon=kdist(:,k+1); | ||
|
||
% Find the points that have large enough distance to be below the kernel | ||
% threshold | ||
below_thresh=kdist(:,end)>=bth*epsilon; | ||
|
||
idx_thresh=find(below_thresh); | ||
|
||
if ~isempty(idx_thresh) | ||
K=exp(-(kdist(idx_thresh,:)./epsilon(idx_thresh)).^a); | ||
K(K<=th)=0; | ||
K=K(:); | ||
i = repmat(idx_thresh',1,size(idx,2)); | ||
i = i(:); | ||
idx_temp=idx(idx_thresh,:); | ||
j = idx_temp(:); | ||
end | ||
|
||
disp(['Number of samples below the threshold from 1st iter: ' num2str(length(idx_thresh))]) | ||
|
||
% Loop increasing k by factor of 20 until we cover 90% of the data | ||
while length(idx_thresh)<.9*N | ||
k_knn=min(20*k_knn,N); | ||
data_pc2=data_pc(~below_thresh,:); | ||
epsilon2=epsilon(~below_thresh); | ||
disp(['Next iteration: k= ' num2str(k_knn)]) | ||
[idx2, kdist2]=knnsearch(data_pc,data_pc2,'k',k_knn,'Distance',distfun); | ||
|
||
% Find the points that have large enough distance | ||
below_thresh2=kdist2(:,end)>=bth*epsilon2; | ||
idx_thresh2=find(below_thresh2); | ||
|
||
if ~isempty(idx_thresh2) | ||
K2=exp(-(kdist2(idx_thresh2,:)./epsilon2(idx_thresh2)).^a); | ||
K2(K2<=th)=0; | ||
idx_notthresh=find(~below_thresh); | ||
i2=repmat(idx_notthresh(idx_thresh2)',1,size(idx2,2)); | ||
i2=i2(:); | ||
idx_temp=idx2(idx_thresh2,:); | ||
j2=idx_temp(:); | ||
|
||
i=[i; i2]; | ||
j=[j; j2]; | ||
K=[K; K2(:)]; | ||
% Add the newly thresholded points to the old ones | ||
below_thresh(idx_notthresh(idx_thresh2))=1; | ||
idx_thresh=find(below_thresh); | ||
end | ||
disp(['Number of samples below the threshold from the next iter: ' num2str(length(idx_thresh))]) | ||
end | ||
|
||
% Radius search for the rest | ||
if length(idx_thresh)<N | ||
disp(['Using radius based search for the rest']) | ||
data_pc2=data_pc(~below_thresh,:); | ||
epsilon2=epsilon(~below_thresh); | ||
[idx2, kdist2]=rangesearch(data_pc,data_pc2,bth*max(epsilon2),'Distance',distfun); | ||
idx_notthresh=find(~below_thresh); | ||
for m=1:length(idx2) | ||
i=[i; idx_notthresh(m)*ones(length(idx2{m}),1)]; | ||
j=[j; idx2{m}']; | ||
K2=exp(-(kdist2{m}./epsilon2(m)).^a); | ||
K2(K2<=th)=0; | ||
K=[K; K2(:)]; | ||
end | ||
|
||
end | ||
|
||
% Build the kernel | ||
K = sparse(i, j, K); | ||
|
||
disp ' Symmetrize affinities' | ||
K = K + K'; | ||
disp ' Done computing kernel' | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,80 +1,71 @@ | ||
function [t_opt, r2_vec] = compute_optimal_t(data, DiffOp, varargin) | ||
% [t_opt, r2_vec] = compute_optimal_t(data, DiffOp, varargin) | ||
% data - input data | ||
% DiffOp - diffusion operator | ||
% varargin: | ||
% t_max - max t to try | ||
% n_genes - number of random genes to compute optimal t on, should be | ||
% at least 100, fewer is faster | ||
% make_plots - draw convergence as a function of t with which we | ||
% select the optimal t | ||
function [data_opt_t, t_opt] = compute_optimal_t(data, DiffOp, varargin) | ||
|
||
t_max = 32; | ||
n_genes = size(data,2); | ||
make_plots = true; | ||
make_plot = true; | ||
th = 1e-3; | ||
data_opt_t = []; | ||
|
||
if ~isempty(varargin) | ||
for j = 1:length(varargin) | ||
if strcmp(varargin{j}, 't_max') | ||
t_max = varargin{j+1}; | ||
end | ||
if strcmp(varargin{j}, 'n_genes') | ||
n_genes = varargin{j+1}; | ||
if strcmp(varargin{j}, 'make_plot') | ||
make_plot = varargin{j+1}; | ||
end | ||
if strcmp(varargin{j}, 'make_plots') | ||
make_plots = varargin{j+1}; | ||
if strcmp(varargin{j}, 'th') | ||
th = varargin{j+1}; | ||
end | ||
end | ||
end | ||
|
||
if ~issparse(DiffOp) | ||
DiffOp = sparse(DiffOp); | ||
data_prev = data; | ||
if make_plot | ||
error_vec = nan(t_max,1); | ||
for I=1:t_max | ||
disp(['t = ' num2str(I)]); | ||
data_curr = DiffOp * data_prev; | ||
error_vec(I) = procrustes(data_prev, data_curr); | ||
if error_vec(I) < th && isempty(data_opt_t) | ||
data_opt_t = data_curr; | ||
end | ||
data_prev = data_curr; | ||
end | ||
t_opt = find(error_vec < th, 1, 'first'); | ||
|
||
figure; | ||
hold all; | ||
plot(1:t_max, error_vec, '*-'); | ||
plot(t_opt, error_vec(t_opt), 'or', 'markersize', 10); | ||
xlabel 't' | ||
ylabel 'error' | ||
axis tight | ||
ylim([0 ceil(max(error_vec)*10)/10]); | ||
plot(xlim, [th th], '--k'); | ||
legend({'y' 'optimal t' ['y=' num2str(th)]}); | ||
set(gca,'xtick',1:t_max); | ||
set(gca,'ytick',0:0.1:1); | ||
else | ||
for I=1:t_max | ||
disp(['t = ' num2str(I)]); | ||
data_curr = DiffOp * data_prev; | ||
error = procrustes(data_prev, data_curr); | ||
if error < th | ||
t_opt = I; | ||
data_opt_t = data_curr; | ||
break | ||
end | ||
data_prev = data_curr; | ||
end | ||
end | ||
|
||
if n_genes > size(data,2) | ||
disp 'n_genes too large, capping n_genes at maximum possible number of genes' | ||
n_genes = size(data,2) | ||
end | ||
disp(['optimal t = ' num2str(t_opt)]); | ||
|
||
|
||
idx_genes = randsample(size(data,2), n_genes); | ||
data_imputed = data; | ||
data_imputed = data_imputed(:,idx_genes); | ||
|
||
if min(data_imputed(:)) < 0 | ||
disp 'data has negative values, shifting to positive' | ||
data_imputed = data_imputed - min(data_imputed(:)); | ||
end | ||
|
||
r2_vec = nan(t_max,1); | ||
data_prev = data_imputed; | ||
data_prev = bsxfun(@rdivide, data_prev, sum(data_prev)); | ||
disp 'computing optimal t' | ||
for I=1:t_max | ||
data_imputed = DiffOp * data_imputed; | ||
data_curr = data_imputed; | ||
data_curr = bsxfun(@rdivide, data_curr, sum(data_curr)); | ||
r2 = rsquare(data_prev(:), data_curr(:)); | ||
r2_vec(I) = 1 - r2; | ||
data_prev = data_curr; | ||
end | ||
|
||
t_opt = find(r2_vec < 0.05, 1, 'first') + 1; | ||
|
||
disp(['optimal t = ' num2str(t_opt)]); | ||
|
||
if make_plots | ||
figure; | ||
hold all; | ||
plot(1:t_max, r2_vec, '*-'); | ||
plot(t_opt, r2_vec(t_opt), 'or', 'markersize', 10); | ||
xlabel 't' | ||
ylabel '1 - R^2(data_{t},data_{t-1})' | ||
axis tight | ||
ylim([0 1]); | ||
plot(xlim, [0.05 0.05], '--k'); | ||
legend({'y' 'optimal t' 'y=0.05'}); | ||
set(gca,'xtick',1:t_max); | ||
set(gca,'ytick',0:0.1:1); | ||
end | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
function [t_opt, error_vec, data_imputed] = compute_optimal_t_fast(data, DiffOp, varargin) | ||
|
||
t_max = 20; | ||
make_plots = true; | ||
th = 0.001; | ||
|
||
if ~isempty(varargin) | ||
for j = 1:length(varargin) | ||
if strcmp(varargin{j}, 't_max') | ||
t_max = varargin{j+1}; | ||
end | ||
if strcmp(varargin{j}, 'make_plots') | ||
make_plots = varargin{j+1}; | ||
end | ||
end | ||
end | ||
|
||
if make_plots | ||
error_vec = nan(t_max,1); | ||
data_prev = data; | ||
for I=1:t_max | ||
data_curr = DiffOp * data_prev; | ||
error_vec(I) = procrustes(data_prev, data_curr); | ||
data_prev = data_curr; | ||
end | ||
t_opt = find(error_vec < th, 1, 'first'); | ||
disp(['optimal t = ' num2str(t_opt)]); | ||
end | ||
|
||
|
||
|
||
|
||
|
||
|
||
error_vec = nan(t_max,1); | ||
data_prev = data; | ||
for I=1:t_max | ||
data_curr = DiffOp * data_prev; | ||
error_vec(I) = procrustes(data_prev, data_curr); | ||
if error_vec(I) < th && ~make_plots | ||
break | ||
end | ||
data_prev = data_curr; | ||
end | ||
|
||
t_opt = find(error_vec < th, 1, 'first'); | ||
|
||
disp(['optimal t = ' num2str(t_opt)]); | ||
|
||
data_imputed = | ||
|
||
if make_plots | ||
figure; | ||
hold all; | ||
plot(1:t_max, error_vec, '*-'); | ||
plot(t_opt, error_vec(t_opt), 'or', 'markersize', 10); | ||
xlabel 't' | ||
ylabel 'error' | ||
axis tight | ||
ylim([0 ceil(max(error_vec)*10)/10]); | ||
plot(xlim, [th th], '--k'); | ||
legend({'y' 'optimal t' ['y=' num2str(th)]}); | ||
set(gca,'xtick',1:t_max); | ||
set(gca,'ytick',0:0.1:1); | ||
end | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
function [M, genes_found, gene_idx] = project_genes(genes, genes_all, pc_imputed, U) | ||
% project_genes -- obtain gene values from compressed imputed data | ||
% [M, genes_found, gene_idx] = project_genes(genes, genes_all, pc_imputed, U) computes | ||
% gene values (M) for given gene names (genes) given all gene names (genes_all), loadings | ||
% (U), and imputed principal components (pc_imputed). | ||
% | ||
% Since pc_imputed and U are both narrow matrices the imputed data can be | ||
% stored in a memory efficient way, without having to store the dense | ||
% matrix. | ||
|
||
[gene_idx,locb] = ismember(lower(genes_all), lower(genes)); | ||
[~,sidx] = sort(locb(gene_idx)); | ||
idx = find(gene_idx); | ||
idx = idx(sidx); | ||
M = pc_imputed * U(idx,:)'; % project | ||
genes_found = genes_all(idx); | ||
gene_idx = find(gene_idx); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
function [M, genes_found, gene_idx] = project_genes(genes, genes_all, pc_t, U) | ||
% project_genes -- obtain gene values from compressed imputed data | ||
% [M, genes_found, lia] = project_genes(genes, genes_all, pc_t, U) computes | ||
% gene values (M) for genes given all gene names (genes_all), loadings | ||
% (U), and imputed principal components (pc_t). | ||
|
||
[gene_idx,locb] = ismember(lower(genes_all), lower(genes)); | ||
[~,sidx] = sort(locb(gene_idx)); | ||
idx = find(gene_idx); | ||
idx = idx(sidx); | ||
M = pc_t * U(idx,:)'; | ||
genes_found = genes_all(idx); | ||
gene_idx = find(gene_idx); |
Oops, something went wrong.