
Commit

New strategy for MP DB
The metaproteogenomics database is now generated from the sequences of the taxa identified in MG
Integrated dereplication with dRep
iquasere committed May 29, 2024
1 parent dadfa2d commit cf464a0
Showing 4 changed files with 52 additions and 24 deletions.
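In outline, the new database strategy works in two passes, summarised by the sketch below (the function names are hypothetical stand-ins, not the pipeline's API; the steps paraphrase the metaproteomics.py changes further down):

    # Bird's-eye sketch of the taxonomy-driven database strategy; names are hypothetical.
    def build_first_search_db(mg_orfs, upimapi_res, tax_lvl="genus"):
        """MG ORFs plus UniProt reference proteomes for the taxa identified in MG at the chosen rank."""
        ...

    def build_second_search_db(upimapi_res, first_search_psms, tax_lvl="genus"):
        """Keep only proteins whose taxon, at the chosen rank, had at least one first-search identification."""
        ...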
18 changes: 18 additions & 0 deletions workflow/rules/binning.smk
@@ -15,3 +15,21 @@ rule binning:
        "../envs/binning.yaml"
    script:
        "../scripts/binning.py"

rule dereplication:
    input:
        expand("{output}/Binning/{{sample}}/checkm.tsv", output=OUTPUT)
    output:
        expand("{output}/Binning/{{sample}}/dereplicated/checkm.tsv", output=OUTPUT)
    threads:
        config["threads"]
    params:
        sample = lambda wildcards: wildcards.sample,
        output_dir = lambda wildcards: f'{OUTPUT}/Binning/{wildcards.sample}/dereplicated',
    conda:
        "../envs/binning.yaml"
    shell:
        """
        dRep dereplicate {params.output_dir}/dereplicated -g {params.output_dir}/*.fasta
        checkm lineage_wf -x fasta -r --ali --nt -t {threads} {params.output_dir}/dereplicated --reduced_tree {params.output_dir}/dereplicated --tab_table --file {params.output_dir}/dereplicated/checkm.tsv
        """
4 changes: 3 additions & 1 deletion workflow/rules/metaproteomics.smk
@@ -8,6 +8,7 @@ rule metaproteomics:
        config["threads"]
    params:
        output = lambda wildcards: f'{OUTPUT}/Metaproteomics/{wildcards.sample}',
        sample = lambda wildcards: wildcards.sample,
        mg_db = lambda wildcards: f'{OUTPUT}/Annotation/{wildcards.sample}/fgs.faa',
        up_res = lambda wildcards: f'{OUTPUT}/Annotation/{wildcards.sample}/UPIMAPI_results.tsv',
        spectra_folders = lambda wildcards: mp_exps[mp_exps['Sample'] == wildcards.sample]['Files'].tolist(),
@@ -17,7 +18,8 @@ rule metaproteomics:
        max_memory = config["max_memory"],
        resources = config["resources_directory"],
        add_reference_proteomes = config["metaproteomics_add_reference_proteomes"],
        inside_container = config["inside_container"]
        inside_container = config["inside_container"],
        taxa_lvl = config["mp_database_taxa_level"],
    conda:
        "../envs/metaproteomics.yaml"
    script:
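With the new taxa_lvl parameter, the workflow configuration is expected to carry an mp_database_taxa_level entry (for example genus, which is also the default the script falls back to); the script upper-cases this value to pick the matching UPIMAPI taxid column when collecting reference proteomes. A minimal sketch of that column lookup, using a made-up table in place of UPIMAPI_results.tsv:

    # Sketch of how the configured rank selects the UPIMAPI taxid column; the column naming
    # follows add_reference_proteomes below, the numbers are arbitrary example taxids.
    import pandas as pd

    tax_lvl = "genus"  # value read from config["mp_database_taxa_level"]
    up_res = pd.DataFrame({"Taxonomic lineage IDs (GENUS)": [561.0, 561.0, 1386.0, None]})
    taxids = up_res[f"Taxonomic lineage IDs ({tax_lvl.upper()})"].dropna().unique().astype(int).tolist()
    print(taxids)      # [561, 1386] - one reference proteome would be fetched from UniProt per taxid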
5 changes: 3 additions & 2 deletions workflow/scripts/binning.py
@@ -13,6 +13,7 @@
import shutil
import pathlib
import snakemake
import os


class Binner:
@@ -96,8 +97,8 @@ def iterative_binning(self, contigs, output, threads=8, reads=None, reads2=None,
else:
shutil.rmtree(f'{output}/{prob_threshold}')
print(f'Removed files for probability threshold: {prob_threshold} %')

shutil.copyfile(f'{output}/{best_bin}/checkm.tsv', f'{output}/checkm.tsv')
for file_name in os.listdir(f'{output}/{best_bin}'):
shutil.move(os.path.join(f'{output}/{best_bin}', file_name), output)
print(f'Best probability threshold: {best_bin} %')
with open(f'{output}/result.txt', 'w') as f:
f.write(f'Best probability threshold: {best_bin}')
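This change replaces copying only checkm.tsv with moving every file from the best-threshold folder up into the sample's binning directory, so downstream steps such as the new dereplication rule can find the bin FASTAs there. One caveat worth keeping in mind (a note about shutil itself, not something the commit handles): shutil.move into an existing directory keeps the file's original name and raises shutil.Error if that name is already taken in the destination, e.g. on re-runs. A small sketch with hypothetical paths:

    # Sketch of the move-everything-up pattern; shutil.move into a directory keeps the file name
    # and fails if a file with that name already exists there, so stale copies are removed first.
    import os
    import shutil

    best_dir = "output/Binning/sample1/60"   # hypothetical best-probability-threshold folder
    dest_dir = "output/Binning/sample1"
    for file_name in os.listdir(best_dir):
        target = os.path.join(dest_dir, file_name)
        if os.path.exists(target):
            os.remove(target)                # avoids shutil.Error when the rule is re-run
        shutil.move(os.path.join(best_dir, file_name), dest_dir)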
49 changes: 28 additions & 21 deletions workflow/scripts/metaproteomics.py
@@ -16,6 +16,7 @@
import requests
from time import sleep
from mosca_tools import run_command, run_pipe_command, multiprocess_fun
import snakemake


class MetaproteomicsAnalyser:
@@ -37,9 +38,9 @@ def get_proteome_uniprot(self, taxid, max_tries=3):
sleep(10)
return res.content.decode('utf8')

def add_reference_proteomes(self, upimapi_res, output):
def add_reference_proteomes(self, upimapi_res, output, tax_lvl='genus'):
taxids = pd.read_csv(upimapi_res, sep='\t', low_memory=False)[
'Taxonomic lineage IDs (SPECIES)'].dropna().unique().astype(int).tolist()
f'Taxonomic lineage IDs ({tax_lvl.upper()})'].dropna().unique().astype(int).tolist()
with open(output, 'w') as f:
for taxid in tqdm(taxids, desc=f'Retrieving reference proteomes for {len(taxids)} taxa from UniProt'):
try:
@@ -49,20 +50,22 @@ def database_generation(

def database_generation(
self, mg_orfs, output, upimapi_res, contaminants_database=None, protease='Trypsin', threads=1,
add_reference_proteomes=True):
add_reference_proteomes=True, tax_lvl='genus'):
"""
Build database from MG analysis
:param mg_orfs:
:param output:
:param mpa_result:
:param upimapi_res:
:param contaminants_database:
:param protease:
:param threads:
:param add_reference_proteomes:
:param tax_lvl:
"""
print(f'Generating new database in {output}')
# Get reference proteomes for the various taxa
if add_reference_proteomes:
self.add_reference_proteomes(upimapi_res, f'{output}/ref_proteomes.fasta')
self.add_reference_proteomes(upimapi_res, f'{output}/ref_proteomes.fasta', tax_lvl=tax_lvl)
# Add protease
if protease == 'Trypsin':
if not os.path.isfile(f'{output}/P00761.fasta'):
@@ -228,16 +231,13 @@ def browse_identification_results(
except:
print('Producing Peptide-Shaker result failed! Maybe no identifications were obtained?')

def generate_reports(self, peptideshaker_output, reports_folder, reports=["10"], max_memory=40960):
def generate_reports(self, peptideshaker_output, reports_folder, reports=("10",), max_memory=40960):
"""
input:
peptideshaker_output: peptideshaker output filename
reports_folder: folder to where output reports
reports_list: list of INTEGERS from 0 to 11 corresponding to the reports
to output
output:
if it doesn't exist, "reports_folder" will be created
reports will be outputed to "reports_folder"
Generates reports from output of PeptideShaker
:param peptideshaker_output: peptideshaker output filename
:param reports_folder: folder where the reports will be written
:param reports: list of INTEGERS from 0 to 11 corresponding to the reports to output
:param max_memory: maximum memory to use
"""
print(f'Created {reports_folder}')
Path(reports_folder).mkdir(parents=True, exist_ok=True) # creates folder for reports
@@ -261,7 +261,7 @@ def join_ps_reports(self, files, local_fdr=None, validation=False):
return result

def compomics_run(
self, database, output, spectra_folders, name, params, threads=1, max_memory=4096, reports=['10']):
self, database, output, spectra_folders, name, params, threads=1, max_memory=4096, reports=('10',)):
"""
Run compomics workflow on the given spectra folders
:param params:
@@ -305,7 +305,8 @@ def run(self):
self.database_generation(
snakemake.params.mg_db, snakemake.params.output, snakemake.params.up_res,
contaminants_database=snakemake.params.contaminants_database,
protease=snakemake.params.protease, add_reference_proteomes=snakemake.params.add_reference_proteomes)
protease=snakemake.params.protease, add_reference_proteomes=snakemake.params.add_reference_proteomes,
tax_lvl=snakemake.params.taxa_lvl)
self.create_decoy_database(f'{snakemake.params.output}/1st_search_database.fasta')
self.split_database(
f'{snakemake.params.output}/1st_search_database_concatenated_target_decoy.fasta', n_proteins=5000000)
@@ -314,8 +315,11 @@
except:
print('An illegal reflective access operation has occurred. But MOSCA can handle it.')

# 2nd database construction
proteins_for_second_search = []
# 2nd database construction - peak-pick spectra and get proteins from taxa with at least one identification
up_res = pd.read_csv(snakemake.params.up_res, sep='\t', low_memory=False)

proteins_identified = []
tax_lvl = snakemake.params.taxa_lvl
for i in range(len(snakemake.params.names)):
out = f'{snakemake.params.output}/{snakemake.params.names[i]}'
for foldername in ['spectra', '2nd_search']:
@@ -329,16 +333,19 @@
self.compomics_run(
database, f'{out}_{j}/1st_search', f'{out}/spectra', snakemake.params.names[i],
f'{snakemake.params.output}/1st_params.par', threads=snakemake.threads,
max_memory=snakemake.params.max_memory, reports=['4'])
max_memory=snakemake.params.max_memory, reports=('4',))
df = pd.read_csv(
f'{out}_{j}/1st_search/reports/{snakemake.params.names[i]}_'
f'Default_PSM_Report_with_non-validated_matches.txt',
sep='\t', index_col=0)
for protein_group in df['Protein(s)'].str.split(','):
proteins_for_second_search += protein_group
proteins_identified += protein_group
j += 1
identified_taxa = up_res[up_res[f'Taxonomic lineage ({tax_lvl})'].notnull() & up_res['qseqid'].isin(
proteins_identified)][f'Taxonomic lineage ({tax_lvl})'].unique()
with open(f'{snakemake.params.output}/2nd_search_proteins.txt', 'w') as f:
f.write('\n'.join(set(proteins_for_second_search)))
# write proteins corresponding to identified taxa
f.write('\n'.join(set(up_res[up_res[f'Taxonomic lineage ({tax_lvl})'].isin(identified_taxa)]['qseqid'])))
run_command(
f'seqkit grep -f {snakemake.params.output}/2nd_search_proteins.txt -o '
f'{snakemake.params.output}/2nd_search_database.fasta {snakemake.params.output}/1st_search_database.fasta')
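Taken together, the changes to run() replace the old behaviour of writing every accession matched in the first search to 2nd_search_proteins.txt with a taxonomy-driven selection: accessions from the first-search PSM reports are mapped to their taxa at the configured rank, and every protein assigned to one of those taxa is kept for the second-search database, which seqkit grep then pulls from the first-search FASTA. Below is a self-contained sketch of that selection step; the column names mirror the UPIMAPI table used in this commit, while the inline toy table and accession list are made-up stand-ins.

    # Toy, self-contained version of the taxonomy-driven 2nd-search selection; the 'qseqid' and
    # 'Taxonomic lineage (<rank>)' columns mirror UPIMAPI output, the data are made up.
    import pandas as pd

    tax_lvl = "genus"                      # value of the new mp_database_taxa_level config key
    lineage_col = f"Taxonomic lineage ({tax_lvl})"
    up_res = pd.DataFrame({
        "qseqid": ["prot_1", "prot_2", "prot_3"],
        lineage_col: ["genus_A", "genus_A", "genus_B"],
    })
    proteins_identified = ["prot_1"]       # accessions recovered from the 1st-search PSM reports

    # taxa with at least one identified protein ...
    identified_taxa = up_res[
        up_res[lineage_col].notnull() & up_res["qseqid"].isin(proteins_identified)
    ][lineage_col].unique()
    # ... and every protein assigned to one of those taxa goes into the 2nd-search list
    selected = set(up_res[up_res[lineage_col].isin(identified_taxa)]["qseqid"])
    print(sorted(selected))                # ['prot_1', 'prot_2'] - prot_3's genus had no identification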
