Skip to content

Commit

Permalink
Some fixes to "sample" handling
Browse files Browse the repository at this point in the history
  • Loading branch information
iquasere committed Nov 23, 2023
1 parent 638baa3 commit 89bc83d
Show file tree
Hide file tree
Showing 7 changed files with 17 additions and 18 deletions.
2 changes: 1 addition & 1 deletion workflow/rules/metaproteomics.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ rule metaproteomics:
[directory(folder) for folder in mp_exps[mp_exps['Sample'] == (lambda wildcards: wildcards.sample)]['Files']],
"{output}/Annotation/{sample}/UPIMAPI_results.tsv"
output:
"{output}/Metaproteomics/{sample}/spectracounts.tsv"
"{output}/Metaproteomics/{sample}_mp_spectracounts.tsv"
threads:
config["threads"]
params:
Expand Down
6 changes: 3 additions & 3 deletions workflow/rules/protein_report.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ rule protein_report:
input:
expand("{output}/Annotation/{sample}/UPIMAPI_results.tsv", output=OUTPUT, sample=set(EXPS['Sample'])),
expand("{output}/Annotation/{sample}/reCOGnizer_results.xlsx", output=OUTPUT, sample=set(EXPS["Sample"])),
expand("{output}/Quantification/{sample}/mg.readcounts.norm", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}/mt.readcounts.norm", output=OUTPUT, sample=set(mt_exps['Sample'])),
expand("{output}/Metaproteomics/{sample}/spectracounts.tsv", output=OUTPUT, sample=set(mp_exps['Sample']))
expand("{output}/Quantification/{sample}_mg_readcounts.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}_mt_readcounts.tsv", output=OUTPUT, sample=set(mt_exps['Sample'])),
expand("{output}/Metaproteomics/{sample}_mp_spectracounts.tsv", output=OUTPUT, sample=set(mp_exps['Sample']))
output:
expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
f"{OUTPUT}/MOSCA_Protein_Report.xlsx",
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/quantification.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ rule quantification:
expand("{output}/Annotation/{sample}/fgs.ffn", output=OUTPUT, sample=set(EXPS["Sample"]))
output:
expand("{output}/Quantification/{name}.readcounts", output=OUTPUT, name=set(not_mp_exps['Name'])),
expand("{output}/Quantification/{sample}/mg.readcounts.norm", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}/mt.readcounts.norm", output=OUTPUT, sample=set(mt_exps['Sample']))
expand("{output}/Quantification/{sample}_mg_readcounts.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}_mt_readcounts.tsv", output=OUTPUT, sample=set(mt_exps['Sample']))
threads:
config["threads"]
params:
Expand Down
4 changes: 2 additions & 2 deletions workflow/scripts/main_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ def make_protein_reports(out, exps, max_lines=1000000):
mg_report = mg_report.groupby('Entry')[mg_report.columns.tolist()[1:]].sum().reset_index()
mt_report = mt_report.groupby('Entry')[mt_report.columns.tolist()[1:]].sum().reset_index()
mp_report = mp_report.groupby('Entry')[mp_report.columns.tolist()[1:]].sum().reset_index()
mg_report.to_csv(f'{out}/Quantification/mg.readcounts', sep='\t', index=False)
mt_report.to_csv(f'{out}/Quantification/mt.readcounts', sep='\t', index=False)
mg_report.to_csv(f'{out}/Quantification/mg_entries.tsv', sep='\t', index=False)
mt_report.to_csv(f'{out}/Quantification/mt_entries.tsv', sep='\t', index=False)
mp_report[mp_report[mp_report.columns.tolist()[1:]].isnull().sum(
axis=1) < len(mp_report.columns.tolist()[1:])].drop_duplicates().to_csv(
f'{out}/Metaproteomics/mp.spectracounts', sep='\t', index=False)
Expand Down
2 changes: 1 addition & 1 deletion workflow/scripts/metaproteomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ def run(self):
f'{out}/2nd_search/reports/{name}_Default_Protein_Report.txt', sep='\t', index_col=0
)[['Main Accession', '#PSMs']].rename(columns={'#PSMs': name}), how='outer', on='Main Accession')
spectracounts.groupby('Main Accession')[snakemake.params.names].sum().reset_index().fillna(value=0.0).to_csv(
f'{snakemake.params.output}/spectracounts.tsv', sep='\t', index=False)
f'{snakemake.params.output}/{snakemake.wildcards.sample}_mp_spectracounts.tsv', sep='\t', index=False)


if __name__ == '__main__':
Expand Down
6 changes: 3 additions & 3 deletions workflow/scripts/mosca_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ def perform_alignment(reference, reads, basename, threads=1):
else:
print(f'{basename}.log was found!')
run_pipe_command(
f"""samtools view -F 260 -S {basename}.sam | cut -f 3 | sort | uniq -c |
awk '{{printf("%s\\t%s\\n", $2, $1)}}'""", output=f'{basename}.readcounts')
f"""samtools view -F 260 -S {basename}.sam | cut -f 3 | sort | uniq -c | \
awk '{{printf("%s\\t%s\\n", $2, $1)}}'""", output=f'{basename}.readcounts')


def fastq2fasta(fastq, output):
Expand All @@ -156,7 +156,7 @@ def timed_message(message):

def normalize_counts_by_size(readcounts, reference):
run_pipe_command(
f"seqkit fx2tab {reference} | sort | awk '{{print $1\"\\t\"length($2)}}' | join - {readcounts} | "
f"seqkit fx2tab {reference} | sort -k 1b,1 | awk '{{print $1\"\\t\"length($2)}}' | join - {readcounts} | "
f"awk '{{print $1\"\\t\"$3/$2}}'", output=readcounts.replace('.readcounts', '.readcounts.norm'))


Expand Down
11 changes: 5 additions & 6 deletions workflow/scripts/quantification.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,18 @@ def run():
normalize_counts_by_size(
f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}.readcounts", reference)
# Read the results of alignment and add them to the readcounts result at sample level
normalized_by_gene_size = pd.read_csv(
normalized_counts = pd.read_csv(
f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}.readcounts.norm",
sep='\t', names=['Gene' if pexps.loc[i]['Data type'] == 'mrna' else 'Contig', pexps.loc[i]['Name']])
if pexps.loc[i]['Data type'] == 'dna':
mg_result = pd.merge(mg_result, normalized_by_gene_size, how='outer', on='Contig')
mg_result = pd.merge(mg_result, normalized_counts, how='outer', on='Contig')
else:
mt_result = pd.merge(mt_result, normalized_by_gene_size, how='outer', on='Gene')
Path(f"{snakemake.params.output}/Quantification/{sample}").mkdir(parents=True, exist_ok=True)
mt_result = pd.merge(mt_result, normalized_counts, how='outer', on='Gene')
if len(mg_result) > 0:
mg_result.to_csv(f"{snakemake.params.output}/Quantification/{sample}/mg.readcounts", sep='\t', index=False)
mg_result.to_csv(f"{snakemake.params.output}/Quantification/{sample}_mg_readcounts.tsv", sep='\t', index=False)
if len(mt_result) > 0:
mt_result.astype(int, errors='ignore').to_csv(
f"{snakemake.params.output}/Quantification/{sample}/mt.readcounts", sep='\t', index=False)
f"{snakemake.params.output}/Quantification/{sample}_mt_readcounts.tsv", sep='\t', index=False)


if __name__ == '__main__':
Expand Down

0 comments on commit 89bc83d

Please sign in to comment.