Skip to content

Commit

Permalink
Some fixes to "sample" handling
Browse files Browse the repository at this point in the history
  • Loading branch information
iquasere committed Nov 23, 2023
1 parent 638baa3 commit 89bc83d
Show file tree
Hide file tree
Showing 7 changed files with 17 additions and 18 deletions.
2 changes: 1 addition & 1 deletion workflow/rules/metaproteomics.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ rule metaproteomics:
[directory(folder) for folder in mp_exps[mp_exps['Sample'] == (lambda wildcards: wildcards.sample)]['Files']],
"{output}/Annotation/{sample}/UPIMAPI_results.tsv"
output:
"{output}/Metaproteomics/{sample}/spectracounts.tsv"
"{output}/Metaproteomics/{sample}_mp_spectracounts.tsv"
threads:
config["threads"]
params:
Expand Down
6 changes: 3 additions & 3 deletions workflow/rules/protein_report.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ rule protein_report:
input:
expand("{output}/Annotation/{sample}/UPIMAPI_results.tsv", output=OUTPUT, sample=set(EXPS['Sample'])),
expand("{output}/Annotation/{sample}/reCOGnizer_results.xlsx", output=OUTPUT, sample=set(EXPS["Sample"])),
expand("{output}/Quantification/{sample}/mg.readcounts.norm", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}/mt.readcounts.norm", output=OUTPUT, sample=set(mt_exps['Sample'])),
expand("{output}/Metaproteomics/{sample}/spectracounts.tsv", output=OUTPUT, sample=set(mp_exps['Sample']))
expand("{output}/Quantification/{sample}_mg_readcounts.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}_mt_readcounts.tsv", output=OUTPUT, sample=set(mt_exps['Sample'])),
expand("{output}/Metaproteomics/{sample}_mp_spectracounts.tsv", output=OUTPUT, sample=set(mp_exps['Sample']))
output:
expand("{output}/MOSCA_{sample}_Protein_Report.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
f"{OUTPUT}/MOSCA_Protein_Report.xlsx",
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/quantification.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ rule quantification:
expand("{output}/Annotation/{sample}/fgs.ffn", output=OUTPUT, sample=set(EXPS["Sample"]))
output:
expand("{output}/Quantification/{name}.readcounts", output=OUTPUT, name=set(not_mp_exps['Name'])),
expand("{output}/Quantification/{sample}/mg.readcounts.norm", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}/mt.readcounts.norm", output=OUTPUT, sample=set(mt_exps['Sample']))
expand("{output}/Quantification/{sample}_mg_readcounts.tsv", output=OUTPUT, sample=set(mg_exps['Sample'])),
expand("{output}/Quantification/{sample}_mt_readcounts.tsv", output=OUTPUT, sample=set(mt_exps['Sample']))
threads:
config["threads"]
params:
Expand Down
4 changes: 2 additions & 2 deletions workflow/scripts/main_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ def make_protein_reports(out, exps, max_lines=1000000):
mg_report = mg_report.groupby('Entry')[mg_report.columns.tolist()[1:]].sum().reset_index()
mt_report = mt_report.groupby('Entry')[mt_report.columns.tolist()[1:]].sum().reset_index()
mp_report = mp_report.groupby('Entry')[mp_report.columns.tolist()[1:]].sum().reset_index()
mg_report.to_csv(f'{out}/Quantification/mg.readcounts', sep='\t', index=False)
mt_report.to_csv(f'{out}/Quantification/mt.readcounts', sep='\t', index=False)
mg_report.to_csv(f'{out}/Quantification/mg_entries.tsv', sep='\t', index=False)
mt_report.to_csv(f'{out}/Quantification/mt_entries.tsv', sep='\t', index=False)
mp_report[mp_report[mp_report.columns.tolist()[1:]].isnull().sum(
axis=1) < len(mp_report.columns.tolist()[1:])].drop_duplicates().to_csv(
f'{out}/Metaproteomics/mp.spectracounts', sep='\t', index=False)
Expand Down
2 changes: 1 addition & 1 deletion workflow/scripts/metaproteomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ def run(self):
f'{out}/2nd_search/reports/{name}_Default_Protein_Report.txt', sep='\t', index_col=0
)[['Main Accession', '#PSMs']].rename(columns={'#PSMs': name}), how='outer', on='Main Accession')
spectracounts.groupby('Main Accession')[snakemake.params.names].sum().reset_index().fillna(value=0.0).to_csv(
f'{snakemake.params.output}/spectracounts.tsv', sep='\t', index=False)
f'{snakemake.params.output}/{snakemake.wildcards.sample}_mp_spectracounts.tsv', sep='\t', index=False)


if __name__ == '__main__':
Expand Down
6 changes: 3 additions & 3 deletions workflow/scripts/mosca_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ def perform_alignment(reference, reads, basename, threads=1):
else:
print(f'{basename}.log was found!')
run_pipe_command(
f"""samtools view -F 260 -S {basename}.sam | cut -f 3 | sort | uniq -c |
awk '{{printf("%s\\t%s\\n", $2, $1)}}'""", output=f'{basename}.readcounts')
f"""samtools view -F 260 -S {basename}.sam | cut -f 3 | sort | uniq -c | \
awk '{{printf("%s\\t%s\\n", $2, $1)}}'""", output=f'{basename}.readcounts')


def fastq2fasta(fastq, output):
Expand All @@ -156,7 +156,7 @@ def timed_message(message):

def normalize_counts_by_size(readcounts, reference):
run_pipe_command(
f"seqkit fx2tab {reference} | sort | awk '{{print $1\"\\t\"length($2)}}' | join - {readcounts} | "
f"seqkit fx2tab {reference} | sort -k 1b,1 | awk '{{print $1\"\\t\"length($2)}}' | join - {readcounts} | "
f"awk '{{print $1\"\\t\"$3/$2}}'", output=readcounts.replace('.readcounts', '.readcounts.norm'))


Expand Down
11 changes: 5 additions & 6 deletions workflow/scripts/quantification.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,18 @@ def run():
normalize_counts_by_size(
f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}.readcounts", reference)
# Read the results of alignment and add them to the readcounts result at sample level
normalized_by_gene_size = pd.read_csv(
normalized_counts = pd.read_csv(
f"{snakemake.params.output}/Quantification/{pexps.loc[i]['Name']}.readcounts.norm",
sep='\t', names=['Gene' if pexps.loc[i]['Data type'] == 'mrna' else 'Contig', pexps.loc[i]['Name']])
if pexps.loc[i]['Data type'] == 'dna':
mg_result = pd.merge(mg_result, normalized_by_gene_size, how='outer', on='Contig')
mg_result = pd.merge(mg_result, normalized_counts, how='outer', on='Contig')
else:
mt_result = pd.merge(mt_result, normalized_by_gene_size, how='outer', on='Gene')
Path(f"{snakemake.params.output}/Quantification/{sample}").mkdir(parents=True, exist_ok=True)
mt_result = pd.merge(mt_result, normalized_counts, how='outer', on='Gene')
if len(mg_result) > 0:
mg_result.to_csv(f"{snakemake.params.output}/Quantification/{sample}/mg.readcounts", sep='\t', index=False)
mg_result.to_csv(f"{snakemake.params.output}/Quantification/{sample}_mg_readcounts.tsv", sep='\t', index=False)
if len(mt_result) > 0:
mt_result.astype(int, errors='ignore').to_csv(
f"{snakemake.params.output}/Quantification/{sample}/mt.readcounts", sep='\t', index=False)
f"{snakemake.params.output}/Quantification/{sample}_mt_readcounts.tsv", sep='\t', index=False)


if __name__ == '__main__':
Expand Down

0 comments on commit 89bc83d

Please sign in to comment.