diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml
new file mode 100644
index 0000000..bc57afa
--- /dev/null
+++ b/.github/workflows/nf-test.yml
@@ -0,0 +1,33 @@
+
+name: Nextflow test
+
+on: [push]
+
+jobs:
+  test:
+
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -el {0}
+
+    steps:
+      - uses: easimon/maximize-build-space@master
+        with:
+          root-reserve-mb: 512
+          swap-size-mb: 1024
+          remove-dotnet: 'true'
+          remove-android: 'true'
+          remove-haskell: 'true'
+          remove-codeql: 'true'
+          remove-docker-images: 'true'
+      - uses: actions/checkout@v3
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: 3.11
+          channels: conda-forge,bioconda
+          channel-priority: true
+      - uses: nf-core/setup-nextflow@v1
+      - name: Test Nextflow pipeline
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -c test/nextflow.config -profile gh -stub
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..14a753a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+.nextflow*
+/outputs/
+/work/
+/test/outputs
+/test/work
+/test/*.html
+~$*.xlsx
+*.html
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..63b4b68
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) [year] [fullname]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6c64aac
--- /dev/null
+++ b/README.md
@@ -0,0 +1,194 @@
+# Nanopore variant calling pipeline
+
+![GitHub Workflow Status (with branch)](https://img.shields.io/github/actions/workflow/status/scbirlab/nf-ggi/nf-test.yml)
+[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.10.0-23aa62.svg)](https://www.nextflow.io/)
+[![run with conda](https://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
+
+**scbirlab/nf-ont-call-variants** is a Nextflow pipeline to call variants in Nanopore FASTQ files from bacterial clones, relative to a wildtype control.
+
+The pipeline broadly recapitulates, where possible, the GATK best practices for germline short-variant calling.
+
+**Table of contents**
+
+- [Processing steps](#processing-steps)
+- [Requirements](#requirements)
+- [Quick start](#quick-start)
+- [Inputs](#inputs)
+- [Outputs](#outputs)
+- [Issues, problems, suggestions](#issues-problems-suggestions)
+- [Further help](#further-help)
+
+## Processing steps
+
+For each sample:
+
+1. Quality-trim reads using `cutadapt`.
+2. Map to the genome FASTA using `minimap2`.
+3. Mark duplicates with `picard MarkDuplicates`.
+4. Call variants with `Clair3`.
+
+Then merge the resulting GVCFs using GATK `CombineGVCFs`. With the combined variant calls:
+
+1. Jointly genotype with GATK `GenotypeGVCFs`.
+2. Filter variants using GATK `VariantFiltration`.
+3. Annotate variant effects using `snpEff`.
+4. Filter out variants where all samples are identical to the wildtype control, which [is assumed to be the `sample_id` which is alphabetically last](#sample-sheet).
+5. Write to an output TSV.
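+
+As a rough orientation, the per-sample steps correspond to commands along the following lines. This is an illustrative sketch only: file names are placeholders, the flags mirror the defaults `trim_qual = 10` and `min_length = 10` described under [Inputs](#inputs), and the Clair3 model path depends on your installation.
+
+```bash
+# Quality-trim 3' ends (Phred >= 10) and discard reads shorter than 10 nt
+cutadapt -q 10 -m 10 -o sample.trimmed.fastq.gz sample.fastq.gz
+
+# Map with minimap2's Nanopore preset, then sort the alignments
+minimap2 -ax map-ont genome.fna sample.trimmed.fastq.gz | samtools sort -o sample.bam
+
+# Mark duplicate reads
+picard MarkDuplicates I=sample.bam O=sample.marked.bam M=sample.dup-metrics.txt
+
+# Call variants with Clair3's Nanopore model
+run_clair3.sh --bam_fn=sample.marked.bam --ref_fn=genome.fna --threads=4 \
+    --platform=ont --model_path=/path/to/ont_model --output=sample_clair3
+```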
+
+### Other steps
+
+1. Get FASTQ quality metrics with `fastqc`.
+2. Generate alignment statistics and plots with `samtools stats`.
+3. Map to the genome FASTA using `bowtie2`, because `minimap2` logs are not compatible with `multiqc`. This way, at least basic alignment metrics are available in the report.
+4. Compile the logs of processing steps into an HTML report with `multiqc`.
+
+## Requirements
+
+### Software
+
+You need to have Nextflow and `conda` installed on your system.
+
+#### First time using Nextflow?
+
+If you're at the Crick or your shared cluster has it already installed, try:
+
+```bash
+module load Nextflow
+```
+
+Otherwise, if it's your first time using Nextflow on your system, you can install it using `conda`:
+
+```bash
+conda install -c bioconda nextflow
+```
+
+You may need to set the `NXF_HOME` environment variable. For example,
+
+```bash
+mkdir -p ~/.nextflow
+export NXF_HOME=~/.nextflow
+```
+
+To make this a permanent change, you can do something like the following:
+
+```bash
+mkdir -p ~/.nextflow
+echo "export NXF_HOME=~/.nextflow" >> ~/.bash_profile
+source ~/.bash_profile
+```
+
+### Reference genome
+
+You also need the genome FASTA for the bacterium you are sequencing. This can be obtained from [NCBI Nucleotide](https://www.ncbi.nlm.nih.gov/nuccore/):
+
+1. Search for your strain of interest, and open its main page.
+2. On the right-hand side, click `Customize view`, then `Customize` and check `Show sequence`. Finally, click `Update view`. You may have to wait a few minutes while the sequence downloads.
+3. Click `Send to: > Complete record > File > FASTA > Create file`.
+4. Save the file to a path which you provide as `--genome_fasta` [below](#inputs).
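+
+Alternatively, if you have [NCBI Entrez Direct](https://www.ncbi.nlm.nih.gov/books/NBK179288/) installed, a one-liner along these lines fetches the same FASTA from the command line (the accession shown is just an example):
+
+```bash
+# Download a genome FASTA by its Nucleotide accession
+efetch -db nuccore -id NC_008596.1 -format fasta > genome.fna
+```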
+
+## Quick start
+
+Make a [sample sheet (see below)](#sample-sheet) and, optionally, a [`nextflow.config` file](#inputs) in the directory where you want the pipeline to run. Then run Nextflow.
+
+```bash
+nextflow run scbirlab/nf-ont-call-variants
+```
+
+Each time you run the pipeline after the first time, Nextflow will use a locally cached version which will not be automatically updated. If you want to ensure that you're using the very latest version of the pipeline, use the `-latest` flag.
+
+```bash
+nextflow run scbirlab/nf-ont-call-variants -latest
+```
+
+If you want to run a particular tagged version of the pipeline, such as `v0.0.1`, you can do so using
+
+```bash
+nextflow run scbirlab/nf-ont-call-variants -r v0.0.1
+```
+
+For help, use `nextflow run scbirlab/nf-ont-call-variants --help`.
+
+The first time you run the pipeline on your system, the software dependencies in `environment.yml` will be installed. This may take several minutes.
+
+## Inputs
+
+The following parameters are required:
+
+- `sample_sheet`: path to a CSV with information about the samples and FASTQ files to be processed
+- `genome_fasta`: path to the reference genome FASTA
+- `snpeff_database`: name of the snpEff database to use for annotation. This should be derived from the same assembly as `genome_fasta`. Database names often end in the assembly name, such as `gca_000015005`, which you can check matches that of your `genome_fasta`.
+
+The following parameters have default values which can be overridden if necessary:
+
+- `inputs = "inputs"`: the folder containing your inputs
+- `trim_qual = 10`: for `cutadapt`, the minimum Phred score for trimming 3' calls
+- `min_length = 10`: for `cutadapt`, the minimum trimmed length of a read; shorter reads will be discarded
+- `gatk_image = "docker://broadinstitute/gatk:latest"`: which GATK4 image to use
+- `snpeff_url = "https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip"`: where to download snpEff from
+- `clair3_image = "docker://hkubal/clair3:latest"`: which Clair3 image to use
+
+The parameters can be provided either in the `nextflow.config` file or on the `nextflow run` command line.
+
+Here is an example of the `nextflow.config` file:
+
+```nextflow
+params {
+    sample_sheet = "/path/to/sample-sheet.csv"
+    inputs = "/path/to/inputs"
+    genome_fasta = "/path/to/1_ASM28329v1_genomic.fna"
+    snpeff_database = "Mycobacterium_smegmatis_str_mc2_155_gca_000283295"
+}
+```
+
+Alternatively, you can provide the parameters on the command line:
+
+```bash
+nextflow run scbirlab/nf-ont-call-variants \
+    --sample_sheet /path/to/sample-sheet.csv \
+    --inputs /path/to/inputs \
+    --genome_fasta /path/to/1_ASM28329v1_genomic.fna \
+    --snpeff_database Mycobacterium_smegmatis_str_mc2_155_gca_000283295
+```
+
+### Sample sheet
+
+The sample sheet is a CSV file providing information about which FASTQ files belong to which sample.
+
+The file must have a header with the column names below, and one line per sample to be processed.
+
+- `sample_id`: the unique name of the sample. The wildtype must be **named so that it is alphabetically last**
+- `reads`: path (relative to the `inputs` option above) to compressed FASTQ files derived from Nanopore sequencing
+
+Here is an example of the sample sheet:
+
+| sample_id | reads                   |
+| --------- | ----------------------- |
+| wt        | wt_raw_reads.fastq.gz   |
+| mut1      | mut1_raw_reads.fastq.gz |
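+
+Since the wildtype is taken to be whichever `sample_id` sorts last, it is worth checking which sample will be used as the control before launching. A quick sanity check for the sheet above:
+
+```bash
+# The alphabetically last sample_id is treated as the wildtype
+tail -n +2 sample-sheet.csv | cut -d, -f1 | sort | tail -n 1   # -> wt
+```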
+
+## Outputs
+
+Outputs are saved in the same directory as `sample_sheet`. They are organised under three directories:
+
+- `processed`: FASTQ files and logs resulting from alignments
+- `tables`: tables, plots, and VCF files corresponding to variant calls
+- `multiqc`: HTML report on processing steps
+
+## Issues, problems, suggestions
+
+Add to the [issue tracker](https://www.github.com/scbirlab/nf-ont-call-variants/issues).
+
+## Further help
+
+Here are the help pages of the software used by this pipeline.
+
+- [fastqc](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+- [multiqc](https://multiqc.info/)
+- [nextflow](https://www.nextflow.io/docs/latest/index.html)
+- [cutadapt](https://cutadapt.readthedocs.io/en/stable/index.html)
+- [minimap2](https://lh3.github.io/minimap2/minimap2.html)
+- [bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml)
+- [samtools](http://www.htslib.org/doc/samtools.html)
+- [Clair3](https://github.com/HKU-BAL/Clair3)
+- [GATK](https://gatk.broadinstitute.org/hc/en-us)
+- [Picard](https://broadinstitute.github.io/picard/)
+- [snpEff](https://pcingola.github.io/SnpEff/)
\ No newline at end of file
diff --git a/bin/metabolism/connect-metabolites.py b/bin/metabolism/connect-metabolites.py
new file mode 100644
index 0000000..6900bae
--- /dev/null
+++ b/bin/metabolism/connect-metabolites.py
@@ -0,0 +1,121 @@
+"""Use RDKit to clean reactants and products, and identify shared and similar metabolites between pairs."""
+
+from typing import Iterable, Optional, Union
+from functools import partial
+from itertools import product
+import sys
+
+from carabiner import print_err
+from carabiner.cast import cast
+import pandas as pd
+from rdkit import RDLogger
+from rdkit.Chem import AddHs, rdFingerprintGenerator, MolFromSmiles, MolToInchi, MolToSmiles
+from rdkit.Chem.SaltRemover import SaltRemover
+from rdkit.DataStructs import TanimotoSimilarity
+from tqdm.auto import tqdm
+
+RDLogger.DisableLog('rdApp.*')
+
+# Successive clean-up passes: strip common counter-ions, then any remaining
+# salts (keeping at least one fragment), then add explicit hydrogens
+removers = (
+    SaltRemover(defnData="[H,Na,K,Mg,Ca,Mn,Fe,F,Cl,Br,O,S]").StripMol,
+    partial(SaltRemover().StripMol, dontRemoveEverything=True),
+    AddHs,
+)
+
+def _clean_smiles_list(smiles: str) -> str:
+    mol = MolFromSmiles(smiles)
+    for remover in removers:
+        mol = remover(mol)
+    return MolToSmiles(mol)
+
+
+def clean_metabolites(table: pd.DataFrame,
+                      cols: Iterable[str]) -> pd.DataFrame:
+    return table.assign(**{col: table[col].apply(_clean_smiles_list) for col in cols})
+
+
+def _shared(a_b):
+    # True if the two metabolite lists share any chemical (compared as InChI)
+    a, b = (map(MolFromSmiles, a_or_b) for a_or_b in a_b)
+    b = set(map(MolToInchi, b))
+    return any(MolToInchi(item) in b for item in a)
+
+
+def _have_shared_chemical(df: pd.DataFrame, a: str, b: str):
+    return df[[a, b]].progress_apply(_shared, axis=1)
+
+
+mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
+
+def _fingerprinter(a):
+    return mfpgen.GetFingerprint(MolFromSmiles(a))
+
+def _tanimoto(a_b):
+    # Highest Tanimoto similarity over all cross-pairs of metabolites
+    return max(TanimotoSimilarity(*map(_fingerprinter, items)) for items in product(*a_b))
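+
+# Worked example (illustrative SMILES): with a_b = (["CCO", "O"], ["CCN"]),
+# product(*a_b) yields ("CCO", "CCN") and ("O", "CCN"), and _tanimoto returns
+# the larger of the two Morgan-fingerprint Tanimoto similarities
+# (ethanol vs ethylamine here).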
"p1"], ["r2", "p2"])) + print_err(f"By shared chemicals...") + table = table.assign(**{f"connection_{index_labels[i]}": partial(_have_shared_chemical, a=f"{a}_split", b=f"{b}_split") + for i, (a, b) in enumerate(product(cols, cols2))}) + print_err(f"By maximum similarity...") + table = table.assign(**{f"max_similarity_{index_labels[i]}": partial(_max_similarity, a=f"{a}_split", b=f"{b}_split") + for i, (a, b) in enumerate(product(cols, cols2))}) + + return table[[col for col in table if not col.endswith("_split")]] + + +def main() -> None: + + tqdm.pandas() + + print_err(f"Reading table from STDIN:") + table = pd.read_csv(sys.stdin, sep='\t').assign(_id=lambda x: x["rhea_reaction_id"].astype(str).str.cat(x["uniprot_id"], sep=":")) + print_err(table) + id_table = table[["_id", "rhea_reaction_id", "uniprot_id"]].copy() + + print_err(f"Cleaning metabolites...") + table = clean_metabolites(table, cols=("reactants", "products")) + print_err(f"Connecting metabolites...") + table = (connect_metabolites(table, cols=("reactants", "products"), id_col=["_id"]) + .merge(id_table, how='left') + .merge(id_table.rename(columns={col: f"{col}_2" for col in id_table}), how='left')) + cols_to_rename = ("reactants", "products", "uniprot_id", "rhea_reaction_iduniprot_id") + table = (table + .drop(columns=["_id", "_id_2"]) + .rename(columns=dict(zip(cols_to_rename, (f"{col}_1" for col in cols_to_rename))))) + table[reversed(table.columns)].to_csv(sys.stdout, sep='\t', index=False) + + return None + + +if __name__ == '__main__': + main() diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..29cd842 --- /dev/null +++ b/environment.yml @@ -0,0 +1,15 @@ +name: nf-ggi + +channels: + - conda-forge + - bioconda + +dependencies: + - python==3.11 + - pip + - bioconda::hhsuite + - conda-forge::jq + - conda-forge::rdkit + - pip: + - "-e /nemo/lab/johnsone/home/users/johnsoe/github/yunta" + - "-e /nemo/lab/johnsone/home/users/johnsoe/github/rf2t-micro" \ No newline at end of file diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..e0ac300 --- /dev/null +++ b/main.nf @@ -0,0 +1,616 @@ +#!/usr/bin/env nextflow + +/* +======================================================================================== + Gene-gene interaction predicting Nextflow Workflow +======================================================================================== + Github : https://github.com/scbirlab/nf-ggi + Contact : Eachan Johnson +---------------------------------------------------------------------------------------- +*/ + +nextflow.enable.dsl=2 + +/* +======================================================================================== + Help text +======================================================================================== +*/ +if ( params.help ) { + println """\ + S C B I R G E N E - G E N E I N T E R A C T I O N P R E D I C T I O N P I P E L I N E + ============================================================================================= + Nextflow pipeline to predict gene-gene interactions based on protein-protein interaction + predictions and similar metabolites. + + Usage: + nextflow run scbirlab/nf-ggi --accession + nextflow run scbirlab/nf-ggi -c + + Required parameters: + sample_sheet UniProt accession number for organism of interest. + uniclust, bfd Paths to get HHblits databases. + + Optional parameters (with defaults): + rhea_url URL to download Rhea reaction database. Default: "https://ftp.expasy.org/databases/rhea" + outputs Output folder. Default: "outputs". 
+
+    """.stripIndent()
+    exit 0
+}
+
+/*
+========================================================================================
+    Check parameters
+========================================================================================
+*/
+if ( !params.sample_sheet ) {
+    throw new Exception("!!! PARAMETER MISSING: Please provide a sample sheet including UniProt proteome ID for organism of interest.")
+}
+if ( !params.uniclust ) {
+    throw new Exception("!!! PARAMETER MISSING: Please provide a path to UniClust database.")
+}
+if ( !params.bfd ) {
+    throw new Exception("!!! PARAMETER MISSING: Please provide a path to BFD database.")
+}
+
+working_dir = params.outputs
+
+sequences = "sequences"
+ppi = "ppi"
+metabi = "metabolites"
+coex = "coexpression"
+
+log.info """\
+        S C B I R   G E N E - G E N E   I N T E R A C T I O N   P R E D I C T I O N   P I P E L I N E
+        =============================================================================================
+        test mode            : ${params.test}
+        inputs
+            sample sheet     : ${params.sample_sheet}
+            UniClust database: ${params.uniclust}
+            BFD              : ${params.bfd}
+            Rhea             : ${params.rhea_url}
+        output               : ${params.outputs}
+        """
+        .stripIndent()
+
+// dirs_to_make = [sequences, ppi, metabi]
+// println """
+// Making directories:
+// """.stripIndent()
+// dirs_to_make.each {
+//     println "${it}: "
+//     println file(it).mkdirs() ? "OK" : "Cannot create directory: ${it}"
+// }
+
+/*
+========================================================================================
+    Create Channels
+========================================================================================
+*/
+
+database_ch = Channel.of( tuple( params.uniclust, params.bfd ) )
+rhea_url_ch = Channel.of( params.rhea_url )
+sample_sheet_ch = Channel.fromPath( params.sample_sheet,
+                                    checkIfExists: true )
+
+
+/*
+========================================================================================
+    MAIN Workflow
+========================================================================================
+*/
+
+workflow {
+
+    sample_sheet_ch
+        .splitCsv( header: true )
+        .map { params.non_self ? tuple( it.organism_id, it.proteome_name, it.bait ) : tuple( it.organism_id, it.proteome_name ) }
+        .set { sample_sheet }
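+    // e.g. the row "243273,Mycoplasma genitalium" becomes
+    //      tuple( "243273", "Mycoplasma genitalium" )
+    //      (plus a bait UniProt ID as a third element when params.non_self is set)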
+    sample_sheet
+        .map { it[0] }
+        .unique()
+        .map { sample -> [ "sequences", "msa", "ppi",
+                           "coexpression", "metabolites" ].collect { "${params.outputs}/${sample}/${it}" } }
+        .subscribe { it.each { d ->
+            println "Creating output directory: ${d}"
+            file( d ).mkdirs()
+        } }
+
+    // Get FASTA sequences for each organism's proteome, then split into one
+    // FASTA record per protein.
+    sample_sheet.map { it[0] } | PULL_FASTA_SEQUENCES
+    PULL_FASTA_SEQUENCES.out
+        .splitFasta( elem: 1, record: [id: true, text: true] )
+        .set { fastas }
+
+    // If in test mode, only take 3 proteins; otherwise take all.
+    // Then parse the UniProt ID and entry name out of each FASTA header.
+    ( params.test ? fastas.take(3) : fastas )
+        .map { tuple( it[0], it[1].id.split('\\|')[1], it[1].id.split('\\|')[2], it[1].text ) }  // Organism ID, UniProt ID, entry name, FASTA text
+        .set { sample_fastas }
+
+    if ( params.non_self ) {
+        sample_sheet.map { it[2] }  // Bait UniProt ID
+            | PULL_BAIT_FASTA_SEQUENCES  // Bait UniProt ID, FASTA file
+        sample_sheet
+            .map { tuple( it[2], it[0] ) }  // Bait UniProt ID, Organism ID
+            .combine( PULL_BAIT_FASTA_SEQUENCES.out,
+                      by: 0 )  // Bait UniProt ID, Organism ID, FASTA file
+            .set { bait_fastas }
+    }
+
+    // Get reactants from the Rhea database using UniProt IDs
+    rhea_url_ch | DOWNLOAD_RHEA_DB
+    DOWNLOAD_RHEA_DB.out.set { rhea_db }
+    sample_fastas
+        .collectFile( newLine: true ) { [ "${it[0]}.txt", "${it[1]}\t${it[2]}" ] }
+        .map { tuple( it.getSimpleName(), it ) }  // Organism ID, UniProt ID list file
+        .combine( rhea_db )  // Organism ID, UniProt ID list file, Rhea SMILES, Rhea2UniProt, Rhea2UniProt_Trembl
+        | GET_ENZYME_REACTANTS  // Organism ID, reaction SMILES file
+        | CONNECT_METABOLITES
+
+    // Get STRING connections
+    sample_fastas
+        .map { it[0..1] }  // Organism ID, UniProt ID
+        .collectFile( newLine: true ) { [ "${it[0]}.txt", it[1] ] }  // UniProt ID list
+        .map { tuple( it.getSimpleName(), it ) }  // Organism ID, UniProt ID list
+        | DOWNLOAD_STRING_DB
+
+    // Get MSAs and cross them within each organism
+    sample_fastas
+        .collectFile( newLine: true ) { [ "${it[1]}.fasta", it[-1] ] }
+        .map { tuple( it.getSimpleName(), it ) }  // UniProt ID, FASTA file
+        .set { fasta_files }
+    sample_fastas
+        .map { it[1..0] }  // UniProt ID, Organism ID
+        .join( fasta_files,
+               by: 0,
+               failOnMismatch: true,
+               failOnDuplicate: true )  // UniProt ID, Organism ID, FASTA file
+        .set { pre_msa }
+
+    if ( params.non_self ) {
+        pre_msa.concat( bait_fastas ).set { pre_msa }
+    }
+
+    pre_msa
+        .map { tuple( it[1], it[0], it[2] ) }  // Organism ID, UniProt ID, FASTA file
+        .combine( database_ch )  // Organism ID, UniProt ID, FASTA file, uniclust, bfd
+        | GET_MSA  // Organism ID, UniProt ID, MSA file
+
+    if ( params.non_self ) {
+        sample_sheet
+            .map { it[2] }  // Bait UniProt ID
+            .toList()  // [Bait UniProt ID, ...]
+            .set { bait_list }
+        GET_MSA.out
+            .filter { bait_list.contains( it[1] ) }
+            .set { counter_msas }
+    } else {
+        GET_MSA.out
+            .set { counter_msas }
+    }
+
+    counter_msas
+        .map { tuple( it[0], it[2] ) }  // Organism ID, MSA file
+        .groupTuple( by: 0, sort: { a, b -> a.getSimpleName() <=> b.getSimpleName() } )  // Organism ID, [MSA file, ...]
+        .map { tuple( it[0],
+                      it[1].withIndex().collect { el, i -> i.intdiv( params.batch_size ) },  // add batch index
+                      it[1] ) }  // Organism ID, [batch_i, ...], [MSA file, ...]
+        .transpose()  // Organism ID, batch_i, MSA file
+        .groupTuple( by: [0,1], sort: { a, b -> a.getSimpleName() <=> b.getSimpleName() } )  // Organism ID, batch_i, [MSA file, ...]
+        .set { msas_by_proteome }
+    GET_MSA.out
+        .combine( msas_by_proteome, by: 0 )  // Organism ID, UniProt ID, MSA file, batch_i, [MSA file, ...]
+        .map { a -> a[0..-2] + [
+            a[-1]
+                .sort { b, c -> b.getSimpleName() <=> c.getSimpleName() }
+                .takeWhile { it.getSimpleName() != a[2].getSimpleName() }
+        ] }  // take the lower triangle per proteome
+        .filter { it[-1].size() > 0 }  // filter out trivial (size-0) elements
+        .set { msas_to_process }  // Organism ID, UniProt ID, MSA file, batch_i, [MSA file, ...]
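+    // Worked example of the lower-triangle pairing (single batch): for MSAs
+    // A < B < C (sorted by file name) in one proteome, takeWhile pairs
+    //     A with []       (then dropped by the size filter),
+    //     B with [A],
+    //     C with [A, B],
+    // so every unordered pair is analysed exactly once.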
+
+    // Calculate evolutionary coupling
+    // msas_to_process | RF2TRACK
+    msas_to_process | DIRECT_COUPLING_ANALYSIS
+
+    if ( params.test ) {
+        // msas_to_process | ALPHAFOLD2
+    }
+    // ALPHAFOLD2.out
+    //     .map { it[2] }
+    //     .collect()
+
+}
+
+process PULL_FASTA_SEQUENCES {
+
+    tag "${organism_id}"
+
+    publishDir( "${params.outputs}/${organism_id}/sequences",
+                mode: 'copy' )
+
+    input:
+    val organism_id
+
+    output:
+    tuple val( organism_id ), path( "*.fasta.gz" )
+
+    // TODO: filter only for representative proteome if no reference proteome
+    script:
+    """
+    # Look up the proteome ID for this taxonomy ID, preferring reference proteomes
+    function get_proteome_id() {
+        curl -s "https://rest.uniprot.org/proteomes/search?query=(taxonomy_id:${organism_id})&format=json" | jq '.results[] | select(.proteomeType == "'"\$1"' proteome").id'
+    }
+    QUERIES=("Reference and representative" "Representative" "Other")
+    PROTEOME_ID=
+    for q in "\${QUERIES[@]}"
+    do
+        PROTEOME_ID=\$(get_proteome_id "\$q")
+        if [ -n "\$PROTEOME_ID" ]
+        then
+            break
+        fi
+    done
+
+    wget "https://rest.uniprot.org/uniprotkb/stream?query=(proteome:\$PROTEOME_ID)&format=fasta&download=true&compressed=true" \
+        -O ${organism_id}.fasta.gz \
+        || (
+            echo "Failed to download taxonomy ID ${organism_id} with proteome ID \$PROTEOME_ID from UniProt"
+            exit 1
+        )
+    """
+
+}
+
+
+process PULL_BAIT_FASTA_SEQUENCES {
+
+    tag "${uniprot_id}"
+
+    publishDir( "${params.outputs}",
+                mode: 'copy' )
+
+    input:
+    val uniprot_id
+
+    output:
+    tuple val( uniprot_id ), path( "*.fasta" )
+
+    script:
+    """
+    wget "https://rest.uniprot.org/uniprotkb/stream?query=(accession:${uniprot_id})&format=fasta&download=true&compressed=false" \
+        -O ${uniprot_id}.fasta \
+        || (
+            echo "Failed to download Uniprot ID ${uniprot_id} from UniProt"
+            exit 1
+        )
+    """
+
+}
+
+
+process DOWNLOAD_STRING_DB {
+
+    tag "${organism_id}"
+
+    publishDir( "${params.outputs}/${organism_id}/coexpression",
+                mode: 'copy' )
+
+    input:
+    tuple val( organism_id ), path( uniprot_ids )
+
+    output:
+    tuple val( organism_id ), path( "*.string.tsv" )
+
+    script:
+    """
+    OUTFILE=${organism_id}.string0.tsv
+    printf 'string_id_1\\tstring_id_2\\tstring_cooccurence\\tstring_coexpression\\n' > \$OUTFILE
+    # Columns 1-2 are the STRING IDs; column 6 is cooccurence, column 8 coexpression
+    curl -s "https://stringdb-downloads.org/download/protein.links.full.v12.0/${organism_id}.protein.links.full.v12.0.txt.gz" \
+        | zcat \
+        | tail -n+2 \
+        | tr ' ' \$'\\t' \
+        | cut -f-2,6,8 \
+        >> \$OUTFILE \
+        || (
+            echo "Failed to download taxonomy ID ${organism_id} from STRING"
+            exit 1
+        )
+
+    # Resolve UniProt IDs, POSTing in chunks of 5 to keep queries small
+    printf 'uniprot_id_1\\tstring_id_1\\n' > ${organism_id}-string-lookup.tsv
+    cat ${uniprot_ids} | split -l 5 - uniprot-chunk_
+    for chunk in uniprot-chunk_*
+    do
+        curl -s -X POST --data "species=${organism_id}&echo_query=1&identifiers="\$(awk -v ORS="%0d" '1' \$chunk) https://string-db.org/api/tsv-no-header/get_string_ids \
+            | cut -f1,3 \
+            >> ${organism_id}-string-lookup.tsv \
+            || (
+                echo "Failed to download UniProt lookup from STRING"
+                cat \$chunk
+                exit 1
+            )
+        sleep 0.1
+    done
+
+    # Join the STRING scores to UniProt IDs for both interaction partners
+    python -c 'import pandas as pd; import sys; lookup = pd.read_csv("${organism_id}-string-lookup.tsv", sep="\\t"); renamer = dict(uniprot_id_1="uniprot_id_2", string_id_1="string_id_2"); pd.read_csv("${organism_id}.string0.tsv", sep="\\t").assign(organism_id="${organism_id}").merge(lookup).merge(lookup.rename(columns=renamer)).to_csv(sys.stdout, sep="\\t", index=False)' \
+        > ${organism_id}.string.tsv
+    """
+
+}
+
+
+process DOWNLOAD_RHEA_DB {
+
+    tag "${rhea_url}"
+
+    input:
+    val rhea_url
+
+    output:
+    tuple path( "rhea-reaction-smiles.tsv" ), path( "rhea2uniprot.tsv" ),
+          path( "rhea2uniprot_trembl.tsv.gz" )
+
+    script:
+    """
+    wget ${rhea_url}/tsv/rhea2uniprot.tsv
+    wget ${rhea_url}/tsv/rhea2uniprot_trembl.tsv.gz
+    wget ${rhea_url}/tsv/rhea-reaction-smiles.tsv
+    """
+}
+
+
+process GET_ENZYME_REACTANTS {
+
+    tag "${organism_id}"
+
+    publishDir( "${params.outputs}/${organism_id}/metabolites",
+                mode: 'copy' )
+
+    input:
+    tuple val( organism_id ), path( uniprot_ids ), path( rhea_smiles ), path( rhea2uniprot ), path( rhea2uniprot_tr )
+
+    output:
+    tuple val( organism_id ), path( "*.rxn-smiles.tsv" )
+
+    script:
+    """
+    OUTFILE=${organism_id}.rxn-smiles.tsv
+    # Join the Rhea->UniProt mappings to this organism's UniProt IDs. Rows with
+    # direction UN (undefined) refer to a master reaction: increment the ID twice
+    # to emit the two directional reaction IDs that follow it in Rhea numbering.
+    join -t\$'\\t' -1 4 -2 1 <(tail -n+2 -q "${rhea2uniprot}" <(zcat ${rhea2uniprot_tr}) | sort -k4) <(sort -k1 "${uniprot_ids}") \
+        | awk -v OFS='\\t' '\$3=="UN"{ \$2++; print \$0; \$2++; print \$0 }; \$3!="UN"' \
+        | sort -k2 \
+        | join -t\$'\\t' -1 2 -2 1 - <(sort -k1 "${rhea_smiles}") \
+        > rxn-smiles0.tsv
+
+    # Split each reaction SMILES on '>>' into reactants and products columns
+    cat <(printf 'organism_id\\trhea_reaction_id\\tuniprot_id\\tdirection\\trhea_master_reaction_id\\tentry_name\\treaction_smiles\\treactants\\tproducts\\n') \
+        <(paste rxn-smiles0.tsv \
+                <(cat rxn-smiles0.tsv | cut -f6 | awk -F '>>' -v OFS='\\t' '{ print \$1,\$2 }') \
+            | awk -v OFS='\\t' '{ print "${organism_id}",\$0 }') \
+        > \$OUTFILE
+
+    if [ \$(cat \$OUTFILE | wc -l) -gt 1 ]
+    then
+        exit 0
+    else
+        >&2 echo "No entries in reaction file: \$OUTFILE."
+        exit 1
+    fi
+    """
+}
+
+
+process CONNECT_METABOLITES {
+
+    tag "${organism_id}"
+    label "big_time"
+
+    publishDir( "${params.outputs}/${organism_id}/metabolites",
+                mode: 'copy' )
+
+    input:
+    tuple val( organism_id ), path( reaction_table )
+
+    output:
+    tuple val( organism_id ), path( "*.metabolism-connection.tsv" )
+
+    script:
+    """
+    cat ${reaction_table} | python ${projectDir}/bin/metabolism/connect-metabolites.py > ${organism_id}.metabolism-connection.tsv
+    """
+}
+
+
+process GET_MSA {
+
+    tag "${organism_id} : ${uniprot_id}"
+    label 'big_cpu'
+    errorStrategy 'retry'
+    maxRetries 2
+
+    publishDir( "${params.outputs}/${organism_id}/msa",
+                mode: 'copy' )
+
+    // Organism ID, UniProt ID, FASTA file, uniclust, bfd
+    input:
+    tuple val( organism_id ), val( uniprot_id ), file( fasta ), val( uniclust ), val( bfd )
+
+    output:
+    tuple val( organism_id ), val( uniprot_id ), path( "${organism_id}-${uniprot_id}.a3m" )
+
+    script:
+    """
+    set -x
+    # Search each sequence database in turn with HHblits
+    dbs=(${uniclust} ${bfd})
+    for d in \${dbs[@]}
+    do
+        hhblits \
+            -cpu ${task.cpus} \
+            -maxmem ${task.memory.getGiga()} \
+            -v 2 \
+            -i "${fasta}" \
+            -d \$d \
+            -e 0.001 \
+            -o /dev/null \
+            -oa3m "\$(basename \$d).a3m" \
+            -cov 60 \
+            -n 3 \
+            -realign -realign_max 10000
+    done
+
+    # Concatenate the per-database alignments: keep the query (first two lines)
+    # from the first A3M, then append the hits from all of them
+    outputs=( *.a3m )
+    cat <(head -n2 \${outputs[0]}) <(tail -n+3 -q \${outputs[@]}) \
+        > "${organism_id}-${uniprot_id}.a3m"
+    for f in \${outputs[@]}
+    do
+        if [ \$f != "${organism_id}-${uniprot_id}.a3m" ]
+        then
+            rm \$f
+        fi
+    done
+    """
+}
+
+
+process DIRECT_COUPLING_ANALYSIS {
+
+    label 'big_time'
+    tag "${uniprot_id}-batch_${batch_idx}"
+    stageInMode 'link'
+    errorStrategy 'retry'
+    maxRetries 2
+
+    publishDir( "${params.outputs}/${organism_id}/ppi",
+                mode: 'copy' )
+
+    // Organism ID, UniProt ID, MSA file, batch index, [MSA file, ...]
+    input:
+    tuple val( organism_id ), val( uniprot_id ), path( msa1, stageAs: "ref/ref.a3m" ), val( batch_idx ), path( msa2 )
+
+    output:
+    tuple val( organism_id ), val( uniprot_id ), path( "*-batch*_dca.tsv" ), emit: main
+    path "*-batch*_dca", emit: plots
+
+    script:
+    """
+    # List the batch MSAs (the reference MSA is staged under ref/, so not matched)
+    MSA_LIST=msa-list.txt
+    for item in *.a3m
+    do
+        echo "\$item" >> \$MSA_LIST
+    done
+    sppid dca-single \
+        <(echo "ref/ref.a3m") \
+        --msa2 \$MSA_LIST \
+        --list-file \
+        --apc \
+        --output "${organism_id}-${uniprot_id}-batch_${batch_idx}_dca.tsv" \
+        --plot "${organism_id}-${uniprot_id}-batch_${batch_idx}_dca"
+    """
+}
+
+process RF2TRACK {
+
+    label 'big_time'
+    tag "${uniprot_id}-batch_${batch_idx}"
+    stageInMode 'link'
+    errorStrategy 'retry'
+    maxRetries 2
+
+    publishDir( "${params.outputs}/${organism_id}/ppi",
+                mode: 'copy' )
+
+    // Organism ID, UniProt ID, MSA file, batch index, [MSA file, ...]
+    input:
+    tuple val( organism_id ), val( uniprot_id ), path( msa1, stageAs: "ref/ref.a3m" ), val( batch_idx ), path( msa2 )
+
+    output:
+    tuple val( organism_id ), val( uniprot_id ), path( "*-batch_*.tsv" ), emit: main
+    path "*-batch*_rf2t", emit: plots
+
+    script:
+    """
+    MSA_LIST=msa-list.txt
+    for item in *.a3m
+    do
+        echo "\$item" >> \$MSA_LIST
+    done
+    PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True sppid rf2t-single \
+        <(echo "ref/ref.a3m") \
+        --msa2 \$MSA_LIST \
+        --list-file \
+        --output "${organism_id}-${uniprot_id}-batch_${batch_idx}_rf2t.tsv" \
+        --plot "${organism_id}-${uniprot_id}-batch_${batch_idx}_rf2t"
+    """
+}
+
+
+process ALPHAFOLD2 {
+
+    label 'gpu'
+    tag "${uniprot_id}-batch_${batch_idx}"
+    stageInMode 'link'
+    errorStrategy 'retry'
+    maxRetries 2
+
+    publishDir( "${params.outputs}/${organism_id}/ppi",
+                mode: 'copy' )
+
+    input:
+    tuple val( organism_id ), val( uniprot_id ), path( msa1, stageAs: "ref/ref.a3m" ), val( batch_idx ), path( msa2 )
+
+    output:
+    tuple val( organism_id ), val( uniprot_id ), path( "*-batch_*_af2" ), emit: main
+    path "*-batch_*_af2/*.pdb", emit: pdb
+
+    script:
+    """
+    MSA_LIST=msa-list.txt
+    for item in *.a3m
+    do
+        echo "\$item" >> \$MSA_LIST
+    done
+    export CUDNN_PATH=\$(dirname \$(python -c "import nvidia.cudnn; print(nvidia.cudnn.__file__)"))
+    export LD_LIBRARY_PATH=\${CUDNN_PATH}/lib
+    >&2 echo "CuDNN path at" \$CUDNN_PATH "contains:"  # should exist and give a good path
+    >&2 echo \$(ls \$CUDNN_PATH)  # should contain a lib subdir with libcudnn .so files
+    >&2 echo "LD library path at" \$LD_LIBRARY_PATH  # should exist and contain CUDNN_PATH
+    >&2 echo "\$(nvcc --version)"
+    >&2 python3 -c "import tensorflow as tf; print(f'Available devices:\\n{tf.config.list_physical_devices()}')"
+    XLA_PYTHON_CLIENT_MEM_FRACTION=.9 sppid af2-single \
+        <(echo "ref/ref.a3m") \
+        --msa2 \$MSA_LIST \
+        --list-file \
+        --output "${organism_id}-${uniprot_id}-batch_${batch_idx}_af2" \
+        --plot "${organism_id}-${uniprot_id}-batch_${batch_idx}_af2"
+    """
+}
+
+
+/*
+========================================================================================
+    Workflow Event Handler
+========================================================================================
+*/
+
+workflow.onComplete {
+
+    println ( workflow.success ? """
""" + Pipeline execution summary + --------------------------- + Completed at: ${workflow.complete} + Duration : ${workflow.duration} + Success : ${workflow.success} + workDir : ${workflow.workDir} + exit status : ${workflow.exitStatus} + """ : """ + Failed: ${workflow.errorReport} + exit status : ${workflow.exitStatus} + """ + ) +} + +/* +======================================================================================== + THE END +======================================================================================== +*/ \ No newline at end of file diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..d876615 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,159 @@ +manifest { + + author = "Eachan Johnson" + homePage = "https://github.com/scbirlab/nf-ppi" + description = "Predict gene-gene interactions based on protein-protein interaction predictions and similar metabolites." + defaultBranch = "v0.0.1" + nextflowVersion = '!>=24.0.0' + version = "0.0.1" + doi = '' + +} + +params { + + /* Required */ + sample_sheet = null + bfd = null + uniclust = null + + /* Optional */ + test = false + non_self = false + help = null + rhea_url = "https://ftp.expasy.org/databases/rhea" + outputs = "${launchDir}/outputs" + batch_size = 100 + +} + +process.conda = "${projectDir}/environment.yml" +conda { + createTimeout = '2h' + enabled = true +} + +profiles { + + standard { + + process { + executor = 'slurm' + array = 1000 + + withLabel: big_cpu { + time = '3h' + cpus = 16 + memory = 32.GB + } + + withLabel: big_time { + time = '4d' + cpus = 1 + memory = 64.GB + } + + withLabel: some_mem { + memory = 16.GB + } + + withLabel: med_mem { + memory = 64.GB + } + + withLabel: big_mem { + memory = 128.GB + } + + withLabel: gpu_single { + queue = 'gpu' + time = '7d' + module = 'cuDNN/8.9.2.26-CUDA-12.1.1' + cpus = 4 + clusterOptions = '--gres=gpu:1' + memory = 128.GB + } + + withLabel: gpu { + queue = 'gpu' + time = '4h' + module = 'cuDNN/8.9.2.26-CUDA-12.1.1' + cpus = 2 + clusterOptions = '--gres=gpu:2' + memory = 128.GB + } + + } + + dag { + enabled = true + overwrite = true + } + + notification { + enabled = true + to = "$USER@crick.ac.uk" + } + + } + + local { + + process { + executor = 'local' + + withLabel: big_cpu { + time = '3h' + cpus = 16 + memory = 32.GB + } + + withLabel: some_mem { + memory = 8.GB + } + + withLabel: med_mem { + memory = 16.GB + } + + withLabel: big_mem { + memory = 32.GB + } + + withLabel: gpu { + memory = 32.GB + } + + } + + } + + gh { + + conda.useMamba = false + + process { + executor = 'local' + + withLabel: big_cpu { + cpus = 1 + memory = 12.GB + } + + withLabel: some_mem { + memory = 12.GB + } + + withLabel: med_mem { + memory = 12.GB + } + + withLabel: gpu { + memory = 12.GB + } + + } + + } + +} \ No newline at end of file diff --git a/test/inputs/sample-sheet.csv b/test/inputs/sample-sheet.csv new file mode 100644 index 0000000..fca7561 --- /dev/null +++ b/test/inputs/sample-sheet.csv @@ -0,0 +1,2 @@ +organism_id,proteome_name +243273,"Mycoplasma genitalium" diff --git a/test/nextflow.config b/test/nextflow.config new file mode 100644 index 0000000..a7b0a18 --- /dev/null +++ b/test/nextflow.config @@ -0,0 +1,6 @@ +params { + sample_sheet = "inputs/sample-sheet.csv" + test = true + uniclust = "/nemo/lab/johnsone/reference/hhdb/uniclust30/uniclust30_2018_08" + bfd = "/nemo/lab/johnsone/reference/hhdb/bfd_metaclust/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" +} \ No newline at end of file