diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml
new file mode 100644
index 0000000..bc57afa
--- /dev/null
+++ b/.github/workflows/nf-test.yml
@@ -0,0 +1,33 @@
+
+name: Nextflow test
+
+on: [push]
+
+jobs:
+  test:
+
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -el {0}
+
+    steps:
+      - uses: easimon/maximize-build-space@master
+        with:
+          root-reserve-mb: 512
+          swap-size-mb: 1024
+          remove-dotnet: 'true'
+          remove-android: 'true'
+          remove-haskell: 'true'
+          remove-codeql: 'true'
+          remove-docker-images: 'true'
+      - uses: actions/checkout@v3
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          python-version: 3.11
+          channels: conda-forge,bioconda
+          channel-priority: true
+      - uses: nf-core/setup-nextflow@v1
+      - name: Test Nextflow pipeline
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -c test/nextflow.config -profile gh -stub
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..14a753a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+.nextflow*
+/outputs/
+/work/
+/test/outputs
+/test/work
+/test/*.html
+~$*.xlsx
+*.html
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..63b4b68
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) [year] [fullname]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6c64aac
--- /dev/null
+++ b/README.md
@@ -0,0 +1,194 @@
+# Nanopore variant calling pipeline
+
+![GitHub Workflow Status (with branch)](https://img.shields.io/github/actions/workflow/status/scbirlab/nf-ggi/nf-test.yml)
+[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.10.0-23aa62.svg)](https://www.nextflow.io/)
+[![run with conda](https://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
+
+**scbirlab/nf-ont-call-variants** is a Nextflow pipeline to call variants in Nanopore FASTQ files from bacterial clones, relative to a wildtype control.
+
+The pipeline broadly recapitulates, where possible, the GATK best practices for germline short-variant calling.
+
+**Table of contents**
+
+- [Processing steps](#processing-steps)
+- [Requirements](#requirements)
+- [Quick start](#quick-start)
+- [Inputs](#inputs)
+- [Outputs](#outputs)
+- [Issues, problems, suggestions](#issues-problems-suggestions)
+- [Further help](#further-help)
+
+## Processing steps
+
+For each sample:
+
+1. Quality-trim reads using `cutadapt`.
+2. Map to the genome FASTA using `minimap2`.
+3. Mark duplicates with `picard MarkDuplicates`.
+4. Call variants with `Clair3`.
+
+Then merge the resulting GVCFs using GATK `CombineGVCFs`. With the combined variant calls:
+
+1. Jointly genotype with GATK `GenotypeGVCFs`.
+2. Filter variants using GATK `VariantFiltration`.
+3. Annotate variant effects using `snpEff`.
+4. Filter out variants where all samples are identical to the wildtype control, which [is assumed to be the `sample_id` which is alphabetically last](#sample-sheet).
+5. Write to an output TSV.
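+
+As a rough orientation, the per-sample steps correspond to commands along the following lines. This is an illustrative sketch only: file names are placeholders, the flags mirror the defaults `trim_qual = 10` and `min_length = 10` described under [Inputs](#inputs), and the Clair3 model path depends on your installation.
+
+```bash
+# Quality-trim 3' ends (Phred >= 10) and discard reads shorter than 10 nt
+cutadapt -q 10 -m 10 -o sample.trimmed.fastq.gz sample.fastq.gz
+
+# Map with minimap2's Nanopore preset, then sort the alignments
+minimap2 -ax map-ont genome.fna sample.trimmed.fastq.gz | samtools sort -o sample.bam
+
+# Mark duplicate reads
+picard MarkDuplicates I=sample.bam O=sample.marked.bam M=sample.dup-metrics.txt
+
+# Call variants with Clair3's Nanopore model
+run_clair3.sh --bam_fn=sample.marked.bam --ref_fn=genome.fna --threads=4 \
+    --platform=ont --model_path=/path/to/ont_model --output=sample_clair3
+```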
+
+### Other steps
+
+1. Get FASTQ quality metrics with `fastqc`.
+2. Generate alignment statistics and plots with `samtools stats`.
+3. Map to the genome FASTA using `bowtie2`, because `minimap2` logs are not compatible with `multiqc`. This way, at least basic alignment metrics are available in the report.
+4. Compile the logs of processing steps into an HTML report with `multiqc`.
+
+## Requirements
+
+### Software
+
+You need to have Nextflow and `conda` installed on your system.
+
+#### First time using Nextflow?
+
+If you're at the Crick or your shared cluster has it already installed, try:
+
+```bash
+module load Nextflow
+```
+
+Otherwise, if it's your first time using Nextflow on your system, you can install it using `conda`:
+
+```bash
+conda install -c bioconda nextflow
+```
+
+You may need to set the `NXF_HOME` environment variable. For example,
+
+```bash
+mkdir -p ~/.nextflow
+export NXF_HOME=~/.nextflow
+```
+
+To make this a permanent change, you can do something like the following:
+
+```bash
+mkdir -p ~/.nextflow
+echo "export NXF_HOME=~/.nextflow" >> ~/.bash_profile
+source ~/.bash_profile
+```
+
+### Reference genome
+
+You also need the genome FASTA for the bacterium you are sequencing. This can be obtained from [NCBI Nucleotide](https://www.ncbi.nlm.nih.gov/nuccore/):
+
+1. Search for your strain of interest, and open its main page.
+2. On the right-hand side, click `Customize view`, then `Customize` and check `Show sequence`. Finally, click `Update view`. You may have to wait a few minutes while the sequence downloads.
+3. Click `Send to: > Complete record > File > FASTA > Create file`.
+4. Save the file to a path which you provide as `--genome_fasta` [below](#inputs).
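+
+Alternatively, if you have [NCBI Entrez Direct](https://www.ncbi.nlm.nih.gov/books/NBK179288/) installed, a one-liner along these lines fetches the same FASTA from the command line (the accession shown is just an example):
+
+```bash
+# Download a genome FASTA by its Nucleotide accession
+efetch -db nuccore -id NC_008596.1 -format fasta > genome.fna
+```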
+
+## Quick start
+
+Make a [sample sheet (see below)](#sample-sheet) and, optionally, a [`nextflow.config` file](#inputs) in the directory where you want the pipeline to run. Then run Nextflow.
+
+```bash
+nextflow run scbirlab/nf-ont-call-variants
+```
+
+Each time you run the pipeline after the first time, Nextflow will use a locally cached version which will not be automatically updated. If you want to ensure that you're using the very latest version of the pipeline, use the `-latest` flag.
+
+```bash
+nextflow run scbirlab/nf-ont-call-variants -latest
+```
+
+If you want to run a particular tagged version of the pipeline, such as `v0.0.1`, you can do so using
+
+```bash
+nextflow run scbirlab/nf-ont-call-variants -r v0.0.1
+```
+
+For help, use `nextflow run scbirlab/nf-ont-call-variants --help`.
+
+The first time you run the pipeline on your system, the software dependencies in `environment.yml` will be installed. This may take several minutes.
+
+## Inputs
+
+The following parameters are required:
+
+- `sample_sheet`: path to a CSV with information about the samples and FASTQ files to be processed
+- `genome_fasta`: path to the reference genome FASTA
+- `snpeff_database`: name of the snpEff database to use for annotation. This should be derived from the same assembly as `genome_fasta`. Database names often end in the assembly name, such as `gca_000015005`, which you can check matches that of your `genome_fasta`.
+
+The following parameters have default values which can be overridden if necessary:
+
+- `inputs = "inputs"`: the folder containing your inputs
+- `trim_qual = 10`: for `cutadapt`, the minimum Phred score for trimming 3' calls
+- `min_length = 10`: for `cutadapt`, the minimum trimmed length of a read; shorter reads will be discarded
+- `gatk_image = "docker://broadinstitute/gatk:latest"`: which GATK4 image to use
+- `snpeff_url = "https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip"`: where to download snpEff from
+- `clair3_image = "docker://hkubal/clair3:latest"`: which Clair3 image to use
+
+The parameters can be provided either in the `nextflow.config` file or on the `nextflow run` command line.
+
+Here is an example of the `nextflow.config` file:
+
+```nextflow
+params {
+    sample_sheet = "/path/to/sample-sheet.csv"
+    inputs = "/path/to/inputs"
+    genome_fasta = "/path/to/1_ASM28329v1_genomic.fna"
+    snpeff_database = "Mycobacterium_smegmatis_str_mc2_155_gca_000283295"
+}
+```
+
+Alternatively, you can provide the parameters on the command line:
+
+```bash
+nextflow run scbirlab/nf-ont-call-variants \
+    --sample_sheet /path/to/sample-sheet.csv \
+    --inputs /path/to/inputs \
+    --genome_fasta /path/to/1_ASM28329v1_genomic.fna \
+    --snpeff_database Mycobacterium_smegmatis_str_mc2_155_gca_000283295
+```
+
+### Sample sheet
+
+The sample sheet is a CSV file providing information about which FASTQ files belong to which sample.
+
+The file must have a header with the column names below, and one line per sample to be processed.
+
+- `sample_id`: the unique name of the sample. The wildtype must be **named so that it is alphabetically last**
+- `reads`: path (relative to the `inputs` option above) to compressed FASTQ files derived from Nanopore sequencing
+
+Here is an example of the sample sheet:
+
+| sample_id | reads                   |
+| --------- | ----------------------- |
+| wt        | wt_raw_reads.fastq.gz   |
+| mut1      | mut1_raw_reads.fastq.gz |
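+
+Since the wildtype is taken to be whichever `sample_id` sorts last, it is worth checking which sample will be used as the control before launching. A quick sanity check for the sheet above:
+
+```bash
+# The alphabetically last sample_id is treated as the wildtype
+tail -n +2 sample-sheet.csv | cut -d, -f1 | sort | tail -n 1   # -> wt
+```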
+
+## Outputs
+
+Outputs are saved in the same directory as `sample_sheet`. They are organised under three directories:
+
+- `processed`: FASTQ files and logs resulting from alignments
+- `tables`: tables, plots, and VCF files corresponding to variant calls
+- `multiqc`: HTML report on processing steps
+
+## Issues, problems, suggestions
+
+Add to the [issue tracker](https://www.github.com/scbirlab/nf-ont-call-variants/issues).
+
+## Further help
+
+Here are the help pages of the software used by this pipeline.
+
+- [fastqc](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+- [multiqc](https://multiqc.info/)
+- [nextflow](https://www.nextflow.io/docs/latest/index.html)
+- [cutadapt](https://cutadapt.readthedocs.io/en/stable/index.html)
+- [minimap2](https://lh3.github.io/minimap2/minimap2.html)
+- [bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml)
+- [samtools](http://www.htslib.org/doc/samtools.html)
+- [Clair3](https://github.com/HKU-BAL/Clair3)
+- [GATK](https://gatk.broadinstitute.org/hc/en-us)
+- [Picard](https://broadinstitute.github.io/picard/)
+- [snpEff](https://pcingola.github.io/SnpEff/)
\ No newline at end of file
diff --git a/bin/metabolism/connect-metabolites.py b/bin/metabolism/connect-metabolites.py
new file mode 100644
index 0000000..6900bae
--- /dev/null
+++ b/bin/metabolism/connect-metabolites.py
@@ -0,0 +1,121 @@
+"""Use RDKit to clean reactants and products, and identify shared and similar metabolites between pairs."""
+
+from typing import Iterable, Optional, Union
+from functools import partial
+from itertools import product
+import sys
+
+from carabiner import print_err
+from carabiner.cast import cast
+import pandas as pd
+from rdkit import RDLogger
+from rdkit.Chem import AddHs, rdFingerprintGenerator, MolFromSmiles, MolToInchi, MolToSmiles
+from rdkit.Chem.SaltRemover import SaltRemover
+from rdkit.DataStructs import TanimotoSimilarity
+from tqdm.auto import tqdm
+
+RDLogger.DisableLog('rdApp.*')
+
+# Successive clean-up passes: strip common counter-ions, then any remaining
+# salts (keeping at least one fragment), then add explicit hydrogens
+removers = (
+    SaltRemover(defnData="[H,Na,K,Mg,Ca,Mn,Fe,F,Cl,Br,O,S]").StripMol,
+    partial(SaltRemover().StripMol, dontRemoveEverything=True),
+    AddHs,
+)
+
+def _clean_smiles_list(smiles: str) -> str:
+    mol = MolFromSmiles(smiles)
+    for remover in removers:
+        mol = remover(mol)
+    return MolToSmiles(mol)
+
+
+def clean_metabolites(table: pd.DataFrame,
+                      cols: Iterable[str]) -> pd.DataFrame:
+    return table.assign(**{col: table[col].apply(_clean_smiles_list) for col in cols})
+
+
+def _shared(a_b):
+    # True if the two metabolite lists share any chemical (compared as InChI)
+    a, b = (map(MolFromSmiles, a_or_b) for a_or_b in a_b)
+    b = set(map(MolToInchi, b))
+    return any(MolToInchi(item) in b for item in a)
+
+
+def _have_shared_chemical(df: pd.DataFrame, a: str, b: str):
+    return df[[a, b]].progress_apply(_shared, axis=1)
+
+
+mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
+
+def _fingerprinter(a):
+    return mfpgen.GetFingerprint(MolFromSmiles(a))
+
+def _tanimoto(a_b):
+    # Highest Tanimoto similarity over all cross-pairs of metabolites
+    return max(TanimotoSimilarity(*map(_fingerprinter, items)) for items in product(*a_b))
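+
+# Worked example (illustrative SMILES): with a_b = (["CCO", "O"], ["CCN"]),
+# product(*a_b) yields ("CCO", "CCN") and ("O", "CCN"), and _tanimoto returns
+# the larger of the two Morgan-fingerprint Tanimoto similarities
+# (ethanol vs ethylamine here).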
"p1"], ["r2", "p2"])) + print_err(f"By shared chemicals...") + table = table.assign(**{f"connection_{index_labels[i]}": partial(_have_shared_chemical, a=f"{a}_split", b=f"{b}_split") + for i, (a, b) in enumerate(product(cols, cols2))}) + print_err(f"By maximum similarity...") + table = table.assign(**{f"max_similarity_{index_labels[i]}": partial(_max_similarity, a=f"{a}_split", b=f"{b}_split") + for i, (a, b) in enumerate(product(cols, cols2))}) + + return table[[col for col in table if not col.endswith("_split")]] + + +def main() -> None: + + tqdm.pandas() + + print_err(f"Reading table from STDIN:") + table = pd.read_csv(sys.stdin, sep='\t').assign(_id=lambda x: x["rhea_reaction_id"].astype(str).str.cat(x["uniprot_id"], sep=":")) + print_err(table) + id_table = table[["_id", "rhea_reaction_id", "uniprot_id"]].copy() + + print_err(f"Cleaning metabolites...") + table = clean_metabolites(table, cols=("reactants", "products")) + print_err(f"Connecting metabolites...") + table = (connect_metabolites(table, cols=("reactants", "products"), id_col=["_id"]) + .merge(id_table, how='left') + .merge(id_table.rename(columns={col: f"{col}_2" for col in id_table}), how='left')) + cols_to_rename = ("reactants", "products", "uniprot_id", "rhea_reaction_iduniprot_id") + table = (table + .drop(columns=["_id", "_id_2"]) + .rename(columns=dict(zip(cols_to_rename, (f"{col}_1" for col in cols_to_rename))))) + table[reversed(table.columns)].to_csv(sys.stdout, sep='\t', index=False) + + return None + + +if __name__ == '__main__': + main() diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..29cd842 --- /dev/null +++ b/environment.yml @@ -0,0 +1,15 @@ +name: nf-ggi + +channels: + - conda-forge + - bioconda + +dependencies: + - python==3.11 + - pip + - bioconda::hhsuite + - conda-forge::jq + - conda-forge::rdkit + - pip: + - "-e /nemo/lab/johnsone/home/users/johnsoe/github/yunta" + - "-e /nemo/lab/johnsone/home/users/johnsoe/github/rf2t-micro" \ No newline at end of file diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..e0ac300 --- /dev/null +++ b/main.nf @@ -0,0 +1,616 @@ +#!/usr/bin/env nextflow + +/* +======================================================================================== + Gene-gene interaction predicting Nextflow Workflow +======================================================================================== + Github : https://github.com/scbirlab/nf-ggi + Contact : Eachan Johnson +---------------------------------------------------------------------------------------- +*/ + +nextflow.enable.dsl=2 + +/* +======================================================================================== + Help text +======================================================================================== +*/ +if ( params.help ) { + println """\ + S C B I R G E N E - G E N E I N T E R A C T I O N P R E D I C T I O N P I P E L I N E + ============================================================================================= + Nextflow pipeline to predict gene-gene interactions based on protein-protein interaction + predictions and similar metabolites. + + Usage: + nextflow run scbirlab/nf-ggi --accession + nextflow run scbirlab/nf-ggi -c + + Required parameters: + sample_sheet UniProt accession number for organism of interest. + uniclust, bfd Paths to get HHblits databases. + + Optional parameters (with defaults): + rhea_url URL to download Rhea reaction database. Default: "https://ftp.expasy.org/databases/rhea" + outputs Output folder. Default: "outputs". 
+
+    """.stripIndent()
+    exit 0
+}
+
+/*
+========================================================================================
+    Check parameters
+========================================================================================
+*/
+if ( !params.sample_sheet ) {
+    throw new Exception("!!! PARAMETER MISSING: Please provide a sample sheet including UniProt proteome ID for organism of interest.")
+}
+if ( !params.uniclust ) {
+    throw new Exception("!!! PARAMETER MISSING: Please provide a path to UniClust database.")
+}
+if ( !params.bfd ) {
+    throw new Exception("!!! PARAMETER MISSING: Please provide a path to BFD database.")
+}
+
+working_dir = params.outputs
+
+sequences = "sequences"
+ppi = "ppi"
+metabi = "metabolites"
+coex = "coexpression"
+
+log.info """\
+        S C B I R   G E N E - G E N E   I N T E R A C T I O N   P R E D I C T I O N   P I P E L I N E
+        =============================================================================================
+        test mode            : ${params.test}
+        inputs
+            sample sheet     : ${params.sample_sheet}
+            UniClust database: ${params.uniclust}
+            BFD              : ${params.bfd}
+            Rhea             : ${params.rhea_url}
+        output               : ${params.outputs}
+        """
+        .stripIndent()
+
+// dirs_to_make = [sequences, ppi, metabi]
+// println """
+// Making directories:
+// """.stripIndent()
+// dirs_to_make.each {
+//     println "${it}: "
+//     println file(it).mkdirs() ? "OK" : "Cannot create directory: ${it}"
+// }
+
+/*
+========================================================================================
+    Create Channels
+========================================================================================
+*/
+
+database_ch = Channel.of( tuple( params.uniclust, params.bfd ) )
+rhea_url_ch = Channel.of( params.rhea_url )
+sample_sheet_ch = Channel.fromPath( params.sample_sheet,
+                                    checkIfExists: true )
+
+
+/*
+========================================================================================
+    MAIN Workflow
+========================================================================================
+*/
+
+workflow {
+
+    sample_sheet_ch
+        .splitCsv( header: true )
+        .map { params.non_self ? tuple( it.organism_id, it.proteome_name, it.bait ) : tuple( it.organism_id, it.proteome_name ) }
+        .set { sample_sheet }
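+    // e.g. the row "243273,Mycoplasma genitalium" becomes
+    //      tuple( "243273", "Mycoplasma genitalium" )
+    //      (plus a bait UniProt ID as a third element when params.non_self is set)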
+    sample_sheet
+        .map { it[0] }
+        .unique()
+        .map { sample -> [ "sequences", "msa", "ppi",
+                           "coexpression", "metabolites" ].collect { "${params.outputs}/${sample}/${it}" } }
+        .subscribe { it.each { d ->
+            println "Creating output directory: ${d}"
+            file( d ).mkdirs()
+        } }
+
+    // Get FASTA sequences for each organism's proteome, then split into one
+    // FASTA record per protein.
+    sample_sheet.map { it[0] } | PULL_FASTA_SEQUENCES
+    PULL_FASTA_SEQUENCES.out
+        .splitFasta( elem: 1, record: [id: true, text: true] )
+        .set { fastas }
+
+    // If in test mode, only take 3 proteins; otherwise take all.
+    // Then parse the UniProt ID and entry name out of each FASTA header.
+    ( params.test ? fastas.take(3) : fastas )
+        .map { tuple( it[0], it[1].id.split('\\|')[1], it[1].id.split('\\|')[2], it[1].text ) }  // Organism ID, UniProt ID, entry name, FASTA text
+        .set { sample_fastas }
+
+    if ( params.non_self ) {
+        sample_sheet.map { it[2] }  // Bait UniProt ID
+            | PULL_BAIT_FASTA_SEQUENCES  // Bait UniProt ID, FASTA file
+        sample_sheet
+            .map { tuple( it[2], it[0] ) }  // Bait UniProt ID, Organism ID
+            .combine( PULL_BAIT_FASTA_SEQUENCES.out,
+                      by: 0 )  // Bait UniProt ID, Organism ID, FASTA file
+            .set { bait_fastas }
+    }
+
+    // Get reactants from the Rhea database using UniProt IDs
+    rhea_url_ch | DOWNLOAD_RHEA_DB
+    DOWNLOAD_RHEA_DB.out.set { rhea_db }
+    sample_fastas
+        .collectFile( newLine: true ) { [ "${it[0]}.txt", "${it[1]}\t${it[2]}" ] }
+        .map { tuple( it.getSimpleName(), it ) }  // Organism ID, UniProt ID list file
+        .combine( rhea_db )  // Organism ID, UniProt ID list file, Rhea SMILES, Rhea2UniProt, Rhea2UniProt_Trembl
+        | GET_ENZYME_REACTANTS  // Organism ID, reaction SMILES file
+        | CONNECT_METABOLITES
+
+    // Get STRING connections
+    sample_fastas
+        .map { it[0..1] }  // Organism ID, UniProt ID
+        .collectFile( newLine: true ) { [ "${it[0]}.txt", it[1] ] }  // UniProt ID list
+        .map { tuple( it.getSimpleName(), it ) }  // Organism ID, UniProt ID list
+        | DOWNLOAD_STRING_DB
+
+    // Get MSAs and cross them within each organism
+    sample_fastas
+        .collectFile( newLine: true ) { [ "${it[1]}.fasta", it[-1] ] }
+        .map { tuple( it.getSimpleName(), it ) }  // UniProt ID, FASTA file
+        .set { fasta_files }
+    sample_fastas
+        .map { it[1..0] }  // UniProt ID, Organism ID
+        .join( fasta_files,
+               by: 0,
+               failOnMismatch: true,
+               failOnDuplicate: true )  // UniProt ID, Organism ID, FASTA file
+        .set { pre_msa }
+
+    if ( params.non_self ) {
+        pre_msa.concat( bait_fastas ).set { pre_msa }
+    }
+
+    pre_msa
+        .map { tuple( it[1], it[0], it[2] ) }  // Organism ID, UniProt ID, FASTA file
+        .combine( database_ch )  // Organism ID, UniProt ID, FASTA file, uniclust, bfd
+        | GET_MSA  // Organism ID, UniProt ID, MSA file
+
+    if ( params.non_self ) {
+        sample_sheet
+            .map { it[2] }  // Bait UniProt ID
+            .toList()  // [Bait UniProt ID, ...]
+            .set { bait_list }
+        GET_MSA.out
+            .filter { bait_list.contains( it[1] ) }
+            .set { counter_msas }
+    } else {
+        GET_MSA.out
+            .set { counter_msas }
+    }
+
+    counter_msas
+        .map { tuple( it[0], it[2] ) }  // Organism ID, MSA file
+        .groupTuple( by: 0, sort: { a, b -> a.getSimpleName() <=> b.getSimpleName() } )  // Organism ID, [MSA file, ...]
+        .map { tuple( it[0],
+                      it[1].withIndex().collect { el, i -> i.intdiv( params.batch_size ) },  // add batch index
+                      it[1] ) }  // Organism ID, [batch_i, ...], [MSA file, ...]
+        .transpose()  // Organism ID, batch_i, MSA file
+        .groupTuple( by: [0,1], sort: { a, b -> a.getSimpleName() <=> b.getSimpleName() } )  // Organism ID, batch_i, [MSA file, ...]
+        .set { msas_by_proteome }
+    GET_MSA.out
+        .combine( msas_by_proteome, by: 0 )  // Organism ID, UniProt ID, MSA file, batch_i, [MSA file, ...]
+        .map { a -> a[0..-2] + [
+            a[-1]
+                .sort { b, c -> b.getSimpleName() <=> c.getSimpleName() }
+                .takeWhile { it.getSimpleName() != a[2].getSimpleName() }
+        ] }  // take the lower triangle per proteome
+        .filter { it[-1].size() > 0 }  // filter out trivial (size-0) elements
+        .set { msas_to_process }  // Organism ID, UniProt ID, MSA file, batch_i, [MSA file, ...]
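+    // Worked example of the lower-triangle pairing (single batch): for MSAs
+    // A < B < C (sorted by file name) in one proteome, takeWhile pairs
+    //     A with []       (then dropped by the size filter),
+    //     B with [A],
+    //     C with [A, B],
+    // so every unordered pair is analysed exactly once.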
+
+    // Calculate evolutionary coupling
+    // msas_to_process | RF2TRACK
+    msas_to_process | DIRECT_COUPLING_ANALYSIS
+
+    if ( params.test ) {
+        // msas_to_process | ALPHAFOLD2
+    }
+    // ALPHAFOLD2.out
+    //     .map { it[2] }
+    //     .collect()
+
+}
+
+process PULL_FASTA_SEQUENCES {
+
+    tag "${organism_id}"
+
+    publishDir( "${params.outputs}/${organism_id}/sequences",
+                mode: 'copy' )
+
+    input:
+    val organism_id
+
+    output:
+    tuple val( organism_id ), path( "*.fasta.gz" )
+
+    // TODO: filter only for representative proteome if no reference proteome
+    script:
+    """
+    # Look up the proteome ID for this taxonomy ID, preferring reference proteomes
+    function get_proteome_id() {
+        curl -s "https://rest.uniprot.org/proteomes/search?query=(taxonomy_id:${organism_id})&format=json" | jq '.results[] | select(.proteomeType == "'"\$1"' proteome").id'
+    }
+    QUERIES=("Reference and representative" "Representative" "Other")
+    PROTEOME_ID=
+    for q in "\${QUERIES[@]}"
+    do
+        PROTEOME_ID=\$(get_proteome_id "\$q")
+        if [ -n "\$PROTEOME_ID" ]
+        then
+            break
+        fi
+    done
+
+    wget "https://rest.uniprot.org/uniprotkb/stream?query=(proteome:\$PROTEOME_ID)&format=fasta&download=true&compressed=true" \
+        -O ${organism_id}.fasta.gz \
+        || (
+            echo "Failed to download taxonomy ID ${organism_id} with proteome ID \$PROTEOME_ID from UniProt"
+            exit 1
+        )
+    """
+
+}
+
+
+process PULL_BAIT_FASTA_SEQUENCES {
+
+    tag "${uniprot_id}"
+
+    publishDir( "${params.outputs}",
+                mode: 'copy' )
+
+    input:
+    val uniprot_id
+
+    output:
+    tuple val( uniprot_id ), path( "*.fasta" )
+
+    script:
+    """
+    wget "https://rest.uniprot.org/uniprotkb/stream?query=(accession:${uniprot_id})&format=fasta&download=true&compressed=false" \
+        -O ${uniprot_id}.fasta \
+        || (
+            echo "Failed to download Uniprot ID ${uniprot_id} from UniProt"
+            exit 1
+        )
+    """
+
+}
+
+
+process DOWNLOAD_STRING_DB {
+
+    tag "${organism_id}"
+
+    publishDir( "${params.outputs}/${organism_id}/coexpression",
+                mode: 'copy' )
+
+    input:
+    tuple val( organism_id ), path( uniprot_ids )
+
+    output:
+    tuple val( organism_id ), path( "*.string.tsv" )
+
+    script:
+    """
+    OUTFILE=${organism_id}.string0.tsv
+    printf 'string_id_1\\tstring_id_2\\tstring_cooccurence\\tstring_coexpression\\n' > \$OUTFILE
+    # Columns 1-2 are the STRING IDs; column 6 is cooccurence, column 8 coexpression
+    curl -s "https://stringdb-downloads.org/download/protein.links.full.v12.0/${organism_id}.protein.links.full.v12.0.txt.gz" \
+        | zcat \
+        | tail -n+2 \
+        | tr ' ' \$'\\t' \
+        | cut -f-2,6,8 \
+        >> \$OUTFILE \
+        || (
+            echo "Failed to download taxonomy ID ${organism_id} from STRING"
+            exit 1
+        )
+
+    # Resolve UniProt IDs, POSTing in chunks of 5 to keep queries small
+    printf 'uniprot_id_1\\tstring_id_1\\n' > ${organism_id}-string-lookup.tsv
+    cat ${uniprot_ids} | split -l 5 - uniprot-chunk_
+    for chunk in uniprot-chunk_*
+    do
+        curl -s -X POST --data "species=${organism_id}&echo_query=1&identifiers="\$(awk -v ORS="%0d" '1' \$chunk) https://string-db.org/api/tsv-no-header/get_string_ids \
+            | cut -f1,3 \
+            >> ${organism_id}-string-lookup.tsv \
+            || (
+                echo "Failed to download UniProt lookup from STRING"
+                cat \$chunk
+                exit 1
+            )
+        sleep 0.1
+    done
+
+    # Join the STRING scores to UniProt IDs for both interaction partners
+    python -c 'import pandas as pd; import sys; lookup = pd.read_csv("${organism_id}-string-lookup.tsv", sep="\\t"); renamer = dict(uniprot_id_1="uniprot_id_2", string_id_1="string_id_2"); pd.read_csv("${organism_id}.string0.tsv", sep="\\t").assign(organism_id="${organism_id}").merge(lookup).merge(lookup.rename(columns=renamer)).to_csv(sys.stdout, sep="\\t", index=False)' \
+        > ${organism_id}.string.tsv
+    """
+
+}
+
+
+process DOWNLOAD_RHEA_DB {
+
+    tag "${rhea_url}"
+
+    input:
+    val rhea_url
+
+    output:
+    tuple path( "rhea-reaction-smiles.tsv" ), path( "rhea2uniprot.tsv" ),
+          path( "rhea2uniprot_trembl.tsv.gz" )
+
+    script:
+    """
+    wget ${rhea_url}/tsv/rhea2uniprot.tsv
+    wget ${rhea_url}/tsv/rhea2uniprot_trembl.tsv.gz
+    wget ${rhea_url}/tsv/rhea-reaction-smiles.tsv
+    """
+}
+
+
+process GET_ENZYME_REACTANTS {
+
+    tag "${organism_id}"
+
+    publishDir( "${params.outputs}/${organism_id}/metabolites",
+                mode: 'copy' )
+
+    input:
+    tuple val( organism_id ), path( uniprot_ids ), path( rhea_smiles ), path( rhea2uniprot ), path( rhea2uniprot_tr )
+
+    output:
+    tuple val( organism_id ), path( "*.rxn-smiles.tsv" )
+
+    script:
+    """
+    OUTFILE=${organism_id}.rxn-smiles.tsv
+    # Join the Rhea->UniProt mappings to this organism's UniProt IDs. Rows with
+    # direction UN (undefined) refer to a master reaction: increment the ID twice
+    # to emit the two directional reaction IDs that follow it in Rhea numbering.
+    join -t\$'\\t' -1 4 -2 1 <(tail -n+2 -q "${rhea2uniprot}" <(zcat ${rhea2uniprot_tr}) | sort -k4) <(sort -k1 "${uniprot_ids}") \
+        | awk -v OFS='\\t' '\$3=="UN"{ \$2++; print \$0; \$2++; print \$0 }; \$3!="UN"' \
+        | sort -k2 \
+        | join -t\$'\\t' -1 2 -2 1 - <(sort -k1 "${rhea_smiles}") \
+        > rxn-smiles0.tsv
+
+    # Split each reaction SMILES on '>>' into reactants and products columns
+    cat <(printf 'organism_id\\trhea_reaction_id\\tuniprot_id\\tdirection\\trhea_master_reaction_id\\tentry_name\\treaction_smiles\\treactants\\tproducts\\n') \
+        <(paste rxn-smiles0.tsv \
+                <(cat rxn-smiles0.tsv | cut -f6 | awk -F '>>' -v OFS='\\t' '{ print \$1,\$2 }') \
+            | awk -v OFS='\\t' '{ print "${organism_id}",\$0 }') \
+        > \$OUTFILE
+
+    if [ \$(cat \$OUTFILE | wc -l) -gt 1 ]
+    then
+        exit 0
+    else
+        >&2 echo "No entries in reaction file: \$OUTFILE."
+        exit 1
+    fi
+    """
+}
+
+
+process CONNECT_METABOLITES {
+
+    tag "${organism_id}"
+    label "big_time"
+
+    publishDir( "${params.outputs}/${organism_id}/metabolites",
+                mode: 'copy' )
+
+    input:
+    tuple val( organism_id ), path( reaction_table )
+
+    output:
+    tuple val( organism_id ), path( "*.metabolism-connection.tsv" )
+
+    script:
+    """
+    cat ${reaction_table} | python ${projectDir}/bin/metabolism/connect-metabolites.py > ${organism_id}.metabolism-connection.tsv
+    """
+}
+
+
+process GET_MSA {
+
+    tag "${organism_id} : ${uniprot_id}"
+    label 'big_cpu'
+    errorStrategy 'retry'
+    maxRetries 2
+
+    publishDir( "${params.outputs}/${organism_id}/msa",
+                mode: 'copy' )
+
+    // Organism ID, UniProt ID, FASTA file, uniclust, bfd
+    input:
+    tuple val( organism_id ), val( uniprot_id ), file( fasta ), val( uniclust ), val( bfd )
+
+    output:
+    tuple val( organism_id ), val( uniprot_id ), path( "${organism_id}-${uniprot_id}.a3m" )
+
+    script:
+    """
+    set -x
+    # Search each sequence database in turn with HHblits
+    dbs=(${uniclust} ${bfd})
+    for d in \${dbs[@]}
+    do
+        hhblits \
+            -cpu ${task.cpus} \
+            -maxmem ${task.memory.getGiga()} \
+            -v 2 \
+            -i "${fasta}" \
+            -d \$d \
+            -e 0.001 \
+            -o /dev/null \
+            -oa3m "\$(basename \$d).a3m" \
+            -cov 60 \
+            -n 3 \
+            -realign -realign_max 10000
+    done
+
+    # Concatenate the per-database alignments: keep the query (first two lines)
+    # from the first A3M, then append the hits from all of them
+    outputs=( *.a3m )
+    cat <(head -n2 \${outputs[0]}) <(tail -n+3 -q \${outputs[@]}) \
+        > "${organism_id}-${uniprot_id}.a3m"
+    for f in \${outputs[@]}
+    do
+        if [ \$f != "${organism_id}-${uniprot_id}.a3m" ]
+        then
+            rm \$f
+        fi
+    done
+    """
+}
+
+
+process DIRECT_COUPLING_ANALYSIS {
+
+    label 'big_time'
+    tag "${uniprot_id}-batch_${batch_idx}"
+    stageInMode 'link'
+    errorStrategy 'retry'
+    maxRetries 2
+
+    publishDir( "${params.outputs}/${organism_id}/ppi",
+                mode: 'copy' )
+
+    // Organism ID, UniProt ID, MSA file, batch index, [MSA file, ...]
+    input:
+    tuple val( organism_id ), val( uniprot_id ), path( msa1, stageAs: "ref/ref.a3m" ), val( batch_idx ), path( msa2 )
+
+    output:
+    tuple val( organism_id ), val( uniprot_id ), path( "*-batch*_dca.tsv" ), emit: main
+    path "*-batch*_dca", emit: plots
+
+    script:
+    """
+    # List the batch MSAs (the reference MSA is staged under ref/, so not matched)
+    MSA_LIST=msa-list.txt
+    for item in *.a3m
+    do
+        echo "\$item" >> \$MSA_LIST
+    done
+    sppid dca-single \
+        <(echo "ref/ref.a3m") \
+        --msa2 \$MSA_LIST \
+        --list-file \
+        --apc \
+        --output "${organism_id}-${uniprot_id}-batch_${batch_idx}_dca.tsv" \
+        --plot "${organism_id}-${uniprot_id}-batch_${batch_idx}_dca"
+    """
+}
+
+process RF2TRACK {
+
+    label 'big_time'
+    tag "${uniprot_id}-batch_${batch_idx}"
+    stageInMode 'link'
+    errorStrategy 'retry'
+    maxRetries 2
+
+    publishDir( "${params.outputs}/${organism_id}/ppi",
+                mode: 'copy' )
+
+    // Organism ID, UniProt ID, MSA file, batch index, [MSA file, ...]
+    input:
+    tuple val( organism_id ), val( uniprot_id ), path( msa1, stageAs: "ref/ref.a3m" ), val( batch_idx ), path( msa2 )
+
+    output:
+    tuple val( organism_id ), val( uniprot_id ), path( "*-batch_*.tsv" ), emit: main
+    path "*-batch*_rf2t", emit: plots
+
+    script:
+    """
+    MSA_LIST=msa-list.txt
+    for item in *.a3m
+    do
+        echo "\$item" >> \$MSA_LIST
+    done
+    PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True sppid rf2t-single \
+        <(echo "ref/ref.a3m") \
+        --msa2 \$MSA_LIST \
+        --list-file \
+        --output "${organism_id}-${uniprot_id}-batch_${batch_idx}_rf2t.tsv" \
+        --plot "${organism_id}-${uniprot_id}-batch_${batch_idx}_rf2t"
+    """
+}
+
+
+process ALPHAFOLD2 {
+
+    label 'gpu'
+    tag "${uniprot_id}-batch_${batch_idx}"
+    stageInMode 'link'
+    errorStrategy 'retry'
+    maxRetries 2
+
+    publishDir( "${params.outputs}/${organism_id}/ppi",
+                mode: 'copy' )
+
+    input:
+    tuple val( organism_id ), val( uniprot_id ), path( msa1, stageAs: "ref/ref.a3m" ), val( batch_idx ), path( msa2 )
+
+    output:
+    tuple val( organism_id ), val( uniprot_id ), path( "*-batch_*_af2" ), emit: main
+    path "*-batch_*_af2/*.pdb", emit: pdb
+
+    script:
+    """
+    MSA_LIST=msa-list.txt
+    for item in *.a3m
+    do
+        echo "\$item" >> \$MSA_LIST
+    done
+    export CUDNN_PATH=\$(dirname \$(python -c "import nvidia.cudnn; print(nvidia.cudnn.__file__)"))
+    export LD_LIBRARY_PATH=\${CUDNN_PATH}/lib
+    >&2 echo "CuDNN path at" \$CUDNN_PATH "contains:"  # should exist and give a good path
+    >&2 echo \$(ls \$CUDNN_PATH)  # should contain a lib subdir with libcudnn .so files
+    >&2 echo "LD library path at" \$LD_LIBRARY_PATH  # should exist and contain CUDNN_PATH
+    >&2 echo "\$(nvcc --version)"
+    >&2 python3 -c "import tensorflow as tf; print(f'Available devices:\\n{tf.config.list_physical_devices()}')"
+    XLA_PYTHON_CLIENT_MEM_FRACTION=.9 sppid af2-single \
+        <(echo "ref/ref.a3m") \
+        --msa2 \$MSA_LIST \
+        --list-file \
+        --output "${organism_id}-${uniprot_id}-batch_${batch_idx}_af2" \
+        --plot "${organism_id}-${uniprot_id}-batch_${batch_idx}_af2"
+    """
+}
+
+
+/*
+========================================================================================
+    Workflow Event Handler
+========================================================================================
+*/
+
+workflow.onComplete {
+
+    println ( workflow.success ? """
""" + Pipeline execution summary + --------------------------- + Completed at: ${workflow.complete} + Duration : ${workflow.duration} + Success : ${workflow.success} + workDir : ${workflow.workDir} + exit status : ${workflow.exitStatus} + """ : """ + Failed: ${workflow.errorReport} + exit status : ${workflow.exitStatus} + """ + ) +} + +/* +======================================================================================== + THE END +======================================================================================== +*/ \ No newline at end of file diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..d876615 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,159 @@ +manifest { + + author = "Eachan Johnson" + homePage = "https://github.com/scbirlab/nf-ppi" + description = "Predict gene-gene interactions based on protein-protein interaction predictions and similar metabolites." + defaultBranch = "v0.0.1" + nextflowVersion = '!>=24.0.0' + version = "0.0.1" + doi = '' + +} + +params { + + /* Required */ + sample_sheet = null + bfd = null + uniclust = null + + /* Optional */ + test = false + non_self = false + help = null + rhea_url = "https://ftp.expasy.org/databases/rhea" + outputs = "${launchDir}/outputs" + batch_size = 100 + +} + +process.conda = "${projectDir}/environment.yml" +conda { + createTimeout = '2h' + enabled = true +} + +profiles { + + standard { + + process { + executor = 'slurm' + array = 1000 + + withLabel: big_cpu { + time = '3h' + cpus = 16 + memory = 32.GB + } + + withLabel: big_time { + time = '4d' + cpus = 1 + memory = 64.GB + } + + withLabel: some_mem { + memory = 16.GB + } + + withLabel: med_mem { + memory = 64.GB + } + + withLabel: big_mem { + memory = 128.GB + } + + withLabel: gpu_single { + queue = 'gpu' + time = '7d' + module = 'cuDNN/8.9.2.26-CUDA-12.1.1' + cpus = 4 + clusterOptions = '--gres=gpu:1' + memory = 128.GB + } + + withLabel: gpu { + queue = 'gpu' + time = '4h' + module = 'cuDNN/8.9.2.26-CUDA-12.1.1' + cpus = 2 + clusterOptions = '--gres=gpu:2' + memory = 128.GB + } + + } + + dag { + enabled = true + overwrite = true + } + + notification { + enabled = true + to = "$USER@crick.ac.uk" + } + + } + + local { + + process { + executor = 'local' + + withLabel: big_cpu { + time = '3h' + cpus = 16 + memory = 32.GB + } + + withLabel: some_mem { + memory = 8.GB + } + + withLabel: med_mem { + memory = 16.GB + } + + withLabel: big_mem { + memory = 32.GB + } + + withLabel: gpu { + memory = 32.GB + } + + } + + } + + gh { + + conda.useMamba = false + + process { + executor = 'local' + + withLabel: big_cpu { + cpus = 1 + memory = 12.GB + } + + withLabel: some_mem { + memory = 12.GB + } + + withLabel: med_mem { + memory = 12.GB + } + + withLabel: gpu { + memory = 12.GB + } + + } + + } + +} \ No newline at end of file diff --git a/test/inputs/sample-sheet.csv b/test/inputs/sample-sheet.csv new file mode 100644 index 0000000..fca7561 --- /dev/null +++ b/test/inputs/sample-sheet.csv @@ -0,0 +1,2 @@ +organism_id,proteome_name +243273,"Mycoplasma genitalium" diff --git a/test/nextflow.config b/test/nextflow.config new file mode 100644 index 0000000..a7b0a18 --- /dev/null +++ b/test/nextflow.config @@ -0,0 +1,6 @@ +params { + sample_sheet = "inputs/sample-sheet.csv" + test = true + uniclust = "/nemo/lab/johnsone/reference/hhdb/uniclust30/uniclust30_2018_08" + bfd = "/nemo/lab/johnsone/reference/hhdb/bfd_metaclust/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" +} \ No newline at end of file