Skip to content

Commit

Permalink
Add testing workflow (#8)
Browse files Browse the repository at this point in the history
* Add testing workflow

* Add conda.enabled to config

* Add badge to README

* Set seed for badread

* badread is slow, simulate fewer reads

* set TERM

* Create plassembler env and db before running pipeline

* Fix output checking

* Fix provenance format
  • Loading branch information
dfornika authored Feb 20, 2024
1 parent 7c1e215 commit b759ac7
Show file tree
Hide file tree
Showing 22 changed files with 331 additions and 1 deletion.
1 change: 1 addition & 0 deletions .github/data/reads_to_simulate.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
GCF-002968455-1,.github/data/assemblies/GCF_002968455.1.fa
7 changes: 7 additions & 0 deletions .github/environments/art.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Conda environment providing the ART read simulator (`art_illumina`), which
# .github/scripts/simulate_short_reads.sh activates by the name below.
name: art
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- art=2016.06.05
7 changes: 7 additions & 0 deletions .github/environments/badread.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Conda environment providing `badread`, which
# .github/scripts/simulate_long_reads.sh activates by the name below.
name: badread
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- badread=0.4.0
9 changes: 9 additions & 0 deletions .github/environments/check-outputs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Conda environment for .github/scripts/check_outputs.py, which imports
# `jsonschema` and `yaml`; activated by .github/scripts/check_outputs.sh.
name: check-outputs
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python=3
- jsonschema=4.20.0
- pyyaml=6.0.1
73 changes: 73 additions & 0 deletions .github/scripts/check_outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python3

import argparse
import csv
import glob
import json
import urllib.request

from jsonschema import validate
import yaml


def check_provenance_format_valid(provenance_files, schema):
    """
    Check that the provenance files are valid according to the schema.

    :param provenance_files: Paths of the provenance YAML files to check.
    :type provenance_files: list[str]
    :param schema: Parsed JSON schema each provenance document must satisfy.
    :type schema: dict
    :return: True only when at least one file was given and every file parses
             and validates against the schema; False otherwise.
    :rtype: bool
    """
    import sys

    # No files at all means the pipeline wrote no provenance; passing
    # vacuously would hide exactly the failure this check exists to catch.
    if not provenance_files:
        print("No provenance files found to validate.", file=sys.stderr)
        return False

    for provenance_file in provenance_files:
        with open(provenance_file) as f:
            try:
                provenance = yaml.load(f, Loader=yaml.BaseLoader)
                validate(provenance, schema)
            except Exception as e:
                # Report why the file was rejected so a failing CI run can be
                # diagnosed from the log instead of only seeing "FAIL".
                print(f"Invalid provenance file {provenance_file}: {e}", file=sys.stderr)
                return False

    return True


def main(args):
    """
    Validate pipeline outputs and write a PASS/FAIL summary CSV.

    Downloads the pipeline provenance JSON schema, validates every
    ``*_provenance.yml`` file found (recursively) under
    ``args.pipeline_outdir`` against it, and writes one row per test to
    ``args.output``.

    :param args: Parsed command-line arguments; the ``pipeline_outdir`` and
                 ``output`` attributes are read.
    :type args: argparse.Namespace
    :raises SystemExit: With exit code 1 when at least one test did not pass.
    """
    provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
    provenance_schema_path = ".github/data/pipeline-provenance.json"
    urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)

    with open(provenance_schema_path) as f:
        provenance_schema = json.load(f)

    provenance_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
    provenance_files = glob.glob(provenance_files_glob, recursive=True)

    tests = [
        {
            "test_name": "provenance_format_valid",
            "test_passed": check_provenance_format_valid(provenance_files, provenance_schema),
        },
    ]

    output_fields = [
        "test_name",
        "test_result",
    ]

    # newline='' lets the csv module control line endings itself, as its
    # documentation requires for files passed to a writer.
    with open(args.output, 'w', newline='') as f:
        # extrasaction='ignore' drops the boolean "test_passed" key, which is
        # not one of the output columns.
        writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
        writer.writeheader()
        for test in tests:
            test["test_result"] = "PASS" if test["test_passed"] else "FAIL"
            writer.writerow(test)

    # Signal failure to the calling CI step only after the report is written.
    if not all(test["test_passed"] for test in tests):
        raise SystemExit(1)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Check outputs')
    # Both options are needed by main(); marking them required turns a
    # missing argument into a usage error instead of a late failure on None.
    parser.add_argument('--pipeline-outdir', type=str, required=True, help='Path to the pipeline output directory')
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to the output file')
    args = parser.parse_args()
    main(args)
10 changes: 10 additions & 0 deletions .github/scripts/check_outputs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Run the output checks inside the "check-outputs" conda environment and
# write the results CSV into the artifacts directory.

. "${HOME}/.bashrc"

eval "$(conda shell.bash hook)"

conda activate check-outputs

pipeline_outdir=".github/data/test_output"
results_csv="artifacts/check_outputs_results.csv"

.github/scripts/check_outputs.py \
    --pipeline-outdir "${pipeline_outdir}" \
    -o "${results_csv}"
3 changes: 3 additions & 0 deletions .github/scripts/create_art_environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Create the "art" conda environment used for short-read simulation.

env_file=".github/environments/art.yml"

conda env create --file "${env_file}"
3 changes: 3 additions & 0 deletions .github/scripts/create_badread_environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Create the "badread" conda environment used for long-read simulation.

env_file=".github/environments/badread.yml"

conda env create --file "${env_file}"
3 changes: 3 additions & 0 deletions .github/scripts/create_output_checking_environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Create the "check-outputs" conda environment used by check_outputs.sh.

env_file=".github/environments/check-outputs.yml"

conda env create --file "${env_file}"
10 changes: 10 additions & 0 deletions .github/scripts/create_plassembler_db.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Download the plassembler database into ./plassembler-db, using the conda
# environment created by create_plassembler_environment.sh.

. "${HOME}/.bashrc"

eval "$(conda shell.bash hook)"

plassembler_env="plassembler-3ac96e6e6413c7c411c19f45d1796cea"
db_dir="plassembler-db"

conda activate "${plassembler_env}"

plassembler download -d "${db_dir}"
7 changes: 7 additions & 0 deletions .github/scripts/create_plassembler_environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Pull the pipeline at its main revision and create its plassembler conda
# environment at a fixed prefix under ~/.conda/envs.

pipeline="BCCDC-PHL/plasmid-assembly"
env_file="${HOME}/.nextflow/assets/${pipeline}/environments/plassembler.yml"
env_prefix="${HOME}/.conda/envs/plassembler-3ac96e6e6413c7c411c19f45d1796cea"

nextflow pull "${pipeline}" -r main

conda env create --file "${env_file}" --prefix "${env_prefix}"
11 changes: 11 additions & 0 deletions .github/scripts/download_assemblies.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Download one reference assembly from the NCBI Datasets API and stage its
# genomic FASTA at .github/data/assemblies/<accession>.fa.

# Abort on the first failed step; otherwise the final cleanup would succeed
# and report a successful run even when the download or copy had failed.
set -euo pipefail

accession="GCF_002968455.1"
assembly_name="ASM296845v1"
data_dir=".github/data"
datasets_dir="${data_dir}/ncbi_datasets"
assemblies_dir="${data_dir}/assemblies"
archive="${datasets_dir}/${accession}.zip"
extract_dir="${datasets_dir}/${accession}"

mkdir -p "${datasets_dir}" "${assemblies_dir}"

# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error response body as if it were the zip archive.
curl --fail --location \
    -o "${archive}" \
    "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/${accession}/download?include_annotation_type=GENOME_FASTA,GENOME_GFF,SEQUENCE_REPORT"

unzip "${archive}" -d "${extract_dir}" && rm "${archive}"

cp "${extract_dir}/ncbi_dataset/data/${accession}/${accession}_${assembly_name}_genomic.fna" \
    "${assemblies_dir}/${accession}.fa"

# The extracted package is no longer needed once the FASTA has been copied.
rm -r "${datasets_dir}"
22 changes: 22 additions & 0 deletions .github/scripts/install_conda.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Install Miniconda under /opt/miniconda3, update conda, install mamba and
# initialise conda for bash. Progress is appended to artifacts/test.log.
set -eo pipefail

artifacts_dir="artifacts"
miniconda_dir="/opt/miniconda3"

echo "Install Miniconda .." >> "${artifacts_dir}/test.log"

export PATH="${miniconda_dir}/bin:${PATH}"

wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh

/bin/bash ~/miniconda.sh -b -p "${miniconda_dir}"

rm ~/miniconda.sh

# Later scripts source ~/.bashrc, so this path must match the install prefix
# above exactly.
echo ". ${miniconda_dir}/etc/profile.d/conda.sh" >> ~/.bashrc

# -y: the job is non-interactive, so nothing can answer a confirmation prompt.
conda update -y -n base -c defaults conda

conda install -y -c conda-forge mamba

conda init bash
11 changes: 11 additions & 0 deletions .github/scripts/install_nextflow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Fetch the Nextflow launcher and move it into /usr/local/bin.
# NOTE(review): the workflow sets NXF_VER for this step; presumably the
# installer uses it to select the Nextflow version -- confirm.

set -eo pipefail

log_file="artifacts/test.log"

echo "Install Nextflow .." >> "${log_file}"

wget --quiet --output-document=- https://get.nextflow.io | bash

sudo mv nextflow /usr/local/bin/
13 changes: 13 additions & 0 deletions .github/scripts/prepare_artifacts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Gather the simulated short-read FASTQ files and the pipeline outputs under
# the artifacts directory so they can be uploaded by the workflow.

artifacts_dir="artifacts"
fastq_dest="${artifacts_dir}/fastq"
outputs_dest="${artifacts_dir}/pipeline_outputs"

echo "Prepare artifacts .." >> "${artifacts_dir}/test.log"

mkdir -p "${fastq_dest}"
mv .github/data/fastq/*.fastq.gz "${fastq_dest}"

mkdir -p "${outputs_dest}"
mv .github/data/test_output/* "${outputs_dest}"
19 changes: 19 additions & 0 deletions .github/scripts/run_pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Run the pipeline on the simulated short and long reads with the conda
# profile, writing outputs plus Nextflow report and trace to
# .github/data/test_output.

set -eo pipefail

# Rewrite 8-, 12- and 16-CPU requests in nextflow.config down to 4.
sed -i \
    -e 's/cpus = 8/cpus = 4/g' \
    -e 's/cpus = 12/cpus = 4/g' \
    -e 's/cpus = 16/cpus = 4/g' \
    nextflow.config

export TERM=linux

data_dir=".github/data"
outdir="${data_dir}/test_output"

nextflow run main.nf \
    -profile conda \
    --cache "${HOME}/.conda/envs" \
    --fastq_input "${data_dir}/fastq" \
    --fastq_input_long "${data_dir}/fastq_long" \
    --db plassembler-db \
    --outdir "${outdir}" \
    -with-report "${outdir}/nextflow_report.html" \
    -with-trace "${outdir}/nextflow_trace.tsv"
26 changes: 26 additions & 0 deletions .github/scripts/simulate_long_reads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Simulate long reads with badread for every "sample_id,assembly" row of
# .github/data/reads_to_simulate.csv, writing gzipped FASTQ files to
# .github/data/fastq_long/<sample_id>_RL.fastq.gz.

source "${HOME}/.bashrc"

eval "$(conda shell.bash hook)"

conda activate badread

# Fail fast from here on so a failed simulation stops this step instead of
# leaving an empty FASTQ for the pipeline to trip over later. Enabled only
# after the conda setup above, which is not written with errexit in mind.
set -eo pipefail

reads_csv=".github/data/reads_to_simulate.csv"
out_dir=".github/data/fastq_long"

mkdir -p "${out_dir}"

# The `|| [[ -n ... ]]` clause still processes a final row that has no
# trailing newline, which plain `read` would report as end-of-file.
while IFS=',' read -r sample_id assembly || [[ -n "${sample_id}" ]]; do
    # Skip blank lines rather than calling badread with an empty reference.
    if [[ -z "${sample_id}" ]]; then
        continue
    fi

    badread simulate \
        --seed 42 \
        --reference "${assembly}" \
        --length 50000,5000 \
        --quantity 10x \
        --junk_reads 1 \
        --random_reads 1 \
        --chimeras 1 \
        > "${out_dir}/${sample_id}_RL.fastq"

    gzip -f "${out_dir}/${sample_id}_RL.fastq"

done < "${reads_csv}"

35 changes: 35 additions & 0 deletions .github/scripts/simulate_short_reads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Simulate paired-end short reads with art_illumina for every
# "sample_id,assembly" row of .github/data/reads_to_simulate.csv, writing
# .github/data/fastq/<sample_id>_R1.fastq.gz and <sample_id>_R2.fastq.gz.

source "${HOME}/.bashrc"

eval "$(conda shell.bash hook)"

conda activate art

# Fail fast from here on so a failed simulation stops this step. Enabled only
# after the conda setup above, which is not written with errexit in mind.
set -eo pipefail

reads_csv=".github/data/reads_to_simulate.csv"
out_dir=".github/data/fastq"

mkdir -p "${out_dir}"

# The `|| [[ -n ... ]]` clause still processes a final row that has no
# trailing newline, which plain `read` would report as end-of-file.
while IFS=',' read -r sample_id assembly || [[ -n "${sample_id}" ]]; do
    # Skip blank lines rather than calling art_illumina with an empty input.
    if [[ -z "${sample_id}" ]]; then
        continue
    fi

    prefix="${out_dir}/${sample_id}_R"

    art_illumina \
        --paired \
        --in "${assembly}" \
        --fcov 12 \
        --len 150 \
        --mflen 400 \
        --sdev 100 \
        --rndSeed 42 \
        --qShift 0 \
        --qShift2 0 \
        --out "${prefix}"

    for mate in 1 2; do
        # The alignment files are not needed; only the reads are kept.
        rm -f "${prefix}${mate}.aln"
        mv "${prefix}${mate}.fq" "${prefix}${mate}.fastq"
        gzip -f "${prefix}${mate}.fastq"
    done

done < "${reads_csv}"

56 changes: 56 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# CI workflow: simulates short and long reads from a reference assembly, runs
# the pipeline on them under each Nextflow version in the matrix, checks the
# outputs and uploads the collected artifacts.
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
  workflow_dispatch:
name: Tests
jobs:
  test:
    strategy:
      # Let the other Nextflow version finish even if one fails.
      fail-fast: false
      matrix:
        nextflow_version: ["21.04.3", "23.10.1"]
    name: Run tests
    runs-on: ubuntu-latest
    steps:
      # Pinned to a released major version rather than a moving branch.
      - uses: actions/checkout@v4
      - name: Create Artifacts Directory
        run: mkdir artifacts
      - name: Install Miniconda
        run: bash .github/scripts/install_conda.sh
      - name: Install Nextflow
        env:
          NXF_VER: ${{ matrix.nextflow_version }}
        run: bash .github/scripts/install_nextflow.sh
      - name: Create ART Short Read Simulation Environment
        run: bash .github/scripts/create_art_environment.sh
      - name: Create Badread Long Read Simulation Environment
        run: bash .github/scripts/create_badread_environment.sh
      - name: Download Assemblies
        run: bash .github/scripts/download_assemblies.sh
      - name: Simulate Short Reads
        run: bash .github/scripts/simulate_short_reads.sh
      - name: Simulate Long Reads
        run: bash .github/scripts/simulate_long_reads.sh
      - name: Create plassembler environment
        run: bash .github/scripts/create_plassembler_environment.sh
      - name: Create plassembler db
        run: bash .github/scripts/create_plassembler_db.sh
      - name: Run Pipeline
        run: bash .github/scripts/run_pipeline.sh
      - name: Create Output Checking Environment
        run: bash .github/scripts/create_output_checking_environment.sh
      - name: Check Outputs
        run: bash .github/scripts/check_outputs.sh
      # The two steps below run even after a failure so logs and partial
      # outputs remain available for debugging.
      - name: Prepare Artifacts
        if: always()
        run: bash .github/scripts/prepare_artifacts.sh
      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: artifacts-BCCDC-PHL-plasmid-assembly-nextflow-v${{ matrix.nextflow_version }}-${{ github.run_id }}.${{ github.run_attempt }}
          path: artifacts
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
[![Tests](https://github.com/BCCDC-PHL/plasmid-assembly/actions/workflows/tests.yml/badge.svg)](https://github.com/BCCDC-PHL/plasmid-assembly/actions/workflows/tests.yml)

# plasmid-assembly

A pipeline for high-quality plasmid assemblies.

Optionally annotate genes. Collects quality info on both incoming and outgoing datasets.
Expand Down
2 changes: 1 addition & 1 deletion modules/provenance.nf
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,6 @@ process pipeline_provenance {
"""
printf -- "- pipeline_name: ${pipeline_name}\\n" >> pipeline_provenance.yml
printf -- " pipeline_version: ${pipeline_version}\\n" >> pipeline_provenance.yml
printf -- "- timestamp_analysis_start: ${analysis_start}\\n" >> pipeline_provenance.yml
printf -- " timestamp_analysis_start: ${analysis_start}\\n" >> pipeline_provenance.yml
"""
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def parsePipelineName(name) {

profiles {
conda {
conda.enabled = true
process.conda = "$baseDir/environments/environment.yml"
if (params.cache){
conda.cacheDir = params.cache
Expand Down

0 comments on commit b759ac7

Please sign in to comment.