implement automated unit testing

BisanzLab · Aug 6, 2024 · 3bd3997 · 3bd3997
1 parent 3a16b38
commit 3bd3997
Show file tree

Hide file tree

Showing 22,182 changed files with 79,087 additions and 21,907 deletions.
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -0,0 +1,57 @@
+name: Tests  # CI workflow: runs the three independent test jobs defined under "jobs"
+
+on:  # triggered by pushes to main and by pull requests targeting main
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+  test_preprocessr:  # runs the "test" target of the Makefile in src/
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: make test
+        run: make -C src test
+
+  test_strainr:  # runs src/Plot.R on tests/inputs and diffs its abundance summary against the expected file
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - uses: r-lib/actions/setup-r@v2
+    - uses: r-lib/actions/setup-r-dependencies@v2  # installs the R packages listed under "packages"
+      with:
+        cache-version: 1
+        r-version: 'release'  # NOTE(review): looks like an input of setup-r rather than setup-r-dependencies — confirm against the action docs
+        packages: |
+          any::tidyverse
+          any::optparse
+
+    - name: Plot.R test  # a non-empty diff fails the step; the generated file is removed only when the diff succeeds
+      run: |
+        ./src/Plot.R -a tests/inputs/ -i tests/inputs/ -p testing
+        diff tests/inputs/testing_abundance_summary.tsv \
+          tests/expected_output/expected_abundance_summary.tsv
+        rm tests/inputs/testing_abundance_summary.tsv
+      shell: bash
+
+  test_comprehensive:  # end-to-end run: PreProcessR, then StrainR; outputs are sorted and diffed against expected files
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: conda-incubator/setup-miniconda@v3
+      - name: test run comp  # installs strainr2 from bioconda, then force-removes it — presumably to keep its dependencies while testing the locally built src/ binaries (confirm)
+        shell: bash -el {0}  # login shell; presumably needed so the conda environment from setup-miniconda is active — confirm
+        run: |
+          conda install -c bioconda -c conda-forge -y strainr2
+          conda remove --force -y strainr2
+          make -C src release
+          export PATH="$(pwd)/src/:$PATH"
+          PreProcessR -i tests/genomes/mixture -o tests/StrainR2DB_testing
+          diff <(sort tests/StrainR2DB_testing/KmerContent.report) <(sort tests/expected_output/KmerContent_comprehensive.report)
+          StrainR -1 tests/inputs/mock_reads_testing_R1.fastq.gz \
+            -2 tests/inputs/mock_reads_testing_R2.fastq.gz \
+            -r tests/StrainR2DB_testing -o tests/StrainR2_out_testing -p testing -t 1
+          diff <(sort tests/StrainR2_out_testing/testing.abundances) <(sort tests/expected_output/testing_comprehensive.abundances)
+          diff <(sort tests/StrainR2_out_testing/testing_abundance_summary.tsv) <(sort tests/expected_output/abundance_summary_comprehensive.tsv)
diff --git a/README.md b/README.md
@@ -1,3 +1,6 @@
+[![Testing](https://github.com/BisanzLab/StrainR2/actions/workflows/testing.yml/badge.svg)](https://github.com/BisanzLab/StrainR2/actions/workflows/testing.yml)
+[![StrainR2 Version](https://anaconda.org/bioconda/strainr2/badges/version.svg)](https://anaconda.org/bioconda/strainr2)
+[![Downloads](https://anaconda.org/bioconda/strainr2/badges/downloads.svg)](https://anaconda.org/bioconda/strainr2)
 # Background
 
 Traditional methods for quantifying strain abundances in a microbiome, such as 16S rRNA sequencing, lack the resolution to differentiate strains and are limited to generalizing species. Shotgun metagenomic sequencing offers an alternative, but unnormalized abundances such as FPKM have a bias from similar genomes getting fewer unique mappings. 
@@ -32,11 +35,7 @@ To install the source code into a directory onto your computer, clone the source
 git clone https://github.com/BisanzLab/StrainR2.git
 ```
 
-Dependencies need to be installed according to versions listed at the bottom of this document. A .yml file provided in the git repository can be used to create an environment from scratch. 
-```
-conda env create -f StrainR2/strainr2.yml
-conda activate strainr2
-```
+Dependencies need to be installed according to versions listed at the bottom of this document.
 
 Files can be compiled using make
 ```

diff --git a/src/PreProcessR b/src/PreProcessR
@@ -73,7 +73,7 @@ ls "$outdir"/Subcontigs/ | sed -n '/\.subcontig$/p' | sed 's|^|'"$outdir"'/Subco
   xargs cat > "$outdir"/BBindex/BBIndex.fasta
 ls "$outdir"/excludedSubcontigs/ | sed -n '/\.subcontig$/p' | sed 's|^|'"$outdir"'/excludedSubcontigs/|' | \
   xargs cat >> "$outdir"/BBindex/BBIndex.fasta
-bbmap.sh ref="$outdir"/BBindex/BBIndex.fasta path="$outdir"/BBindex
+bbmap.sh ref="$outdir"/BBindex/BBIndex.fasta path="$outdir"/BBindex deterministic=t averagepairdist=200
 
 echo "PreProcessR complete"
 echo "Total Run Time: $((($SECONDS - $START_TIME)/60)) min $((($SECONDS - $START_TIME)%60)) sec"  

diff --git a/src/StrainR b/src/StrainR
@@ -85,7 +85,7 @@ bbmap.sh\
   ref="$reference"/BBindex/BBIndex.fasta \
   out="$outdir"/"$prefix".sam \
   rpkm="$outdir"/"$prefix".rpkm \
-  threads="$threads" \
+  threads="$threads" deterministic=t averagepairdist=200 \
   -Xmx"$mem"g \
   perfectmode=t local=f ambiguous=toss pairedonly=t nodisk=t
 

diff --git a/src/hashcounter.c b/src/hashcounter.c
@@ -16,6 +16,7 @@ hashtable* hashtable_create(uint32_t kmer_size, bool is_small, uint32_t num_subc
     ht->subcontig_names = calloc(num_subconts,sizeof(char*));
     ht->subcontig_counts = calloc(num_subconts,sizeof(int));
     ht->num_subcontigs = num_subconts;
+    ht->curr_subcontig = 0;
     ht->size = INITIAL_HT_SIZE;
     ht->count = 0;
     ht->entry_bitmask = INITIAL_HT_BITMASK;
@@ -338,12 +339,11 @@ void hash_and_insert(hashtable* ht, char* dir_location, void (*kmer_func)(hashta
     struct dirent *de;
     DIR *dr = opendir(dir_location);
     if(dr == NULL) {
-        fprintf(stderr, "Could not open excluded subcontigs directory\n\n");
+        fprintf(stderr, "Could not open subcontigs directory\n\n");
         exit(EXIT_FAILURE);
     }
     char* subcont_location;
     char* subcont_name;
-    uint32_t subcont_id;
     while (((de = readdir(dr)) != NULL)) {
         if(!(strlen(de->d_name) >= 10 && strcmp(&de->d_name[strlen(de->d_name) - 10], ".subcontig") == 0)) continue;
         uint32_t loc_size = strlen(dir_location)+strlen(de->d_name)+1;
@@ -357,10 +357,6 @@ void hash_and_insert(hashtable* ht, char* dir_location, void (*kmer_func)(hashta
         }
         seq = kseq_init(fp);
         kseq_read(seq);
-        memcpy(subcont_location, de->d_name, strlen(de->d_name)+1);
-        strtok(subcont_location, ".");
-        strtok(subcont_location, "_");
-        subcont_id = atoi(strtok(NULL,"_"));
         if(seq->comment.s != NULL){
             subcont_name = calloc(strlen(seq->name.s)+strlen(seq->comment.s)+2, sizeof(char));
             memcpy(subcont_name, seq->name.s, strlen(seq->name.s));
@@ -370,8 +366,9 @@ void hash_and_insert(hashtable* ht, char* dir_location, void (*kmer_func)(hashta
             subcont_name = calloc(strlen(seq->name.s)+1, sizeof(char));
             memcpy(subcont_name, seq->name.s, strlen(seq->name.s));
         }
-        ht->subcontig_names[subcont_id] = subcont_name;
-        hash_and_insert_subcontig(ht, seq->seq.s, subcont_id, kmer_func);
+        ht->subcontig_names[ht->curr_subcontig] = subcont_name;
+        hash_and_insert_subcontig(ht, seq->seq.s, ht->curr_subcontig, kmer_func);
+        ++ht->curr_subcontig;
         free(subcont_location);
         gzclose(fp);
         kseq_destroy(seq);
@@ -463,7 +460,7 @@ int main(int argc, char **argv){
         return EXIT_FAILURE;
     }
     fprintf(kmercontent,"SubcontigID\tStrainID\tContigID\tStart_Stop\tLength\tNunique\n");
-    uint32_t i=1;
+    uint32_t i=0;
     while(ht->subcontig_names[i]!=NULL){
         fprintf(kmercontent,"%s\t", ht->subcontig_names[i]);
         subcontig_info = strtok(ht->subcontig_names[i], ";");

diff --git a/src/hashcounter.h b/src/hashcounter.h
@@ -47,6 +47,7 @@ typedef struct hashtable{
     uint64_t count;
     uint32_t* subcontig_counts;
     uint32_t num_subcontigs;
+    uint32_t curr_subcontig;
     uint32_t kmer_size;
     ht_element_small* items_small; // for use in memory-efficient option
     bool is_small;

diff --git a/src/subcontig.c b/src/subcontig.c
@@ -20,14 +20,6 @@
     "\t\t-e number\t: exclude subcontig size (minimum subcontig size) [Default = 10000]\n"                                                           \
     "\t\t-h\t\t: display this message again\n"
 
-// for testing purposes this code was compiled with:
-// gcc -g -fsanitize=address -std=gnu99 -Wall -Wextra -Werror -Wno-unused-function -Wno-unused-parameter -O0 -o subcontig subcontig.c
-
-// the provided binary was compiled using:
-// gcc -o subcontig subcontig.c
-
-int subcontigCount = 0;
-
 // save a sequence and appropriate header information to outdir (or excludedSubcontigs if it is less than minSubcontigSize)
 void saveSubcontig(char *outdir, char *subcontigName, char *strainID, char *subcontigSeq, int start, int length, char *overlap);
 // subcontig a genome and write the sequences to outdir
@@ -331,7 +323,6 @@ void writeSubcontigs(char *outdir, char *excludeDir, char *genomeLocation, char
 // (called by writeSubcontigs)
 void saveSubcontig(char *outdir, char *subcontigName, char *strainID, char *subcontigSeq, int start, int length, char *overlap) {
     FILE *fptr = NULL;
-    ++subcontigCount;
     char *seq = NULL;
     int overlapLen = 0;
 
@@ -349,9 +340,9 @@ void saveSubcontig(char *outdir, char *subcontigName, char *strainID, char *subc
     char *savedSubcontigName = calloc(needed, sizeof(char));
     sprintf(savedSubcontigName, ">%s;%s;%d_%d;%d", strainID, subcontigName, start - overlapLen, start + length - 1, length + overlapLen);
 
-    needed = snprintf(NULL, 0, "%ssubcontig_%d.subcontig", outdir, subcontigCount) + 1;
+    needed = snprintf(NULL, 0, "%s/%s_%d_%d.subcontig", outdir, strainID, start - overlapLen, start + length - 1) + 1;
     char *subcontigLocation = calloc(needed, sizeof(char));
-    sprintf(subcontigLocation, "%ssubcontig_%d.subcontig", outdir, subcontigCount);
+    sprintf(subcontigLocation, "%s/%s_%d_%d.subcontig", outdir, strainID, start - overlapLen, start + length - 1);
 
     fptr = fopen(subcontigLocation, "w");
     if (fptr == NULL) {

diff --git a/strainr2.yml b/strainr2.yml
@@ -1,4 +1,3 @@
-name: strainr2
 channels:
   - bioconda
   - conda-forge
@@ -198,7 +197,6 @@ dependencies:
   - readline=8.2
   - samtools=1.20
   - sed=4.8
-  - strainr2=2.0.0
   - sysroot_linux-64=2.12
   - tk=8.6.13
   - tktable=2.10