#RiboTaxa_arguments.conf

##The configuration file is very important and each parameter needs to be filled to avoid errors.
##Mandatory parameters are denoted by **. Most of them depend on your computer's resources (CPUs, RAM) and on your sequencing data.
#Other tool parameters have been optimized and can be left at default. 

#-----------------------
#Setting up directories
#-----------------------

[BASE]
####set up RiboTaxa directory **
RiboTaxa_DIR = /home/user/RiboTaxa

####set up data directory containing only raw reads in fastq/fastq.gz format **
###paired-end files should be metagenome_R1.fastq/fastq.gz and metagenome_R2.fastq/fastq.gz
###single-end file should be metagenome.fastq/fastq.gz
DATA_DIR = /home/user/Documents/raw_reads

####format of your paired end files **
## fastq : if files are not compressed
## fastq.gz: if files are compressed in gz format
FORMAT = fastq

####set up output directory **
OUTPUT = /home/user/Documents/RiboTaxa_results

####number of threads/CPUS to be used through the pipeline **
THREAD = 8

[BBMAP]
####RAM limit to be used by BBTOOLS/BBMAP during quality control and mapping **
##value in GB; depends on the computer's RAM. Use approx. 80% of the available RAM
##example: Available RAM = 16GB, therefore RAM = 80/100*16 = 12.8, rounded down to 12GB
RAM = 12

#-----------------------------
#Quality control using BBTOOLS
#-----------------------------

####Trim reads to remove bases matching adapter sequences (Default value = r)
## f (don't trim)
## r (trim to the right)
## l (trim to the left)
##In ktrim=r mode, once a reference kmer is matched in a read, 
##that kmer and all the bases to the right will be trimmed, leaving 
##only the bases to the left; this is the normal mode for adapter trimming.

ktrim = r

####Kmer length used for finding adapters (Default value = 21)
kmer = 21

####Reads shorter than this length (bases) after trimming will be discarded (Default value = 60)
minlength = 60

#Regions with average quality BELOW this will be trimmed (Default value = 20)
trimq = 20

####Trim read ends to remove bases with quality below trimq (Default value = rl)
# rl (trim both ends), 
# f (neither end), 
# r (right end only), 
# l (left end only),
# w (sliding window)

qtrim = rl

####reads with more Ns than this (after trimming) will be discarded (Default value = 1)
maxns = 1

#------------------------------------
#Filter 16S/18S reads using SortMeRNA
#------------------------------------

####indexed database directory for sortmerna **
##This directory should contain one .clustered.fasta and several .clustered. files
SORTMERNA_DB = /home/user/Documents/Databases/sortmerna_indexed_DB


#----------------------------------------------------------
#Reconstructing 16S/18S sequences using EMIRGE and MetaRIB
#----------------------------------------------------------

[EMIRGE]

####set up database directory containing indexed files for emirge **
##This directory should contain one fasta file and several .ebwt files
EMIRGE_DB = /home/user/Documents/Databases/bowtie_indexed_DB

####length of longest reads **
MAX_LENGTH = 300

####identity threshold (Default value = 1)
##This is the JOIN_THRESHOLD parameter of EMIRGE. If two candidate sequences
##share >= this fractional identity over their bases with mapped reads, then
##the two sequences are merged into one for the next iteration. When fixed to 1, sequence
##reconstruction gave the best results on controlled samples (mock or synthetic communities).
IDENTITY = 1 

####number of iterations (Default value = 40)
##Number of iterations to perform by EMIRGE for sequence reconstruction.
##The default value fits most metagenomic data. The EMIRGE authors recommend increasing this value for complex communities.
NUM_ITERATION = 40

####mean insert size **
##Insert size distribution mean
MEAN_INSERT_SIZE = 300 

####standard deviation **
##Insert size distribution standard deviation
STD_DEV = 100 

####minimum fraction of the length of a candidate
##reference sequence that must be covered by mapped
##reads (Default=0.3) Range [0.0,1.0]
MIN_COV = 0.3

[METARIB]
####Subsampling reads number in each iteration (Default=1000000)
SAMPLING_NUM = 1000000

#-----------------------------------------------------------
#Taxonomic classification using sklearn_classifier of qiime2
#-----------------------------------------------------------

####Set up path+database name for sklearn classifier **
SKLEARN_DB = /home/user/Documents/Databases/qiime2020.8_silva138/silva-138-99-nb-classifier.qza

####Confidence threshold for limiting taxonomic depth (default = 0.7)
##This threshold ensures qualitative affiliation of reconstructed sequences.
##We do not recommend lowering this value below 0.7.
CONFIDENCE = 0.7

#Number of reads to process in each batch (default = auto: 20 000 sequences)
#use BATCH = 1 if you have less than 16GB RAM to avoid errors
BATCH = auto