-
Notifications
You must be signed in to change notification settings - Fork 2
/
pipeline.sh
executable file
·177 lines (142 loc) · 8.43 KB
/
pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/bash
# For reliable, robust and maintainable bash scripts, start with following commands
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
# set -euo pipefail
# IFS=$"\n\t"
#########################################################################################
# HEADER
#########################################################################################
#%
#% DESCRIPTION
#
#% This bash file that stiches together the preprocessing steps for RNA-seq sequences
#% from 3"RNA-seq project.
#%
#% USAGE and EXAMPLE
#% sh ${SCRIPT_NAME} [-h] inputs max_thread_count
#% ./${SCRIPT_NAME} [-h] inputs max_thread_count
#%
#% OPTIONS
#% inputs: Directory containing all raw sequence FASTQ files, this will be updated in the
#% next version
#% threads: Maximum number of threads to use for processing samples. Maximum thread
#% count can be calculated as the product of Thread(s) per core, Core(s) per
#% socket, and Socket(s) obtained from lscpu command.
#
#% NOTE: Care should be taken to supply full or relative path to the script file (this
#% file) and input directory.
#%
# ---------------------------------------------------------------------------------------
#+ SCRIPT INFORMATION
#
#+ VERSION: 0.6.0
#+ AUTHOR: Akshay Paropkari
#+ LICENSE: MIT
#
# ---------------------------------------------------------------------------------------
#% REFERENCES
#
#% 1. BBTools Suite - https://jgi.doe.gov/data-and-tools/bbtools/
#% 2. FastQC - http://www.bioinformatics.babraham.ac.uk/projects/fastqc/
#% 3. STAR - manual: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf
#%
#########################################################################################
# END_HEADER
#########################################################################################
# Help output
#== needed variables ==#
SCRIPT_HEADSIZE=$(head -50 "${0}" |grep -n "^# END_HEADER" | cut -f1 -d:)
SCRIPT_NAME=$(basename "${0}")
#== usage functions ==#
usagefull() { head -"${SCRIPT_HEADSIZE}" "${0}"| grep -e "^#%" | sed -e "s/^#[%+]\ //g" -e "s/\${SCRIPT_NAME}/${SCRIPT_NAME}/g" ; }
scriptinfo() { head -"${SCRIPT_HEADSIZE}" "${0}" | grep -e "^#+" | sed -e "s/^#+\ //g" -e "s/\${SCRIPT_NAME}/${SCRIPT_NAME}/g"; }
if [[ "${1}" = "-h" ]] || [[ "${1}" = "--help" ]] ; then
usagefull
scriptinfo
exit 0
fi
# Testing for input
if [[ -z "${1}" ]] ; then
echo -e "Input directory not supplied. Please supply a directory with raw FASTQ files"
exit 1
fi
if [[ -z "${2}" ]] ; then
echo -e "Maximum thread count to use is not specified. Please provide a value for number of threads."
exit 1
fi
# Changing working directory to input directory
# print the directory name which is being processed
echo -e "\n$(date '+%a %D %r') | Input directory identified as ${1}"
# cd into each directory in the directory
cd "${1}" && echo -e "$(date '+%a %D %r') | Switched into ${1}" || { echo -e "$(date '+%a %D %r') | cd into input directory failed! Please check your working directory."; exit 1; }
# Collect adapter files
truseq=$(find ~ -type f -name "*truseq.fa.gz" -exec realpath {} \;) && echo -e "$(date '+%a %D %r') | Identified TruSeq Fasta file - ${truseq}"
polya=$(find ~ -type f -name "*polyA.fa.gz" -exec realpath {} \;) && echo -e "$(date '+%a %D %r') | Located polyA Fasta file - ${polya}"
genome_dir=$(find ~ -type d -name "ca_genome" -exec realpath {} \;) && echo -e "$(date '+%a %D %r') | Located genome directory - ${genome_dir}"
# Starting preprocessing
for f in $(find . -type f -name "*.fastq*")
do
in_file=$(basename "${f}")
sample_id=$(basename "${f}" | cut -d_ -f1)
echo -e "$(date "+%a %D %r") | Processing ${sample_id}"
# •••••••••••••••••••••••••••••••••• #
# Step 1: Adapter and polyA trimming #
# •••••••••••••••••••••••••••••••••• #
echo -e "$(date "+%a %D %r") | Adapter and polyA trimming"
# Create output file name and command to run
trimmed_file="${sample_id}"_trimmed.fastq
CMD1="bbduk.sh in=$in_file out=$trimmed_file ref=$truseq,$polya k=13 ktrim=r mink=5 qtrim=r trimq=20 minlength=20"
# Echo and run command
echo "$(date "+%a %D %r") | ${CMD1}"
$CMD1
# •••••••••••••••••••••••••••••••••••••••••••••••••••••••••••• #
# Step 2: Fastqc generates a report of sequencing read quality #
# •••••••••••••••••••••••••••••••••••••••••••••••••••••••••••• #
echo -e "$(date "+%a %D %r") | Read quality assessment"
# Create directory to save the quality reports, one per FASTQ file and create command to run
mkdir -p "${sample_id}"_qc
CMD2="fastqc -t ${2} --nogroup -o ${sample_id}_qc ${trimmed_file}"
# Echo and run command
echo "$(date "+%a %D %r") | ${CMD2}"
$CMD2
# ••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••• #
# Step 3: STAR aligns to the reference genome and calculates gene counts #
# ••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••••• #
echo -e "$(date "+%a %D %r") | Aligning QC reads to Candida albicans A21 genome"
# Create command to run
sample_id=$(basename "${f}" .fastq | cut -d_ -f1)
CMD3="STAR --runThreadN ${2} --genomeDir ${genome_dir} --readFilesIn ${trimmed_file} --outFilterType BySJout --outFilterMultimapNmax 25 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.3 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMattributes NH HI NM MD --outSAMtype BAM SortedByCoordinate --quantMode GeneCounts --outFileNamePrefix ${sample_id} > $(basename "${f}" .fastq)_alignment.log"
# Echo and run command
echo "$(date "+%a %D %r") | ${CMD3}"
$CMD3
done
# •••••••••••••••••••••••••••••••• #
# Collect all counts into one file #
# •••••••••••••••••••••••••••••••• #
echo -e "$(date "+%a %D %r") | Collecting gene counts for all samples"
# Create command to run
format_counts=$(find ~ -type f -name "format_counts_table.py" -exec realpath {} \;) && echo -e "$(date '+%a %D %r') | Located gene count table formatting script - ${format_counts}"
CMD4="${format_counts} ${1} -o ${1}"
# Echo and run command
echo "$(date "+%a %D %r") | ${CMD4}"
$CMD4
# •••••••••••••••••••••••••••••• #
# Organize files created by STAR #
# •••••••••••••••••••••••••••••• #
# Move log files into a directory
mkdir -p ./STAR_log ./trim_log
mv -t STAR_log/ ./*Log.out ./*Log.progress.out ./*Log.final.out ./*.out.bam ./*.out.tab && echo -e "$(date '+%a %D %r') | Moved STAR intermediate files"
mv -t trim_log/ *_trimmed.fastq && echo -e "$(date '+%a %D %r') | Moved intermediate adapter trimming files"
# •••••••••••••••••••••••••••••••••••••••••••••••• #
# Run differential expression analysis with DESeq2 #
# •••••••••••••••••••••••••••••••••••••••••••••••• #
echo -e "$(date "+%a %D %r") | Running differential expression analysis using DESeq2"
METADATA=$(find . -type f -name "*etadata*" -exec realpath {} \;) && echo -e "$(date '+%a %D %r') | Located metadata file - ${METADATA}"
GENE_COUNTS=$(find . -type f -name "gene_raw_counts.txt" -exec realpath {} \;) && echo -e "$(date '+%a %D %r') | Located gene count matrix file - ${GENE_COUNTS}"
DESEQ2=$(find ~ -type f -name "deseq.R" -exec realpath {} \;) && echo -e "$(date '+%a %D %r') | Located DESEq2 Rscript file - ${DESEQ2}"
CMD5="${DESEQ2} ${GENE_COUNTS} ${METADATA} ./deseq2_lfc.txt ./MA_plot.pdf"
# Echo and run command
echo "$(date "+%a %D %r") | ${CMD5}"
$CMD5
## REMOVE ME
rm -rv *log *qc && echo -e "$(date "+%a %D %r") | Removed intermediate log and QC files"