-
Notifications
You must be signed in to change notification settings - Fork 0
/
blast_sequences.sh
102 lines (87 loc) · 4.22 KB
/
blast_sequences.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env bash
#Author: Olabiyi Aderemi Obayomi
#Created: February 2018
#this script generates a list of whatever you want at anything between 90 - 100% identity to sequences in your given database
#make sure you pass the treatments to be analysed as arguements to the script seperated by spaces
#make sure you add QIIME to your $PATH variable
#the version of QIIME used for the analysis was QIIME 1.9.1
#the version of BLAST used for the analysis was ncbi-blast-2.7.1
#enter the necessary files and save them in variables
echo "what are you searching for: "
read search
echo "Enter the full path to the list of what to search for: "
read pathogens
echo "Enter the full path to the mapping file: "
read mapping_file
echo "Enter the full path to the blast database: "
read database
echo "Enter the full path to the fasta file of what you want to search for: "
read fasta
echo "Enter the column name in your mapping file to be analysed: "
read colName
echo "What will you like to call the file that will contain your combined final results? "
read results_together
echo "Which identity threshold do you want? "
echo "options are 0 to 10 for 90 to 100% identity, respectively"
read ID
#setting the variables
seqs=_$search.fasta
blast_out=_organisms.blast
list=_$search.list
organism_list=_organisms.list
unique=_unique
identity=_identity
final=final_
mappping_file=$(cat $mapping_file)
analyses=_Analysis
#run a blast search on every treatment passed as arguements to the script
#e.g bash blast_sequences intact bussel potable water will analyse intact bussel potable and water
for treatment; do
#remove the directory if it already exists and make a new directory for the treatment and then change to the directory
rm -rf $treatment$analyses
mkdir $treatment$analyses
cd $treatment$analyses
#Extract sequences that belong to the treatment
extract_seqs_by_sample_id.py -i $fasta -o $treatment$seqs \
-m $mapping_file -s "$colName:$treatment"
echo "blasting... $treatment"
#blast the treatments's sequences against any blast formatted database, retrieving only the best hit
blastn -db $database -query $treatment$seqs -outfmt 0 \
-num_descriptions 1 -num_alignments 1 -out $treatment$blast_out
#get the list of organisms from the blast output with their identities
grep ">" $treatment$blast_out > $treatment$organism_list
#get the list of the organisms' identity scores
grep -E "Identities" $treatment$blast_out > $treatment$identity
#paste the organism list and Identities together
paste -d" " $treatment$organism_list $treatment$identity > $treatment$organism_list$identity
# get the list of organisms with identity scores greater than ID or 100%
if [ $ID -eq 10 ]; then
grep -E "(100%)" $treatment$organism_list$identity > $treatment$identity$organism_list
else
grep -E "(9[$ID-9]%)|(100%)" $treatment$organism_list$identity > $treatment$identity$organism_list
fi
#create a file that will contain the list of what you are looking for
touch $treatment$identity$list
#search for what you are looking for within the organisms list
grep -wFf $pathogens $treatment$identity$organism_list > $treatment$identity$list
##get the pathogen list and parse it for readability i.e. without the sequence assertion number and removing anything 16S or 18S
cut -d" " -f2-6 $treatment$identity$list |sed -e 's/1[68]S//g'|sort > $final$treatment$list
#get the unique list
sort -u $final$treatment$list > $treatment$unique$list
#return to the parent directory
cd ..
done
##### organize the output files ##################
#remove the folders containing the combined together if they already exist
rm -rf $results_together
rm -rf unique
#create a directory that will contain the final results
mkdir $results_together
mkdir unique
#put all the final and unique results files in one folder
find . -type f -name "final_*" -exec mv -vuni '{}' "$results_together" ";" 2> error.log
find . -type f -name "*unique*" -exec mv -vuni '{}' "unique/" ";" 2>> error.log
#change the extensions of the final and unique lists from .list to .txt so that they may be viewed
#with a text editor outside unix such as notepad
rename .list .txt $results_together/*
rename .list .txt unique/*