-
Notifications
You must be signed in to change notification settings - Fork 0
/
sequences.py
120 lines (95 loc) · 3.02 KB
/
sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from __future__ import print_function
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Data import IUPACData
testmode = 0 # is the script in testmode? 0=no 1=yes
# Required: directory "hg19" with fasta files per chromosome (from UCSC)
def readGenome():
"""
Read all chromosomes and put sequence into dictionary "genome"
"""
genome = dict()
chromosomes = range(1, 23) + ["X", "Y", "M"]
# chromosomes = [1, 3]
for chrom in chromosomes:
fastaFile = "hg19/chr" + str(chrom) + ".fa"
for record in SeqIO.parse(open(fastaFile, "rU"), "fasta"):
genome[record.id] = str(record.seq).upper()
return genome
def readFasta(f):
'''
Description: Read fasta file and put sequences in a dictionary
In: f (filename)
Out: sequences[acc] = sequence
'''
sequences = dict()
for record in SeqIO.parse(open(f), "fasta"):
sequences[record.id] = str(record.seq).upper()
return(sequences)
def motifToRegex(motif, mismatches):
"""
Converts IUPAC motif to a regular expression (copied from nt_search method in BioPython)
"""
pattern = ''
for nt in motif:
value = IUPACData.ambiguous_dna_values[nt]
if len(value) == 1:
pattern += value
else:
pattern += '[%s]' % value
pattern = "(" + pattern + "){e<=" + str(mismatches) + "}" # allow for mismatches
return pattern
def complement(s):
"""
Return the complement nucleotides
"""
basecomplement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'W': 'W', 'S': 'S', 'M': 'K', 'K': 'M', 'R': 'Y', 'Y': 'R', 'B': 'V', 'D': 'H', 'H': 'D', 'V': 'B', 'N': 'N'}
letters = list(s)
letters = [basecomplement[base] for base in letters]
return ''.join(letters)
def comrev(s):
"""
Return reverse complement
"""
return complement(s[::-1].upper())
def readMotifsFromFile(motifFile):
"""
Read a tab-delimited files with motifs to search for. Return a list with motifs
"""
try:
fh = open(motifFile)
except:
sys.exit("cannot open file" + motifFile)
motifs = list()
line = fh.readline() # skip header
for line in fh:
line = line.rstrip()
c = line.split("\t")
motifs.append(c[1].upper())
return motifs
def nucToPeptide(seq):
'''
Description: translates a nucleotide sequence in all 6 reading frames
In: string sequence
Out: list with translations
'''
translations = list()
frame = list()
# Make sequence complement reverse
seq = seq.upper()
comrevSeq = comrev(seq)
# Get all reading frames
frame.append(Seq(seq))
frame.append(Seq(seq[1:]))
frame.append(Seq(seq[2:]))
frame.append(Seq(comrevSeq))
frame.append(Seq(comrevSeq[1:]))
frame.append(Seq(comrevSeq[2:]))
# Translate sequence to protein and find motif
for i in range(len(frame)):
translations.append(frame[i].translate())
return(translations)
if testmode == 1:
genome = readGenome()
print(genome["chr1"][0:50])