-
Notifications
You must be signed in to change notification settings - Fork 0
/
mirbind.py
executable file
·167 lines (130 loc) · 5.4 KB
/
mirbind.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env python
import argparse
import numpy as np
import pandas as pd
from tensorflow import keras as k
import tensorflow_addons as tfa
from tensorflow.keras import layers
from tensorflow.keras.utils import register_keras_serializable
@register_keras_serializable()
class ResBlock(layers.Layer):
    """
    Residual block: two same-padded convolutions plus a skip connection.

    The forward pass implemented in ``call`` is
    conv1 -> ReLU -> BatchNorm -> conv2 -> add(shortcut) -> ReLU -> BatchNorm.
    For more information refer to the original paper at https://arxiv.org/abs/1512.03385 .

    :param downsample: if True, conv1 uses stride 2 and the shortcut is passed through
        a 1x1, stride-2 convolution; otherwise the input is added unchanged
    :param filters: number of filters of every convolution in the block
    :param kernel_size: kernel size of the two main convolutions
    :param kwargs: standard Keras ``Layer`` arguments (e.g. ``name``, ``dtype``, ``trainable``)
    """

    def __init__(self, downsample=False, filters=16, kernel_size=3, **kwargs):
        # Forward the base-layer arguments so that a config produced by
        # get_config() (which includes them) can be fed back to the constructor.
        super().__init__(**kwargs)

        # store parameters
        self.downsample = downsample
        self.filters = filters
        self.kernel_size = kernel_size

        # initialize inner layers
        # NOTE(review): the creation order of the weighted sub-layers is kept exactly
        # as before; saved weights are presumably matched by order, so confirm
        # before reordering anything below.
        self.conv1 = layers.Conv2D(kernel_size=self.kernel_size,
                                   strides=(2 if self.downsample else 1),
                                   filters=self.filters,
                                   padding="same")
        self.activation1 = layers.ReLU()
        self.batch_norm1 = layers.BatchNormalization()
        self.conv2 = layers.Conv2D(kernel_size=self.kernel_size,
                                   strides=1,
                                   filters=self.filters,
                                   padding="same")
        if self.downsample:
            # projection applied to the shortcut branch when conv1 uses stride 2
            self.conv3 = layers.Conv2D(kernel_size=1,
                                       strides=2,
                                       filters=self.filters,
                                       padding="same")
        self.activation2 = layers.ReLU()
        self.batch_norm2 = layers.BatchNormalization()
        # weight-less merge layer, built once instead of on every call
        self.residual_add = layers.Add()

    def call(self, inputs):
        """
        Apply the residual block to ``inputs`` and return the resulting tensor.
        """
        x = self.conv1(inputs)
        x = self.activation1(x)
        x = self.batch_norm1(x)
        x = self.conv2(x)

        shortcut = self.conv3(inputs) if self.downsample else inputs

        x = self.residual_add([shortcut, x])
        x = self.activation2(x)
        x = self.batch_norm2(x)
        return x

    def get_config(self):
        """
        Return the constructor arguments of the layer, including the base ``Layer`` ones.
        """
        config = super().get_config()
        config.update({
            'filters': self.filters,
            'downsample': self.downsample,
            'kernel_size': self.kernel_size,
        })
        return config
def parse_input():
    """
    Read the command-line options of the tool.

    Recognised options are ``--input``, ``--output`` and ``--model``; each one
    falls back to its default when it is not given.
    :return: dictionary mapping option name to its value
    """
    cli = argparse.ArgumentParser(
        description='miRBind: a method for prediction of potential miRNA:target site binding')

    # (flag, default value, placeholder shown in the help text)
    option_table = (
        ('--input', "example.tsv", '<input_tsv_filename>'),
        ('--output', "example_scores", '<output_filename_prefix>'),
        ('--model', "Models/miRBind.h5", '<model_name>'),
    )
    for flag, fallback, placeholder in option_table:
        cli.add_argument(flag, default=fallback, metavar=placeholder)

    namespace = cli.parse_args()
    return vars(namespace)
def one_hot_encoding(df, tensor_dim=(50, 20, 1)):
    """
    fun encodes miRNAs and mRNAs in df into binding matrices

    For every row a (gene position x miRNA position) matrix is filled with 1 where
    the two nucleotides form a Watson-Crick pair and 0 elsewhere. Sequences are
    upper-cased before comparison.
    :param df: dataframe containing 'gene' and 'miRNA' columns; its index may hold
        arbitrary labels (e.g. after filtering), rows are addressed by position
    :param tensor_dim: output shape of the matrix of a single pair
    :return: float32 numpy array of shape (number of rows, *tensor_dim)
    """
    # alphabet for watson-crick interactions.
    alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1., "AU": 1., "UA": 1.}

    n_samples = df.shape[0]  # number of samples in df
    # initialize dot matrix with zeros
    ohe_matrix_2d = np.zeros((n_samples, *tensor_dim), dtype="float32")

    # compile matrix with watson-crick interactions.
    # enumerate() supplies the row position; the dataframe index label must not be
    # used here because it is not guaranteed to be 0..N-1.
    for row_pos, (gene, mirna) in enumerate(zip(df["gene"], df["miRNA"])):
        gene = gene.upper()
        mirna = mirna.upper()
        for bind_index, bind_nt in enumerate(gene):
            for mirna_index, mirna_nt in enumerate(mirna):
                ohe_matrix_2d[row_pos, bind_index, mirna_index, 0] = alphabet.get(bind_nt + mirna_nt, 0)

    return ohe_matrix_2d
def write_score(output_file, df, scores):
    """
    fun writes information about sequence and its score to the output_file

    The table is written tab-separated, without the index, to ``output_file + '.tsv'``.
    The dataframe passed in is left unmodified.
    :param output_file: output path without the '.tsv' extension
    :param df: dataframe with miRNA:target pairs
    :param scores: numpy array, predicted scores
    """
    # Keep every second value of the flattened predictions, i.e. the first
    # column when the model emits two values per pair.
    kept_scores = np.asarray(scores).flatten()[::2]

    # assign() builds a new frame, so the caller's dataframe (possibly a filtered
    # slice of a larger one) is neither mutated nor subject to SettingWithCopyWarning.
    scored = df.assign(score=pd.Series(kept_scores, index=df.index))
    scored.to_csv(output_file + '.tsv', sep='\t', index=False)
def predict_probs(df, model, output):
    """
    fun predicts the probability of miRNA:target site binding in df file

    Pairs whose miRNA is not 20 nt long or whose gene fragment is not 50 nt long
    are skipped (their number is reported on stdout); the remaining pairs are
    encoded, scored by the model and written out.
    :param df: input dataframe with sequences containing 'gene' and 'miRNA' columns
    :param model: Keras model used for predicting
    :param output: output file to write probabilities to
    """
    miRNA_length = 20
    gene_length = 50

    orig_len = len(df)
    mask = (df["miRNA"].str.len() == miRNA_length) & (df["gene"].str.len() == gene_length)
    # Renumber the surviving rows 0..N-1: filtering leaves gaps in the index, and the
    # encoded matrix is addressed by row position.
    df = df[mask].reset_index(drop=True)
    processed_len = len(df)

    if orig_len != processed_len:
        print(f"Skipping {orig_len - processed_len} pairs due to inappropriate length.")

    if processed_len == 0:
        # Nothing to score: still produce the (header-only) output file, but do not
        # hand an empty batch to the model.
        # NOTE(review): Keras predict() is believed to reject empty input - confirm.
        write_score(output, df, np.empty((0, 2), dtype="float32"))
        return

    # the matrix shape is derived from the same length constants used for filtering
    ohe = one_hot_encoding(df, tensor_dim=(gene_length, miRNA_length, 1))
    prob = model.predict(ohe)
    write_score(output, df, prob)
def main():
    """
    Command-line entry point.

    Loads the model and the tab-separated input table named on the command line
    and runs the prediction. If either of them cannot be loaded, a message is
    printed and the function returns without predicting.
    """
    options = parse_input()
    model_path = options["model"]
    input_path = options["input"]
    output_prefix = options["output"]

    try:
        model = k.models.load_model(model_path)
    except (IOError, ImportError):
        print()
        print("Can't load the model", model_path)
        return

    print("===========================================")

    try:
        # the input file has no header row: first column miRNA, second column gene
        pairs = pd.read_csv(input_path, names=['miRNA', 'gene'], sep='\t')
    except IOError as err:
        print()
        print("Can't load file", input_path)
        print(err)
        return

    predict_probs(pairs, model, output_prefix)
# Run the tool only when the file is executed as a script, so that importing the
# module (e.g. to reuse ResBlock) does not parse the command line or load a model.
if __name__ == "__main__":
    main()