-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_ucsc_rmsk_bed_to_vcf
executable file
·276 lines (236 loc) · 10.2 KB
/
convert_ucsc_rmsk_bed_to_vcf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#!/usr/bin/perl -w
use warnings;
use strict;
use POSIX;
use Getopt::Long;
use File::Path;
use File::Basename;
use Pod::Usage;
=head1 NAME
convert_ucsc_rmsk_bed_to_vcf
=head1 SYNOPSIS
convert_ucsc_rmsk_bed_to_vcf [options]
=head1 DESCRIPTION
Converts UCSC repeat masker BED file to VCF file.
=cut
#option variables
my $help;
my $verbose;
my $debug;
my $refFASTAFile;
#initialize options
Getopt::Long::Configure ('bundling');
if(!GetOptions ('h'=>\$help, 'v'=>\$verbose, 'd'=>\$debug,
'r:s'=>\$refFASTAFile)
|| !defined($refFASTAFile)
|| scalar(@ARGV)!=1)
{
if ($help)
{
pod2usage(-verbose => 2);
}
else
{
pod2usage(1);
}
}
my $lobstrTrfBEDFile = $ARGV[0];
open(BED, "$lobstrTrfBEDFile") || die "Cannot open $lobstrTrfBEDFile";
585 1504 13 4 13 chr1 10000 10468 -249240153 + (CCCTAA)n Simple_repeat Simple_repeat 1 463 0 1
585 3612 114 270 13 chr1 10468 11447 -249239174 - TAR1 Satellite telo -399 1712 483 2
585 437 235 186 35 chr1 11503 11675 -249238946 - L1MC LINE L1 -2236 5646 5449 3
585 239 294 19 10 chr1 11677 11780 -249238841 - MER5B DNA hAT-Charlie -74 104 1 4
585 318 230 38 0 chr1 15264 15355 -249235266 - MIR3 SINE MIR -119 143 49 5
585 203 162 0 0 chr1 16712 16749 -249233872 + (TGG)n Simple_repeat Simple_repeat 1 37 0 6
585 239 338 148 0 chr1 18906 19048 -249231573 + L2a LINE L2 2942 3104 -322 7
585 652 346 85 42 chr1 19947 20405 -249230216 + L3 LINE CR1 3042 3519 -970 8
585 270 331 7 27 chr1 20530 20679 -249229942 + Plat_L3 LINE CR1 2802 2947 -639 9
ld example SQL type description
bin 585 smallint(5) unsigned Indexing field to speed chromosome range queries.
swScore 1504 int(10) unsigned Smith Waterman alignment score
milliDiv 13 int(10) unsigned Base mismatches in parts per thousand
milliDel 4 int(10) unsigned Bases deleted in parts per thousand
milliIns 13 int(10) unsigned Bases inserted in parts per thousand
genoName chr1 varchar(255) Genomic sequence name
genoStart 10000 int(10) unsigned Start in genomic sequence
genoEnd 10468 int(10) unsigned End in genomic sequence
genoLeft -249240153 int(11) -#bases after match in genomic sequence
strand + char(1) Relative orientation + or -
repName (CCCTAA)n varchar(255) Name of repeat
repClass Simple_repeat varchar(255) Class of repeat
repFamily Simple_repeat varchar(255) Family of repeat
repStart 1 int(11) Start (if strand is +) or -#bases after match (if strand is -) in repeat sequence
repEnd 463 int(11) End in repeat sequence
repLeft 0 int(11) -#bases after match (if strand is +) or start (if strand is -) in repeat sequence
id 1 char(1) First digit of id field in RepeatMasker .out file. Best ignored.
#.
Class of Repeat
461751 DNA 61 acro
1881 DNA? 1194734 Alu
1498690 LINE 2325 centr
51 LINE? 61303 CR1
371543 Low_complexity 1265 Deu
717656 LTR 2744 DNA
122 LTR? 1881 DNA?
3733 Other 556 Dong-R4
2236 RC 579 ERV
729 RNA 175937 ERV1
1769 rRNA 10868 ERVK
9566 Satellite 160346 ERVL
1340 scRNA 1854 ERVL?
417913 Simple_repeat 347105 ERVL-MaLR
1793723 SINE 10892 Gypsy
425 SINE? 7869 Gypsy?
4386 snRNA 12573 hAT
1481 srpRNA 3050 hAT?
2002 tRNA 19755 hAT-Blackjack
7036 Unknown 254646 hAT-Charlie
97 Unknown? 30669 hAT-Tip100
1800 Helitron
436 Helitron?
951780 L1
84 L1?
466438 L2
371543 Low_complexity
2206 LTR
122 LTR?
57 Merlin
595094 MIR
1992 MuDR
3733 Other
51 Penelope?
2120 PiggyBac
241 PiggyBac?
729 RNA
1769 rRNA
17874 RTE
655 RTE-BovB
6775 Satellite
1340 scRNA
417913 Simple_repeat
962 SINE
425 SINE?
4386 snRNA
1481 srpRNA
1950 TcMar
3424 TcMar?
16348 TcMar-Mariner
8156 TcMar-Tc2
104026 TcMar-Tigger
405 telo
3670 tRNA
7036 Unknown
97 Unknown?
#CCCAGCCCACCCATCCCATCCCCGCCCACCCATCCCATCCC <STR> . PASS RU=CCCAT;RL=43;REF=8.6;PSCORE=41;COMP=17,4,9,68;ENTROPY=1.35
my $vcfHdr = <<HDR;
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##source=TandemRepeatFinder/lobSTR_3.0.2;
##INFO=<ID=RU,Number=1,Type=String,Description="Repeat motif">
##INFO=<ID=MOTIF,Number=1,Type=String,Description="Canonical repeat motif">
##INFO=<ID=RL,Number=1,Type=Integer,Description="Reference STR track length in bp">
##INFO=<ID=REF,Number=1,Type=Float,Description="Reference copy number">
##INFO=<ID=TRF_SCORE,Number=1,Type=Integer,Description="TRF Score for M/I/D as 2/-7/-7">
##INFO=<ID=COMP,Number=4,Type=Integer,Description="Composition(%) of bases in repeat tract">
##INFO=<ID=ENTROPY,Number=1,Type=Float,Description="Entropy measure of repeat tract">
##ALT=<ID=STR,Description="Short Tandem Repeat">
##assembly=/net/fantasia/home/atks/ref/genome/hs37d5.fa
##contig=<ID=1,length=249250621>
##contig=<ID=2,length=243199373>
##contig=<ID=3,length=198022430>
##contig=<ID=4,length=191154276>
##contig=<ID=5,length=180915260>
##contig=<ID=6,length=171115067>
##contig=<ID=7,length=159138663>
##contig=<ID=8,length=146364022>
##contig=<ID=9,length=141213431>
##contig=<ID=10,length=135534747>
##contig=<ID=11,length=135006516>
##contig=<ID=12,length=133851895>
##contig=<ID=13,length=115169878>
##contig=<ID=14,length=107349540>
##contig=<ID=15,length=102531392>
##contig=<ID=16,length=90354753>
##contig=<ID=17,length=81195210>
##contig=<ID=18,length=78077248>
##contig=<ID=19,length=59128983>
##contig=<ID=20,length=63025520>
##contig=<ID=21,length=48129895>
##contig=<ID=22,length=51304566>
##contig=<ID=X,length=155270560>
##contig=<ID=Y,length=59373566>
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
HDR
print $vcfHdr;
while(<BED>)
{
s/\r?\n?$//;
if (/^#/)
{
print "$_\n";
}
else
{
my ($chrom, $start, $end, $periodSize, $refCopyNo,
$dummy1, $dummy2, $dummy3,
$purityScore, $A, $C, $G, $T, $entropy, $repeatUnit, $overlap) = split("\t");
$chrom =~ s/chr//;
my $ref = `vt seq -i $chrom:$start-$end -r $refFASTAFile -q`;
my $alt = "<STR>";
my $refCopyNoBp = int($refCopyNo * length($repeatUnit) + 0.5);
my $motif = motif2cannonical($repeatUnit);
print "$chrom\t$start\t.\t$ref\t$alt\t.\tPASS\tRU=$repeatUnit;MOTIF=$motif;RL=$refCopyNoBp;REF=$refCopyNo;TRF_SCORE=$purityScore;COMP=$A,$G,$T,$C;ENTROPY=$entropy\n";
}
}
close(BED);
sub motif2cannonical
{
my $motif = shift;
my $motifrc = reverseComplement($motif);
my @motifs = ();
for my $i (0..(length($motif)-1))
{
push(@motifs, shiftBase($motif, $i));
}
@motifs = sort {$a cmp $b} @motifs;
return $motifs[0];
}
sub reverseComplement
{
my $seq = shift;
my $len = length($seq);
my $rc = "";
for my $i (0..($len-1))
{
$rc .= complement(substr($seq, ($len-1-$i),1));
}
return $rc;
}
sub shiftBase
{
my ($seq, $i) = @_;
my $len = length($seq);
if ($i>=$len) {$i = $i%%$len;}
my $shifted = substr($seq, $i, ($len-$i)) . substr($seq, 0, $i);
return $shifted;
}
sub complement
{
my $base = shift;
if ($base eq 'A')
{
return 'T';
}
elsif ($base eq 'C')
{
return 'G';
}
elsif ($base eq 'G')
{
return 'C';
}
elsif ($base eq 'T')
{
return 'A';
}
}