-
Notifications
You must be signed in to change notification settings - Fork 0
/
tbstat
executable file
·126 lines (100 loc) · 2.48 KB
/
tbstat
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/perl
use warnings;
use strict;
use POSIX qw(ceil floor);
use Getopt::Long;
use Pod::Usage;
=head1 NAME
tbstat
=head1 SYNOPSIS
usage: tbstat [options] <tsv-file>
-h help
-v verbose
-d debug
-c column (>= 1, default 1)
-l calculate lambda GC, assumes that values are Chisq (1 d.f) statistics
--beta estimate alpha and beta parameters from mean and variance
tsv-file tab separated value file with headers
Calculates the mean, median, minimum and maximum of a column of data.
=head1 DESCRIPTION
=cut
#reads in annotation file
my $verbose;
my $debug;
my $help;
my $col = 1;
my $lambdaGC;
my $betaParametersEstimation;
#initialize options
Getopt::Long::Configure ('bundling');
if(!GetOptions ('h'=>\$help, 'v'=>\$verbose,'d'=>\$debug,'c=i'=>\$col, 'l'=>\$lambdaGC, 'beta'=>\$betaParametersEstimation)
|| $help
|| $col<1
|| scalar(@ARGV)>1)
{
if ($help)
{
pod2usage(-verbose => 2);
}
else
{
pod2usage(1);
}
}
if(scalar(@ARGV)==0)
{
*IN = *STDIN;
}
else
{
open(IN, $ARGV[0]) || die "Cannot open $ARGV[0]";
}
my $colNo;
my $i = $col - 1;
my @DATA;
my @fields;
while (<IN>)
{
s/\r?\n?$//;
@fields = split("\t");
push(@DATA, $fields[$i]);
if ($verbose) {print STDERR "$fields[$i]\n"}
}
@DATA = sort {$a<=>$b} @DATA;
my $total = 0;
my $min = $DATA[0];
my $max = $DATA[0];
for my $j(0 .. $#DATA)
{
$total += $DATA[$j];
$min = $DATA[$j] if ($DATA[$j]<$min);
$max = $DATA[$j] if ($DATA[$j]>$max);
}
my $n = scalar(@DATA);
my $mean = $total/$n;
my $median = ($DATA[floor($#DATA/2)]+$DATA[ceil($#DATA/2)])/2;
my $ss = 0;
my $sc = 0;
for my $j(0 .. $#DATA)
{
$ss += ($DATA[$j] - $mean)**2;
$sc += ($DATA[$j] - $mean)**3;
}
my $variance = $ss/($n-1);
my $skew = $ss!=0 ? (sqrt($n*($n-1))/($n-2))*((sqrt($n)*$sc)/sqrt($ss**3)) : 0;
my $cov = sqrt($variance)/$mean;
my $alpha = $mean * (( $mean * (1-$mean) / $variance) - 1 );
my $beta = (1 - $mean) * (( $mean * (1-$mean) / $variance) - 1 );
printf "N: %d\n", $n;
printf "Sum: %.4f\n", $total;
printf "Mean: %.4f\n", $mean;
printf "Median: %.4f\n", $median;
printf "lambda GC: %.4f\n", $median/0.456 if $lambdaGC;
printf "Variance: %.4f\n", $variance;
printf "Std. Dev.: %.4f\n", sqrt($variance);
printf "Coefficient of variance: %.4f\n", $cov;
printf "Skew: %.4f\n", $skew;
printf "Minimum: %.4f\n", $min;
printf "Maximum: %.4f\n", $max;
printf "alpha: %.4f\n", $alpha if $betaParametersEstimation;
printf "beta: %.4f\n", $beta if $betaParametersEstimation;