Skip to content

Commit

Permalink
Cache per-TN VCFs before writing to disk
Browse files Browse the repository at this point in the history
  • Loading branch information
ckandoth committed Aug 18, 2015
1 parent 86c978d commit 7915683
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 23 deletions.
38 changes: 24 additions & 14 deletions maf2maf.pl
Original file line number Diff line number Diff line change
Expand Up @@ -122,16 +122,17 @@
# Load the tumor-normal pairs from the TSV created by maf2vcf
my $tsv_file = $vcf_file;
$tsv_file =~ s/(\.vcf)?$/.pairs.tsv/;
my %tn_pair = map{ chomp; split( "\t", $_ )}`egrep -v "^#" $tsv_file`;
my %tn_vcf_idx = ();
# ::TODO:: If the same tumor is paired with different normals, treat them as separate TN-pairs
my %tn_pair = map{ chomp; split( "\t", $_ )}`egrep -v ^# $tsv_file`;
my ( %tn_vcf_idx, %tn_ids );

# Store the VEP annotated VCF header so we can duplicate it for per-TN VCFs
my $vep_vcf_header = `grep ^## $vep_anno`;

# Split the multi-sample VEP annotated VCF into per-TN VCFs
my ( @t_col_idx, %vep_fh );
$vep_fh{ $vep_anno } = IO::File->new( $vep_anno ) or die "ERROR: Couldn't open file: $vep_anno\n";
while( my $line = $vep_fh{ $vep_anno }->getline ) {
my ( @t_col_idx, %tn_vep );
my $vep_fh = IO::File->new( $vep_anno ) or die "ERROR: Couldn't open file: $vep_anno\n";
while( my $line = $vep_fh->getline ) {

# Skip comment lines, but parse everything else including the column headers
next if( $line =~ m/^##/ );
Expand All @@ -140,7 +141,7 @@
# Parse the header line to map column names to their indexes
if( $line =~ m/^#CHROM/ ) {

# Fill up %tn_vcf_idx with VCF column indexes of tumor-normal pairs
# Fill up %tn_vcf_idx with VCF column indexes of tumor-normal pairs, and %tn_ids with IDs
my %n_col_idx;
foreach my $idx ( 9..$#cols ){
if( defined $tn_pair{ $cols[$idx] }){
Expand All @@ -149,34 +150,43 @@
else {
$n_col_idx{ $cols[$idx] } = $idx;
}
$tn_ids{$idx} = $cols[$idx];
}
map{ my $n_id = $tn_pair{$cols[$_]}; $tn_vcf_idx{$_} = $n_col_idx{$n_id} } @t_col_idx;

# Initialize VCF header for each tumor-normal pair
foreach my $t_idx ( @t_col_idx ){
my $n_idx = $tn_vcf_idx{ $t_idx };
my $key = "$t_idx\_vs_$n_idx";
my ( $t_id, $n_id ) = @cols[$t_idx,$n_idx];
my ( $t_id, $n_id ) = ( $tn_ids{$t_idx}, $tn_ids{$n_idx} );
my $tn_vcf_file = "$tmp_dir/$t_id\_vs_$n_id.vep.vcf";
$vep_fh{ $key } = IO::File->new( $tn_vcf_file, ">" ) or die "ERROR: Couldn't open file: $tn_vcf_file\n";
$vep_fh{ $key }->print( $vep_vcf_header . "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n" );
$tn_vep{$tn_vcf_file} .= $vep_vcf_header . "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n";
}
}
# For all other lines containing variants, write it to the appropriate per-TN VCF
else {
foreach my $t_idx ( @t_col_idx ){
my $n_idx = $tn_vcf_idx{ $t_idx };
my @format_keys = split( /\:/, $cols[8] );
my $idx = 0;
my %t_info = map{( $format_keys[$idx++], $_ )} split( /\:/, $cols[$t_idx] );

# Skip variants where tumor genotype is null
# Skip variants for TN-pairs where the tumor genotype is null
next if ( $t_info{GT} eq './.' );
$vep_fh{ "$t_idx\_vs_$n_idx" }->print( join( "\t", @cols[0..8,$t_idx,$n_idx] ) . "\n" );

# Otherwise write it to the appropriate per-TN VCF file
my $n_idx = $tn_vcf_idx{ $t_idx };
my ( $t_id, $n_id ) = ( $tn_ids{$t_idx}, $tn_ids{$n_idx} );
my $tn_vcf_file = "$tmp_dir/$t_id\_vs_$n_id.vep.vcf";
$tn_vep{$tn_vcf_file} .= join( "\t", @cols[0..8,$t_idx,$n_idx] ) . "\n";
}
}
}
foreach( keys %vep_fh ) { $vep_fh{ $_ }->close };

# Write the cached contents of per-TN annotated VCFs into files
foreach my $tn_vcf_file ( keys %tn_vep ) {
my $tn_vep_fh = IO::File->new( $tn_vcf_file, ">" ) or die "ERROR: Couldn't open file: $tn_vcf_file\n";
$tn_vep_fh->print( $tn_vep{$tn_vcf_file} );
$tn_vep_fh->close;
}

# For each VCF generated by maf2vcf above, contruct a vcf2maf command and run it
my @vcfs = grep{ !m/.vep.vcf$/ and !m/$vcf_file/ } glob( "$tmp_dir/*.vcf" ); # Avoid reannotating annotated VCFs
Expand Down
22 changes: 13 additions & 9 deletions maf2vcf.pl
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
my $maf_fh = IO::File->new( $input_maf ) or die "ERROR: Couldn't open file: $input_maf\n";
my $line_count = 0;
my %col_idx = (); # Hash to map column names to column indexes
my %vcf_fh = (); # Hash of VCF file handles to speed up writing per-TN pair VCFs
my %tn_vcf = (); # In-memory cache to speed up writing per-TN pair VCFs
my %vcf_col_pair = ();
my %vcf_col_idx = ();
my @var_pos = ();
Expand Down Expand Up @@ -86,12 +86,11 @@
my ( $t_id, $n_id ) = split( /\t/, $pair );
$n_id = "NORMAL" unless( defined $n_id ); # Use a placeholder name for normal if its undefined
my $vcf_file = "$output_dir/$t_id\_vs_$n_id.vcf";
$vcf_fh{ $vcf_file } = IO::File->new( $vcf_file, ">" );
$vcf_fh{ $vcf_file }->print( "##fileformat=VCFv4.2\n" );
$vcf_fh{ $vcf_file }->print( "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n" );
$vcf_fh{ $vcf_file }->print( "##FORMAT=<ID=AD,Number=G,Type=Integer,Description=\"Allelic Depths of REF and ALT(s) in the order listed\">\n" );
$vcf_fh{ $vcf_file }->print( "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n" );
$vcf_fh{ $vcf_file }->print( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n" );
$tn_vcf{$vcf_file} .= "##fileformat=VCFv4.2\n";
$tn_vcf{$vcf_file} .= "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
$tn_vcf{$vcf_file} .= "##FORMAT=<ID=AD,Number=G,Type=Integer,Description=\"Allelic Depths of REF and ALT(s) in the order listed\">\n";
$tn_vcf{$vcf_file} .= "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n";
$tn_vcf{$vcf_file} .= "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n";

# Set genotype column indexes for the multi-sample VCF, and keep the pairing info
$vcf_col_idx{ $t_id } = $idx++;
Expand Down Expand Up @@ -178,7 +177,7 @@
# Contruct a VCF formatted line and append it to the respective VCF
my $vcf_file = "$output_dir/$t_id\_vs_$n_id.vcf";
my $vcf_line = join( "\t", $chr, $pos, ".", $ref, $alt, qw( . . . ), "GT:AD:DP", $t_fmt, $n_fmt );
$vcf_fh{ $vcf_file }->print( "$vcf_line\n" );
$tn_vcf{$vcf_file} .= "$vcf_line\n";

# Store VCF formatted data for the multi-sample VCF
my $key = join( "\t", $chr, $pos, ".", $ref, $alt);
Expand All @@ -188,7 +187,12 @@
}
$maf_fh->close;

foreach( keys %vcf_fh ){ $vcf_fh{ $_ }->close };
# Write the cached contents of per-TN VCFs into files
foreach my $vcf_file ( keys %tn_vcf ) {
my $tn_vcf_fh = IO::File->new( $vcf_file, ">" ) or die "ERROR: Fail to create file $vcf_file\n";
$tn_vcf_fh->print( $tn_vcf{$vcf_file} );
$tn_vcf_fh->close;
}

# Initialize header lines for the multi-sample VCF
unless( defined $output_vcf ) {
Expand Down

0 comments on commit 7915683

Please sign in to comment.