From 7915683ec05810bef23b01ad4585190348b2f394 Mon Sep 17 00:00:00 2001 From: Cyriac Kandoth Date: Tue, 18 Aug 2015 03:14:41 -0400 Subject: [PATCH] Cache per-TN VCFs before writing to disk --- maf2maf.pl | 38 ++++++++++++++++++++++++-------------- maf2vcf.pl | 22 +++++++++++++--------- 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/maf2maf.pl b/maf2maf.pl index 1736d71..51f4e91 100644 --- a/maf2maf.pl +++ b/maf2maf.pl @@ -122,16 +122,17 @@ # Load the tumor-normal pairs from the TSV created by maf2vcf my $tsv_file = $vcf_file; $tsv_file =~ s/(\.vcf)?$/.pairs.tsv/; -my %tn_pair = map{ chomp; split( "\t", $_ )}`egrep -v "^#" $tsv_file`; -my %tn_vcf_idx = (); +# ::TODO:: If the same tumor is paired with different normals, treat them as separate TN-pairs +my %tn_pair = map{ chomp; split( "\t", $_ )}`egrep -v ^# $tsv_file`; +my ( %tn_vcf_idx, %tn_ids ); # Store the VEP annotated VCF header so we can duplicate it for per-TN VCFs my $vep_vcf_header = `grep ^## $vep_anno`; # Split the multi-sample VEP annotated VCF into per-TN VCFs -my ( @t_col_idx, %vep_fh ); -$vep_fh{ $vep_anno } = IO::File->new( $vep_anno ) or die "ERROR: Couldn't open file: $vep_anno\n"; -while( my $line = $vep_fh{ $vep_anno }->getline ) { +my ( @t_col_idx, %tn_vep ); +my $vep_fh = IO::File->new( $vep_anno ) or die "ERROR: Couldn't open file: $vep_anno\n"; +while( my $line = $vep_fh->getline ) { # Skip comment lines, but parse everything else including the column headers next if( $line =~ m/^##/ ); @@ -140,7 +141,7 @@ # Parse the header line to map column names to their indexes if( $line =~ m/^#CHROM/ ) { - # Fill up %tn_vcf_idx with VCF column indexes of tumor-normal pairs + # Fill up %tn_vcf_idx with VCF column indexes of tumor-normal pairs, and %tn_ids with IDs my %n_col_idx; foreach my $idx ( 9..$#cols ){ if( defined $tn_pair{ $cols[$idx] }){ @@ -149,34 +150,43 @@ else { $n_col_idx{ $cols[$idx] } = $idx; } + $tn_ids{$idx} = $cols[$idx]; } map{ my $n_id = $tn_pair{$cols[$_]}; $tn_vcf_idx{$_} = $n_col_idx{$n_id} } @t_col_idx; # Initialize VCF header for each tumor-normal pair foreach my $t_idx ( @t_col_idx ){ my $n_idx = $tn_vcf_idx{ $t_idx }; - my $key = "$t_idx\_vs_$n_idx"; - my ( $t_id, $n_id ) = @cols[$t_idx,$n_idx]; + my ( $t_id, $n_id ) = ( $tn_ids{$t_idx}, $tn_ids{$n_idx} ); my $tn_vcf_file = "$tmp_dir/$t_id\_vs_$n_id.vep.vcf"; - $vep_fh{ $key } = IO::File->new( $tn_vcf_file, ">" ) or die "ERROR: Couldn't open file: $tn_vcf_file\n"; - $vep_fh{ $key }->print( $vep_vcf_header . "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n" ); + $tn_vep{$tn_vcf_file} .= $vep_vcf_header . "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n"; } } # For all other lines containing variants, write it to the appropriate per-TN VCF else { foreach my $t_idx ( @t_col_idx ){ - my $n_idx = $tn_vcf_idx{ $t_idx }; my @format_keys = split( /\:/, $cols[8] ); my $idx = 0; my %t_info = map{( $format_keys[$idx++], $_ )} split( /\:/, $cols[$t_idx] ); - # Skip variants where tumor genotype is null + # Skip variants for TN-pairs where the tumor genotype is null next if ( $t_info{GT} eq './.' ); - $vep_fh{ "$t_idx\_vs_$n_idx" }->print( join( "\t", @cols[0..8,$t_idx,$n_idx] ) . "\n" ); + + # Otherwise write it to the appropriate per-TN VCF file + my $n_idx = $tn_vcf_idx{ $t_idx }; + my ( $t_id, $n_id ) = ( $tn_ids{$t_idx}, $tn_ids{$n_idx} ); + my $tn_vcf_file = "$tmp_dir/$t_id\_vs_$n_id.vep.vcf"; + $tn_vep{$tn_vcf_file} .= join( "\t", @cols[0..8,$t_idx,$n_idx] ) . "\n"; } } } -foreach( keys %vep_fh ) { $vep_fh{ $_ }->close }; + +# Write the cached contents of per-TN annotated VCFs into files +foreach my $tn_vcf_file ( keys %tn_vep ) { + my $tn_vep_fh = IO::File->new( $tn_vcf_file, ">" ) or die "ERROR: Couldn't open file: $tn_vcf_file\n"; + $tn_vep_fh->print( $tn_vep{$tn_vcf_file} ); + $tn_vep_fh->close; +} # For each VCF generated by maf2vcf above, contruct a vcf2maf command and run it my @vcfs = grep{ !m/.vep.vcf$/ and !m/$vcf_file/ } glob( "$tmp_dir/*.vcf" ); # Avoid reannotating annotated VCFs diff --git a/maf2vcf.pl b/maf2vcf.pl index 02be2db..7725716 100644 --- a/maf2vcf.pl +++ b/maf2vcf.pl @@ -51,7 +51,7 @@ my $maf_fh = IO::File->new( $input_maf ) or die "ERROR: Couldn't open file: $input_maf\n"; my $line_count = 0; my %col_idx = (); # Hash to map column names to column indexes -my %vcf_fh = (); # Hash of VCF file handles to speed up writing per-TN pair VCFs +my %tn_vcf = (); # In-memory cache to speed up writing per-TN pair VCFs my %vcf_col_pair = (); my %vcf_col_idx = (); my @var_pos = (); @@ -86,12 +86,11 @@ my ( $t_id, $n_id ) = split( /\t/, $pair ); $n_id = "NORMAL" unless( defined $n_id ); # Use a placeholder name for normal if its undefined my $vcf_file = "$output_dir/$t_id\_vs_$n_id.vcf"; - $vcf_fh{ $vcf_file } = IO::File->new( $vcf_file, ">" ); - $vcf_fh{ $vcf_file }->print( "##fileformat=VCFv4.2\n" ); - $vcf_fh{ $vcf_file }->print( "##FORMAT=\n" ); - $vcf_fh{ $vcf_file }->print( "##FORMAT=\n" ); - $vcf_fh{ $vcf_file }->print( "##FORMAT=\n" ); - $vcf_fh{ $vcf_file }->print( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n" ); + $tn_vcf{$vcf_file} .= "##fileformat=VCFv4.2\n"; + $tn_vcf{$vcf_file} .= "##FORMAT=\n"; + $tn_vcf{$vcf_file} .= "##FORMAT=\n"; + $tn_vcf{$vcf_file} .= "##FORMAT=\n"; + $tn_vcf{$vcf_file} .= "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n"; # Set genotype column indexes for the multi-sample VCF, and keep the pairing info $vcf_col_idx{ $t_id } = $idx++; @@ -178,7 +177,7 @@ # Contruct a VCF formatted line and append it to the respective VCF my $vcf_file = "$output_dir/$t_id\_vs_$n_id.vcf"; my $vcf_line = join( "\t", $chr, $pos, ".", $ref, $alt, qw( . . . ), "GT:AD:DP", $t_fmt, $n_fmt ); - $vcf_fh{ $vcf_file }->print( "$vcf_line\n" ); + $tn_vcf{$vcf_file} .= "$vcf_line\n"; # Store VCF formatted data for the multi-sample VCF my $key = join( "\t", $chr, $pos, ".", $ref, $alt); @@ -188,7 +187,12 @@ } $maf_fh->close; -foreach( keys %vcf_fh ){ $vcf_fh{ $_ }->close }; +# Write the cached contents of per-TN VCFs into files +foreach my $vcf_file ( keys %tn_vcf ) { + my $tn_vcf_fh = IO::File->new( $vcf_file, ">" ) or die "ERROR: Fail to create file $vcf_file\n"; + $tn_vcf_fh->print( $tn_vcf{$vcf_file} ); + $tn_vcf_fh->close; +} # Initialize header lines for the multi-sample VCF unless( defined $output_vcf ) {