From 1e983537c27eacc1aeeb89d619ab6d4a979ac3e9 Mon Sep 17 00:00:00 2001 From: Cyriac Kandoth Date: Thu, 28 May 2015 23:13:40 -0400 Subject: [PATCH] Support user-defined soforms --- data/isoform_overrides_at_mskcc | 33 +++++++++++++++++++++++++++++++++ docs/snpeff_maf_readme.txt | 2 +- docs/vep_maf_readme.txt | 2 +- maf2maf.pl | 5 ++++- vcf2maf.pl | 24 ++++++++++++++++++++---- 5 files changed, 59 insertions(+), 7 deletions(-) create mode 100644 data/isoform_overrides_at_mskcc diff --git a/data/isoform_overrides_at_mskcc b/data/isoform_overrides_at_mskcc new file mode 100644 index 0000000..d1b8b14 --- /dev/null +++ b/data/isoform_overrides_at_mskcc @@ -0,0 +1,33 @@ +#enst_id gene_name refseq_id ccds_id +ENST00000508376 APC NM_000038 CCDS4107.1 +ENST00000395915 AURKA NM_003600 CCDS13451.1 +ENST00000357654 BRCA1 NM_007294 CCDS11453.1 +ENST00000304494 CDKN2A NM_000077 CCDS6510.1 +ENST00000405598 CHEK2 NM_007194 CCDS13843.1 +ENST00000373106 CSF3R NM_000760 CCDS413.1 +ENST00000241393 CXCR4 NM_003467 CCDS46420.1 +ENST00000340748 DNMT1 NM_001379 CCDS12228.1 +ENST00000280892 EIF4E NM_001130678 CCDS47109.1 +ENST00000288319 ERG NM_182918 CCDS13658.1 +ENST00000405192 ETV1 NM_001163147 CCDS55087.1 +ENST00000358487 FGFR2 NM_000141 CCDS31298.1 +ENST00000440486 FGFR3 NM_000142 CCDS3353.1 +ENST00000498215 FOXP1 NM_001244814 CCDS2914.1 +ENST00000538466 FYN NM_153047 CCDS5095.1 +ENST00000371085 GNAS NM_000516 CCDS13472.1 +ENST00000331340 IKZF1 NM_006060 CCDS75596.1 +ENST00000424583 MEF2B NM_001145785 CCDS46024.1 +ENST00000397752 MET NM_000245 CCDS43636.1 +ENST00000372115 MUTYH NM_001048171 CCDS41320.1 +ENST00000396334 MYD88 NM_002468 CCDS2674.2 +ENST00000356341 PAK1 NM_002576 CCDS8250.1 +ENST00000373547 PPP6C NM_002721 CCDS6861.1 +ENST00000373198 PTPRT NM_133170 CCDS68127.1 +ENST00000378823 RAD50 NM_005732 CCDS34233.1 +ENST00000267868 RAD51 NM_002875 CCDS10062.1 +ENST00000487270 RAD51B NM_133509 CCDS9789.1 +ENST00000335858 RAD51D NM_133629 CCDS11288.1 +ENST00000368323 RIT1 NM_006912 CCDS1123.1 +ENST00000504102 SPOP NM_001007228 CCDS11551.1 +ENST00000588136 TCF3 NM_001136139 CCDS45899.1 +ENST00000523873 VEGFA NM_001171623 CCDS55010.1 diff --git a/docs/snpeff_maf_readme.txt b/docs/snpeff_maf_readme.txt index b1b1df6..0206f7f 100644 --- a/docs/snpeff_maf_readme.txt +++ b/docs/snpeff_maf_readme.txt @@ -18,7 +18,7 @@ The subsequent 10 columns are relevant to most analyses. The next column is relevant to analyses that consider the effect of the variant on all alternate isoforms of the gene, or on non-coding/regulatory transcripts. The effects are sorted first by transcript biotype priority, then by effect severity, and finally by decreasing order of transcript -length. Each effect in the list is in the format [Gene_Name,Effect,HGVSp,Transcript_ID]. +length. Each effect in the list is in the format [Gene_Name,Effect,HGVSp,Transcript_ID,]. 46. all_effects - a semicolon delimited list of all possible variant effects, sorted by priority diff --git a/docs/vep_maf_readme.txt b/docs/vep_maf_readme.txt index ab2b338..021ed2d 100644 --- a/docs/vep_maf_readme.txt +++ b/docs/vep_maf_readme.txt @@ -18,7 +18,7 @@ The subsequent 10 columns are relevant to most analyses. The next column is relevant to analyses that consider the effect of the variant on all alternate isoforms of the gene, or on non-coding/regulatory transcripts. The effects are sorted first by transcript biotype priority, then by effect severity, and finally by decreasing order of transcript -length. Each effect in the list is in the format [SYMBOL,Consequence,HGVSp,Transcript_ID]. +length. Each effect in the list is in the format [SYMBOL,Consequence,HGVSp,Transcript_ID,RefSeq]. 46. all_effects - a semicolon delimited list of all possible variant effects, sorted by priority diff --git a/maf2maf.pl b/maf2maf.pl index 48a756e..9d94e0d 100644 --- a/maf2maf.pl +++ b/maf2maf.pl @@ -45,7 +45,7 @@ # Parse options and print usage syntax on a syntax error, or if help was explicitly requested my ( $man, $help ) = ( 0, 0 ); -my ( $input_maf, $output_maf, $tmp_dir ); +my ( $input_maf, $output_maf, $tmp_dir, $custom_enst_file ); GetOptions( 'help!' => \$help, 'man!' => \$man, @@ -59,6 +59,7 @@ 'nrm-rad-col=s' => \$nrm_rad_col, 'nrm-vad-col=s' => \$nrm_vad_col, 'retain-cols=s' => \$retain_cols, + 'custom-enst=s' => \$custom_enst_file, 'vep-path=s' => \$vep_path, 'vep-data=s' => \$vep_data, 'vep-forks=s' => \$vep_forks, @@ -99,6 +100,7 @@ my $vcf2maf_cmd = "$perl_bin $vcf2maf_path --input-vcf $tn_vcf --output-maf $tn_maf " . "--tumor-id $tumor_id --normal-id $normal_id --vep-path $vep_path --vep-data $vep_data " . "--vep-forks $vep_forks --ref-fasta $ref_fasta"; + $vcf2maf_cmd .= " --custom-enst $custom_enst_file" if( $custom_enst_file ); system( $vcf2maf_cmd ) == 0 or die "\nERROR: Failed to run vcf2maf!\nCommand: $vcf2maf_cmd\n"; } @@ -236,6 +238,7 @@ =head1 OPTIONS --nrm-rad-col Name of MAF column for reference allele depth in normal BAM [n_ref_count] --nrm-vad-col Name of MAF column for variant allele depth in normal BAM [n_alt_count] --retain-cols Comma-delimited list of columns to retain from the input MAF [Center,Verification_Status,Validation_Status,Mutation_Status,Sequencing_Phase,Sequence_Source,Validation_Method,Score,BAM_file,Sequencer,Tumor_Sample_UUID,Matched_Norm_Sample_UUID] + --custom-enst List of custom ENST IDs that override canonical selection --vep-path Folder containing variant_effect_predictor.pl [~/vep] --vep-data VEP's base cache/plugin directory [~/.vep] --vep-forks Number of forked processes to use when running VEP [4] diff --git a/vcf2maf.pl b/vcf2maf.pl index d56957a..97075d0 100644 --- a/vcf2maf.pl +++ b/vcf2maf.pl @@ -162,7 +162,7 @@ sub GetBiotypePriority { # Parse options and print usage if there is a syntax error, or if usage was explicitly requested my ( $man, $help, $use_snpeff ) = ( 0, 0, 0 ); -my ( $input_vcf, $vep_anno, $snpeff_anno, $output_maf ); +my ( $input_vcf, $vep_anno, $snpeff_anno, $output_maf, $custom_enst_file ); my ( $vcf_tumor_id, $vcf_normal_id ); GetOptions( 'help!' => \$help, @@ -176,6 +176,7 @@ sub GetBiotypePriority { 'normal-id=s' => \$normal_id, 'vcf-tumor-id=s' => \$vcf_tumor_id, 'vcf-normal-id=s' => \$vcf_normal_id, + 'custom-enst=s' => \$custom_enst_file, 'vep-path=s' => \$vep_path, 'vep-data=s' => \$vep_data, 'vep-forks=s' => \$vep_forks, @@ -202,6 +203,13 @@ sub GetBiotypePriority { $vcf_tumor_id = $tumor_id unless( $vcf_tumor_id ); $vcf_normal_id = $normal_id unless( $vcf_normal_id ); +# Load up the custom isoform overrides if provided: +my %custom_enst; +if( $custom_enst_file ) { + ( -s $custom_enst_file ) or die "ERROR: The provided custom ENST file is missing or empty!\nPath: $custom_enst_file\n"; + %custom_enst = map{chomp; ( $_, 1 )}`grep -v ^# $custom_enst_file | cut -f1`; +} + # Annotate variants in given VCF to all possible transcripts, unless an annotated VCF was provided if( $vep_anno ) { ( -s $vep_anno ) or die "ERROR: Provided VEP-annotated VCF file is missing or empty!\nPath: $vep_anno\n"; @@ -644,8 +652,11 @@ sub GetBiotypePriority { my ( $effect_with_gene_name ) = grep { $_->{SYMBOL} } @all_effects; my $maf_gene = $effect_with_gene_name->{SYMBOL} if( $effect_with_gene_name ); + # If the gene has user-defined custom isoform overrides, choose that instead + ( $maf_effect ) = grep { $_->{SYMBOL} and $_->{SYMBOL} eq $maf_gene and $_->{Transcript_ID} and $custom_enst{$_->{Transcript_ID}} } @all_effects; + # Find the effect on the canonical transcript of that highest priority gene - ( $maf_effect ) = grep { $_->{SYMBOL} and $_->{SYMBOL} eq $maf_gene and $_->{CANONICAL} and $_->{CANONICAL} eq "YES" } @all_effects; + ( $maf_effect ) = grep { $_->{SYMBOL} and $_->{SYMBOL} eq $maf_gene and $_->{CANONICAL} and $_->{CANONICAL} eq "YES" } @all_effects unless( $maf_effect ); # If that gene has no canonical transcript tagged, choose the highest priority canonical effect on any gene ( $maf_effect ) = grep { $_->{CANONICAL} and $_->{CANONICAL} eq "YES" } @all_effects unless( $maf_effect ); @@ -725,8 +736,11 @@ sub GetBiotypePriority { my ( $effect_with_gene_name ) = grep { $_->{Gene_Name} } @all_effects; my $maf_gene = $effect_with_gene_name->{Gene_Name} if( $effect_with_gene_name ); + # If the gene has user-defined custom isoform overrides, choose that instead + ( $maf_effect ) = grep { $_->{Gene_Name} and $_->{Gene_Name} eq $maf_gene and $_->{Transcript_ID} and $custom_enst{$_->{Transcript_ID}} } @all_effects; + # Find the effect on the longest transcript of that highest priority gene - ( $maf_effect ) = sort { $b->{Amino_Acid_Length} <=> $a->{Amino_Acid_Length} } grep { $_->{Gene_Name} eq $maf_gene } @all_effects; + ( $maf_effect ) = sort { $b->{Amino_Acid_Length} <=> $a->{Amino_Acid_Length} } grep { $_->{Gene_Name} eq $maf_gene } @all_effects unless( $maf_effect ); } # Construct the MAF columns from the $maf_effect hash, and print to output @@ -779,7 +793,8 @@ sub GetBiotypePriority { my $effect_type = ( $effect->{Effect} ? $effect->{Effect} : $effect->{Consequence} ); my $protein_change = ( $effect->{HGVSp} ? $effect->{HGVSp} : '' ); my $transcript_id = ( $effect->{Transcript_ID} ? $effect->{Transcript_ID} : '' ); - $maf_line{all_effects} .= "$gene_name,$effect_type,$protein_change,$transcript_id;" if( $gene_name and $effect_type and $transcript_id ); + my $refseq_ids = ( $effect->{RefSeq} ? $effect->{RefSeq} : '' ); + $maf_line{all_effects} .= "$gene_name,$effect_type,$protein_change,$transcript_id,$refseq_ids;" if( $gene_name and $effect_type and $transcript_id ); } # At this point, we've generated all we can about this variant, so write it to the MAF @@ -836,6 +851,7 @@ =head1 OPTIONS --normal-id Matched_Norm_Sample_Barcode to report in the MAF [NORMAL] --vcf-tumor-id Tumor sample ID used in VCF's genotype columns [--tumor-id] --vcf-normal-id Matched normal ID used in VCF's genotype columns [--normal-id] + --custom-enst List of custom ENST IDs that override canonical selection --use-snpeff Use snpEff to annotate VCF, instead of the default VEP --vep-path Folder containing variant_effect_predictor.pl [~/vep] --vep-data VEP's base cache/plugin directory [~/.vep]