From c8aab2b0d00e4a8075118e5d44f8529ade63332f Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Tue, 16 Apr 2024 15:36:41 +0200 Subject: [PATCH 01/18] feat: add rename-contigs resources --- workflow/resources/rename-contigs/README.md | 13 + .../rename-contigs/grch37_ucsc2ensembl.txt | 93 ++++ .../rename-contigs/grch38_ucsc2ensembl.txt | 455 ++++++++++++++++++ 3 files changed, 561 insertions(+) create mode 100644 workflow/resources/rename-contigs/README.md create mode 100644 workflow/resources/rename-contigs/grch37_ucsc2ensembl.txt create mode 100644 workflow/resources/rename-contigs/grch38_ucsc2ensembl.txt diff --git a/workflow/resources/rename-contigs/README.md b/workflow/resources/rename-contigs/README.md new file mode 100644 index 0000000..c67f7eb --- /dev/null +++ b/workflow/resources/rename-contigs/README.md @@ -0,0 +1,13 @@ +# rename-contigs +The reference is denoted in Ensembl notation. If the caller is using UCSC notation, the contigs need to be renamed like +``` +chr12 +``` +becomes +``` +12 +``` + +Two files are provided: One for GRCh38 as reference genome and one for GRCh37. The path to those files needs to be denoted in the user config to enable the `rename_contigs` rule. + +The files were taken from [dpryan79/ChromosomeMappings](https://github.com/dpryan79/ChromosomeMappings). 
\ No newline at end of file diff --git a/workflow/resources/rename-contigs/grch37_ucsc2ensembl.txt b/workflow/resources/rename-contigs/grch37_ucsc2ensembl.txt new file mode 100644 index 0000000..c047d4b --- /dev/null +++ b/workflow/resources/rename-contigs/grch37_ucsc2ensembl.txt @@ -0,0 +1,93 @@ +chr1 1 +chr2 2 +chr3 3 +chr4 4 +chr5 5 +chr6 6 +chr7 7 +chr8 8 +chr9 9 +chr10 10 +chr11 11 +chr12 12 +chr13 13 +chr14 14 +chr15 15 +chr16 16 +chr17 17 +chr18 18 +chr19 19 +chr20 20 +chr21 21 +chr22 22 +chrX X +chrY Y +chrM MT +chr1_gl000191_random GL000191.1 +chr1_gl000192_random GL000192.1 +chr4_gl000193_random GL000193.1 +chr4_gl000194_random GL000194.1 +chr7_gl000195_random GL000195.1 +chr8_gl000196_random GL000196.1 +chr8_gl000197_random GL000197.1 +chr9_gl000198_random GL000198.1 +chr9_gl000199_random GL000199.1 +chr9_gl000200_random GL000200.1 +chr9_gl000201_random GL000201.1 +chr11_gl000202_random GL000202.1 +chr17_gl000203_random GL000203.1 +chr17_gl000204_random GL000204.1 +chr17_gl000205_random GL000205.1 +chr17_gl000206_random GL000206.1 +chr18_gl000207_random GL000207.1 +chr19_gl000208_random GL000208.1 +chr19_gl000209_random GL000209.1 +chr21_gl000210_random GL000210.1 +chrUn_gl000211 GL000211.1 +chrUn_gl000212 GL000212.1 +chrUn_gl000213 GL000213.1 +chrUn_gl000214 GL000214.1 +chrUn_gl000215 GL000215.1 +chrUn_gl000216 GL000216.1 +chrUn_gl000217 GL000217.1 +chrUn_gl000218 GL000218.1 +chrUn_gl000219 GL000219.1 +chrUn_gl000220 GL000220.1 +chrUn_gl000221 GL000221.1 +chrUn_gl000222 GL000222.1 +chrUn_gl000223 GL000223.1 +chrUn_gl000224 GL000224.1 +chrUn_gl000225 GL000225.1 +chrUn_gl000226 GL000226.1 +chrUn_gl000227 GL000227.1 +chrUn_gl000228 GL000228.1 +chrUn_gl000229 GL000229.1 +chrUn_gl000230 GL000230.1 +chrUn_gl000231 GL000231.1 +chrUn_gl000232 GL000232.1 +chrUn_gl000233 GL000233.1 +chrUn_gl000234 GL000234.1 +chrUn_gl000235 GL000235.1 +chrUn_gl000236 GL000236.1 +chrUn_gl000237 GL000237.1 +chrUn_gl000238 GL000238.1 +chrUn_gl000239 GL000239.1 +chrUn_gl000240 
GL000240.1 +chrUn_gl000241 GL000241.1 +chrUn_gl000242 GL000242.1 +chrUn_gl000243 GL000243.1 +chrUn_gl000244 GL000244.1 +chrUn_gl000245 GL000245.1 +chrUn_gl000246 GL000246.1 +chrUn_gl000247 GL000247.1 +chrUn_gl000248 GL000248.1 +chrUn_gl000249 GL000249.1 +chr4_ctg9_hap1 HSCHR4_1 +chr6_apd_hap1 HSCHR6_MHC_APD +chr6_cox_hap2 HSCHR6_MHC_COX +chr6_dbb_hap3 HSCHR6_MHC_DBB +chr6_mann_hap4 HSCHR6_MHC_MANN +chr6_mcf_hap5 HSCHR6_MHC_MCF +chr6_qbl_hap6 HSCHR6_MHC_QBL +chr6_ssto_hap7 HSCHR6_MHC_SSTO +chr17_ctg5_hap1 HSCHR17_1 diff --git a/workflow/resources/rename-contigs/grch38_ucsc2ensembl.txt b/workflow/resources/rename-contigs/grch38_ucsc2ensembl.txt new file mode 100644 index 0000000..899e195 --- /dev/null +++ b/workflow/resources/rename-contigs/grch38_ucsc2ensembl.txt @@ -0,0 +1,455 @@ +chr1 1 +chr10 10 +chr10_GL383545v1_alt +chr10_GL383546v1_alt +chr10_KI270824v1_alt +chr10_KI270825v1_alt +chr11 11 +chr11_GL383547v1_alt +chr11_JH159136v1_alt +chr11_JH159137v1_alt +chr11_KI270721v1_random KI270721.1 +chr11_KI270826v1_alt +chr11_KI270827v1_alt +chr11_KI270829v1_alt +chr11_KI270830v1_alt +chr11_KI270831v1_alt +chr11_KI270832v1_alt +chr11_KI270902v1_alt +chr11_KI270903v1_alt +chr11_KI270927v1_alt +chr12 12 +chr12_GL383549v1_alt +chr12_GL383550v2_alt +chr12_GL383551v1_alt +chr12_GL383552v1_alt +chr12_GL383553v2_alt +chr12_GL877875v1_alt +chr12_GL877876v1_alt +chr12_KI270833v1_alt +chr12_KI270834v1_alt +chr12_KI270835v1_alt +chr12_KI270836v1_alt +chr12_KI270837v1_alt +chr12_KI270904v1_alt +chr13 13 +chr13_KI270838v1_alt +chr13_KI270839v1_alt +chr13_KI270840v1_alt +chr13_KI270841v1_alt +chr13_KI270842v1_alt +chr13_KI270843v1_alt +chr14 14 +chr14_GL000009v2_random GL000009.2 +chr14_GL000194v1_random GL000194.1 +chr14_GL000225v1_random GL000225.1 +chr14_KI270722v1_random KI270722.1 +chr14_KI270723v1_random KI270723.1 +chr14_KI270724v1_random KI270724.1 +chr14_KI270725v1_random KI270725.1 +chr14_KI270726v1_random KI270726.1 +chr14_KI270844v1_alt +chr14_KI270845v1_alt 
+chr14_KI270846v1_alt +chr14_KI270847v1_alt +chr15 15 +chr15_GL383554v1_alt +chr15_GL383555v2_alt +chr15_KI270727v1_random KI270727.1 +chr15_KI270848v1_alt +chr15_KI270849v1_alt +chr15_KI270850v1_alt +chr15_KI270851v1_alt +chr15_KI270852v1_alt +chr15_KI270905v1_alt +chr15_KI270906v1_alt +chr16 16 +chr16_GL383556v1_alt +chr16_GL383557v1_alt +chr16_KI270728v1_random KI270728.1 +chr16_KI270853v1_alt +chr16_KI270854v1_alt +chr16_KI270855v1_alt +chr16_KI270856v1_alt +chr17 17 +chr17_GL000205v2_random GL000205.2 +chr17_GL000258v2_alt +chr17_GL383563v3_alt +chr17_GL383564v2_alt +chr17_GL383565v1_alt +chr17_GL383566v1_alt +chr17_JH159146v1_alt +chr17_JH159147v1_alt +chr17_JH159148v1_alt +chr17_KI270729v1_random KI270729.1 +chr17_KI270730v1_random KI270730.1 +chr17_KI270857v1_alt +chr17_KI270858v1_alt +chr17_KI270859v1_alt +chr17_KI270860v1_alt +chr17_KI270861v1_alt +chr17_KI270862v1_alt +chr17_KI270907v1_alt +chr17_KI270908v1_alt +chr17_KI270909v1_alt +chr17_KI270910v1_alt +chr18 18 +chr18_GL383567v1_alt +chr18_GL383568v1_alt +chr18_GL383569v1_alt +chr18_GL383570v1_alt +chr18_GL383571v1_alt +chr18_GL383572v1_alt +chr18_KI270863v1_alt +chr18_KI270864v1_alt +chr18_KI270911v1_alt +chr18_KI270912v1_alt +chr19 19 +chr19_GL000209v2_alt +chr19_GL383573v1_alt +chr19_GL383574v1_alt +chr19_GL383575v2_alt +chr19_GL383576v1_alt +chr19_GL949746v1_alt +chr19_GL949747v2_alt +chr19_GL949748v2_alt +chr19_GL949749v2_alt +chr19_GL949750v2_alt +chr19_GL949751v2_alt +chr19_GL949752v1_alt +chr19_GL949753v2_alt +chr19_KI270865v1_alt +chr19_KI270866v1_alt +chr19_KI270867v1_alt +chr19_KI270868v1_alt +chr19_KI270882v1_alt +chr19_KI270883v1_alt +chr19_KI270884v1_alt +chr19_KI270885v1_alt +chr19_KI270886v1_alt +chr19_KI270887v1_alt +chr19_KI270888v1_alt +chr19_KI270889v1_alt +chr19_KI270890v1_alt +chr19_KI270891v1_alt +chr19_KI270914v1_alt +chr19_KI270915v1_alt +chr19_KI270916v1_alt +chr19_KI270917v1_alt +chr19_KI270918v1_alt +chr19_KI270919v1_alt +chr19_KI270920v1_alt +chr19_KI270921v1_alt 
+chr19_KI270922v1_alt +chr19_KI270923v1_alt +chr19_KI270929v1_alt +chr19_KI270930v1_alt +chr19_KI270931v1_alt +chr19_KI270932v1_alt +chr19_KI270933v1_alt +chr19_KI270938v1_alt +chr1_GL383518v1_alt +chr1_GL383519v1_alt +chr1_GL383520v2_alt +chr1_KI270706v1_random KI270706.1 +chr1_KI270707v1_random KI270707.1 +chr1_KI270708v1_random KI270708.1 +chr1_KI270709v1_random KI270709.1 +chr1_KI270710v1_random KI270710.1 +chr1_KI270711v1_random KI270711.1 +chr1_KI270712v1_random KI270712.1 +chr1_KI270713v1_random KI270713.1 +chr1_KI270714v1_random KI270714.1 +chr1_KI270759v1_alt +chr1_KI270760v1_alt +chr1_KI270761v1_alt +chr1_KI270762v1_alt +chr1_KI270763v1_alt +chr1_KI270764v1_alt +chr1_KI270765v1_alt +chr1_KI270766v1_alt +chr1_KI270892v1_alt +chr2 2 +chr20 20 +chr20_GL383577v2_alt +chr20_KI270869v1_alt +chr20_KI270870v1_alt +chr20_KI270871v1_alt +chr21 21 +chr21_GL383578v2_alt +chr21_GL383579v2_alt +chr21_GL383580v2_alt +chr21_GL383581v2_alt +chr21_KI270872v1_alt +chr21_KI270873v1_alt +chr21_KI270874v1_alt +chr22 22 +chr22_GL383582v2_alt +chr22_GL383583v2_alt +chr22_KB663609v1_alt +chr22_KI270731v1_random KI270731.1 +chr22_KI270732v1_random KI270732.1 +chr22_KI270733v1_random KI270733.1 +chr22_KI270734v1_random KI270734.1 +chr22_KI270735v1_random KI270735.1 +chr22_KI270736v1_random KI270736.1 +chr22_KI270737v1_random KI270737.1 +chr22_KI270738v1_random KI270738.1 +chr22_KI270739v1_random KI270739.1 +chr22_KI270875v1_alt +chr22_KI270876v1_alt +chr22_KI270877v1_alt +chr22_KI270878v1_alt +chr22_KI270879v1_alt +chr22_KI270928v1_alt +chr2_GL383521v1_alt +chr2_GL383522v1_alt +chr2_GL582966v2_alt +chr2_KI270715v1_random KI270715.1 +chr2_KI270716v1_random KI270716.1 +chr2_KI270767v1_alt +chr2_KI270768v1_alt +chr2_KI270769v1_alt +chr2_KI270770v1_alt +chr2_KI270771v1_alt +chr2_KI270772v1_alt +chr2_KI270773v1_alt +chr2_KI270774v1_alt +chr2_KI270775v1_alt +chr2_KI270776v1_alt +chr2_KI270893v1_alt +chr2_KI270894v1_alt +chr3 3 +chr3_GL000221v1_random GL000221.1 +chr3_GL383526v1_alt 
+chr3_JH636055v2_alt +chr3_KI270777v1_alt +chr3_KI270778v1_alt +chr3_KI270779v1_alt +chr3_KI270780v1_alt +chr3_KI270781v1_alt +chr3_KI270782v1_alt +chr3_KI270783v1_alt +chr3_KI270784v1_alt +chr3_KI270895v1_alt +chr3_KI270924v1_alt +chr3_KI270934v1_alt +chr3_KI270935v1_alt +chr3_KI270936v1_alt +chr3_KI270937v1_alt +chr4 4 +chr4_GL000008v2_random GL000008.2 +chr4_GL000257v2_alt +chr4_GL383527v1_alt +chr4_GL383528v1_alt +chr4_KI270785v1_alt +chr4_KI270786v1_alt +chr4_KI270787v1_alt +chr4_KI270788v1_alt +chr4_KI270789v1_alt +chr4_KI270790v1_alt +chr4_KI270896v1_alt +chr4_KI270925v1_alt +chr5 5 +chr5_GL000208v1_random GL000208.1 +chr5_GL339449v2_alt +chr5_GL383530v1_alt +chr5_GL383531v1_alt +chr5_GL383532v1_alt +chr5_GL949742v1_alt +chr5_KI270791v1_alt +chr5_KI270792v1_alt +chr5_KI270793v1_alt +chr5_KI270794v1_alt +chr5_KI270795v1_alt +chr5_KI270796v1_alt +chr5_KI270897v1_alt +chr5_KI270898v1_alt +chr6 6 +chr6_GL000250v2_alt +chr6_GL000251v2_alt +chr6_GL000252v2_alt +chr6_GL000253v2_alt +chr6_GL000254v2_alt +chr6_GL000255v2_alt +chr6_GL000256v2_alt +chr6_GL383533v1_alt +chr6_KB021644v2_alt +chr6_KI270758v1_alt +chr6_KI270797v1_alt +chr6_KI270798v1_alt +chr6_KI270799v1_alt +chr6_KI270800v1_alt +chr6_KI270801v1_alt +chr6_KI270802v1_alt +chr7 7 +chr7_GL383534v2_alt +chr7_KI270803v1_alt +chr7_KI270804v1_alt +chr7_KI270805v1_alt +chr7_KI270806v1_alt +chr7_KI270807v1_alt +chr7_KI270808v1_alt +chr7_KI270809v1_alt +chr7_KI270899v1_alt +chr8 8 +chr8_KI270810v1_alt +chr8_KI270811v1_alt +chr8_KI270812v1_alt +chr8_KI270813v1_alt +chr8_KI270814v1_alt +chr8_KI270815v1_alt +chr8_KI270816v1_alt +chr8_KI270817v1_alt +chr8_KI270818v1_alt +chr8_KI270819v1_alt +chr8_KI270820v1_alt +chr8_KI270821v1_alt +chr8_KI270822v1_alt +chr8_KI270900v1_alt +chr8_KI270901v1_alt +chr8_KI270926v1_alt +chr9 9 +chr9_GL383539v1_alt +chr9_GL383540v1_alt +chr9_GL383541v1_alt +chr9_GL383542v1_alt +chr9_KI270717v1_random KI270717.1 +chr9_KI270718v1_random KI270718.1 +chr9_KI270719v1_random KI270719.1 
+chr9_KI270720v1_random KI270720.1 +chr9_KI270823v1_alt +chrM MT +chrUn_GL000195v1 GL000195.1 +chrUn_GL000213v1 GL000213.1 +chrUn_GL000214v1 GL000214.1 +chrUn_GL000216v2 GL000216.2 +chrUn_GL000218v1 GL000218.1 +chrUn_GL000219v1 GL000219.1 +chrUn_GL000220v1 GL000220.1 +chrUn_GL000224v1 GL000224.1 +chrUn_GL000226v1 GL000226.1 +chrUn_KI270302v1 KI270302.1 +chrUn_KI270303v1 KI270303.1 +chrUn_KI270304v1 KI270304.1 +chrUn_KI270305v1 KI270305.1 +chrUn_KI270310v1 KI270310.1 +chrUn_KI270311v1 KI270311.1 +chrUn_KI270312v1 KI270312.1 +chrUn_KI270315v1 KI270315.1 +chrUn_KI270316v1 KI270316.1 +chrUn_KI270317v1 KI270317.1 +chrUn_KI270320v1 KI270320.1 +chrUn_KI270322v1 KI270322.1 +chrUn_KI270329v1 KI270329.1 +chrUn_KI270330v1 KI270330.1 +chrUn_KI270333v1 KI270333.1 +chrUn_KI270334v1 KI270334.1 +chrUn_KI270335v1 KI270335.1 +chrUn_KI270336v1 KI270336.1 +chrUn_KI270337v1 KI270337.1 +chrUn_KI270338v1 KI270338.1 +chrUn_KI270340v1 KI270340.1 +chrUn_KI270362v1 KI270362.1 +chrUn_KI270363v1 KI270363.1 +chrUn_KI270364v1 KI270364.1 +chrUn_KI270366v1 KI270366.1 +chrUn_KI270371v1 KI270371.1 +chrUn_KI270372v1 KI270372.1 +chrUn_KI270373v1 KI270373.1 +chrUn_KI270374v1 KI270374.1 +chrUn_KI270375v1 KI270375.1 +chrUn_KI270376v1 KI270376.1 +chrUn_KI270378v1 KI270378.1 +chrUn_KI270379v1 KI270379.1 +chrUn_KI270381v1 KI270381.1 +chrUn_KI270382v1 KI270382.1 +chrUn_KI270383v1 KI270383.1 +chrUn_KI270384v1 KI270384.1 +chrUn_KI270385v1 KI270385.1 +chrUn_KI270386v1 KI270386.1 +chrUn_KI270387v1 KI270387.1 +chrUn_KI270388v1 KI270388.1 +chrUn_KI270389v1 KI270389.1 +chrUn_KI270390v1 KI270390.1 +chrUn_KI270391v1 KI270391.1 +chrUn_KI270392v1 KI270392.1 +chrUn_KI270393v1 KI270393.1 +chrUn_KI270394v1 KI270394.1 +chrUn_KI270395v1 KI270395.1 +chrUn_KI270396v1 KI270396.1 +chrUn_KI270411v1 KI270411.1 +chrUn_KI270412v1 KI270412.1 +chrUn_KI270414v1 KI270414.1 +chrUn_KI270417v1 KI270417.1 +chrUn_KI270418v1 KI270418.1 +chrUn_KI270419v1 KI270419.1 +chrUn_KI270420v1 KI270420.1 +chrUn_KI270422v1 KI270422.1 +chrUn_KI270423v1 
KI270423.1 +chrUn_KI270424v1 KI270424.1 +chrUn_KI270425v1 KI270425.1 +chrUn_KI270429v1 KI270429.1 +chrUn_KI270435v1 KI270435.1 +chrUn_KI270438v1 KI270438.1 +chrUn_KI270442v1 KI270442.1 +chrUn_KI270448v1 KI270448.1 +chrUn_KI270465v1 KI270465.1 +chrUn_KI270466v1 KI270466.1 +chrUn_KI270467v1 KI270467.1 +chrUn_KI270468v1 KI270468.1 +chrUn_KI270507v1 KI270507.1 +chrUn_KI270508v1 KI270508.1 +chrUn_KI270509v1 KI270509.1 +chrUn_KI270510v1 KI270510.1 +chrUn_KI270511v1 KI270511.1 +chrUn_KI270512v1 KI270512.1 +chrUn_KI270515v1 KI270515.1 +chrUn_KI270516v1 KI270516.1 +chrUn_KI270517v1 KI270517.1 +chrUn_KI270518v1 KI270518.1 +chrUn_KI270519v1 KI270519.1 +chrUn_KI270521v1 KI270521.1 +chrUn_KI270522v1 KI270522.1 +chrUn_KI270528v1 KI270528.1 +chrUn_KI270529v1 KI270529.1 +chrUn_KI270530v1 KI270530.1 +chrUn_KI270538v1 KI270538.1 +chrUn_KI270539v1 KI270539.1 +chrUn_KI270544v1 KI270544.1 +chrUn_KI270548v1 KI270548.1 +chrUn_KI270579v1 KI270579.1 +chrUn_KI270580v1 KI270580.1 +chrUn_KI270581v1 KI270581.1 +chrUn_KI270582v1 KI270582.1 +chrUn_KI270583v1 KI270583.1 +chrUn_KI270584v1 KI270584.1 +chrUn_KI270587v1 KI270587.1 +chrUn_KI270588v1 KI270588.1 +chrUn_KI270589v1 KI270589.1 +chrUn_KI270590v1 KI270590.1 +chrUn_KI270591v1 KI270591.1 +chrUn_KI270593v1 KI270593.1 +chrUn_KI270741v1 KI270741.1 +chrUn_KI270742v1 KI270742.1 +chrUn_KI270743v1 KI270743.1 +chrUn_KI270744v1 KI270744.1 +chrUn_KI270745v1 KI270745.1 +chrUn_KI270746v1 KI270746.1 +chrUn_KI270747v1 KI270747.1 +chrUn_KI270748v1 KI270748.1 +chrUn_KI270749v1 KI270749.1 +chrUn_KI270750v1 KI270750.1 +chrUn_KI270751v1 KI270751.1 +chrUn_KI270752v1 KI270752.1 +chrUn_KI270753v1 KI270753.1 +chrUn_KI270754v1 KI270754.1 +chrUn_KI270755v1 KI270755.1 +chrUn_KI270756v1 KI270756.1 +chrUn_KI270757v1 KI270757.1 +chrX X +chrX_KI270880v1_alt +chrX_KI270881v1_alt +chrX_KI270913v1_alt +chrY Y +chrY_KI270740v1_random KI270740.1 From 5039a8fd7dd109c9bfd472ed0a97dd7842b17f9b Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Tue, 16 Apr 2024 15:44:35 +0200 
Subject: [PATCH 02/18] fix: make normalize_calls standalone and add bedtools intersect --- workflow/rules/eval.smk | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index b7eabb7..03358b3 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -60,15 +60,23 @@ rule remove_non_pass: "v3.3.6/bio/bcftools/view" -use rule normalize_truth as normalize_calls with: +rule normalize_calls: input: - "results/filtered-variants/{callset}.bcf", + bcf="results/filtered-variants/{callset}.bcf", ref="resources/reference/genome.fasta", ref_index="resources/reference/genome.fasta.fai", + regions=get_test_regions, output: "results/normalized-variants/{callset}.vcf.gz", + params: + get_norm_params, log: "logs/normalize-calls/{callset}.log", + conda: + "../envs/tools.yaml" + shell: + "(bedtools intersect -b {input.regions} -a " + "<(bcftools norm {params} 2> {log}" rule stratify_truth: From 53401a40680913ea6cf47f5068b3c3646404a038 Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Tue, 16 Apr 2024 15:52:30 +0200 Subject: [PATCH 03/18] fix: output in vcf.gz --- workflow/rules/eval.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 03358b3..6eddad4 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -76,7 +76,8 @@ rule normalize_calls: "../envs/tools.yaml" shell: "(bedtools intersect -b {input.regions} -a " - "<(bcftools norm {params} 2> {log}" + "<(bcftools norm {params} {input.bcf} ) | " + "bcftools view -Oz > {output}) 2> {log}" rule stratify_truth: From a6c6c60492b341361d9e55e1df47f7678598660c Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Tue, 16 Apr 2024 16:39:25 +0200 Subject: [PATCH 04/18] test: norm params --- workflow/rules/common.smk | 9 +++++++-- workflow/rules/eval.smk | 9 +++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/workflow/rules/common.smk 
b/workflow/rules/common.smk index 008d1b5..042d3fc 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -299,8 +299,13 @@ def get_stratified_truth(suffix=""): def get_confidence_regions(wildcards): - benchmark = get_benchmark(wildcards.benchmark) - return f"resources/regions/{benchmark['genome']}.confidence-regions.bed" + if hasattr(wildcards, "benchmark"): + benchmark = get_benchmark(wildcards.benchmark) + return f"resources/regions/{benchmark['genome']}.confidence-regions.bed" + else: + benchmark_name = config["variant-calls"][wildcards.callset]["benchmark"] + benchmark = get_benchmark(benchmark_name) + return f"resources/regions/{benchmark['genome']}.confidence-regions.bed" def get_test_regions(wildcards): diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 6eddad4..149e9ae 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -65,18 +65,19 @@ rule normalize_calls: bcf="results/filtered-variants/{callset}.bcf", ref="resources/reference/genome.fasta", ref_index="resources/reference/genome.fasta.fai", - regions=get_test_regions, + regions=get_confidence_regions, output: "results/normalized-variants/{callset}.vcf.gz", - params: - get_norm_params, + #params: + # get_norm_params, log: "logs/normalize-calls/{callset}.log", conda: "../envs/tools.yaml" shell: "(bedtools intersect -b {input.regions} -a " - "<(bcftools norm {params} {input.bcf} ) | " + #"<(bcftools norm {params} {input.bcf} ) | " + "<(bcftools norm --atomize --check-ref s --fasta-ref {input.ref} --rm-dup exact {input.bcf} ) | " "bcftools view -Oz > {output}) 2> {log}" From 58c8b2e1fe7e05600477b036d7d33cf63a6da265 Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Tue, 16 Apr 2024 16:58:14 +0200 Subject: [PATCH 05/18] fix: correct commands --- workflow/rules/eval.smk | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 149e9ae..c412761 100644 --- a/workflow/rules/eval.smk +++ 
b/workflow/rules/eval.smk @@ -68,7 +68,7 @@ rule normalize_calls: regions=get_confidence_regions, output: "results/normalized-variants/{callset}.vcf.gz", - #params: + # params: # get_norm_params, log: "logs/normalize-calls/{callset}.log", @@ -76,8 +76,7 @@ rule normalize_calls: "../envs/tools.yaml" shell: "(bedtools intersect -b {input.regions} -a " - #"<(bcftools norm {params} {input.bcf} ) | " - "<(bcftools norm --atomize --check-ref s --fasta-ref {input.ref} --rm-dup exact {input.bcf} ) | " + "<(bcftools norm --atomize --check-ref s --fasta-ref {input.ref} --rm-dup exact {input.bcf}) -wa -f 1.0 -header | " "bcftools view -Oz > {output}) 2> {log}" From 1415c8917a2dbdffb9833c14ae54199d642d5032 Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Tue, 16 Apr 2024 17:21:53 +0200 Subject: [PATCH 06/18] test: switch from confidence to target regions --- workflow/rules/common.smk | 16 ++++++++++++---- workflow/rules/eval.smk | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 84f71a9..5a3cf3e 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -229,11 +229,19 @@ def get_target_bed_statement(wildcards): def get_target_regions(wildcards): - benchmark = get_benchmark(wildcards.benchmark) - if "target-regions" in benchmark: - return "resources/regions/{benchmark}/target-regions.bed" + if hasattr(wildcards, "benchmark"): + benchmark = get_benchmark(wildcards.benchmark) + if "target-regions" in benchmark: + return "resources/regions/{benchmark}/target-regions.bed" + else: + return [] else: - return [] + benchmark_name = config["variant-calls"][wildcards.callset]["benchmark"] + benchmark = get_benchmark(benchmark_name) + if "target-regions" in benchmark: + return "resources/regions/" + benchmark_name + "/target-regions.bed" + else: + return [] def get_target_regions_intersect_statement(wildcards, input): diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 
c412761..b0443f6 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -65,7 +65,7 @@ rule normalize_calls: bcf="results/filtered-variants/{callset}.bcf", ref="resources/reference/genome.fasta", ref_index="resources/reference/genome.fasta.fai", - regions=get_confidence_regions, + regions=get_target_regions, output: "results/normalized-variants/{callset}.vcf.gz", # params: From 1824a5b4610e434e04888b2fb6eba74c8bffe9e7 Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Tue, 16 Apr 2024 17:27:58 +0200 Subject: [PATCH 07/18] fix: linting --- workflow/rules/common.smk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 5a3cf3e..e3aae7a 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -239,7 +239,10 @@ def get_target_regions(wildcards): benchmark_name = config["variant-calls"][wildcards.callset]["benchmark"] benchmark = get_benchmark(benchmark_name) if "target-regions" in benchmark: - return "resources/regions/" + benchmark_name + "/target-regions.bed" + # TODO use f-string when this is fixed: https://github.com/snakemake/snakefmt/issues/215 + return "resources/regions/{benchmark_name}/target-regions.bed".format( + benchmark_name=benchmark_name + ) else: return [] From 8eec88b0e7e240583725cb37755c64364d6431ff Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Wed, 17 Apr 2024 11:42:52 +0200 Subject: [PATCH 08/18] fix: start with intersect and pipe into norm --- workflow/rules/eval.smk | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index b0443f6..65bfa80 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -65,18 +65,17 @@ rule normalize_calls: bcf="results/filtered-variants/{callset}.bcf", ref="resources/reference/genome.fasta", ref_index="resources/reference/genome.fasta.fai", - regions=get_target_regions, + regions=get_confidence_regions, output: 
"results/normalized-variants/{callset}.vcf.gz", - # params: - # get_norm_params, log: "logs/normalize-calls/{callset}.log", conda: "../envs/tools.yaml" shell: "(bedtools intersect -b {input.regions} -a " - "<(bcftools norm --atomize --check-ref s --fasta-ref {input.ref} --rm-dup exact {input.bcf}) -wa -f 1.0 -header | " + "<(bcftools view -Oz {input.bcf}) -wa -f 1.0 -header | " + "bcftools norm --atomize --check-ref s --fasta-ref {input.ref} --rm-dup exact | " "bcftools view -Oz > {output}) 2> {log}" From b68c6ba26e129774854033cb90954bb922d4224c Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Wed, 17 Apr 2024 12:33:55 +0200 Subject: [PATCH 09/18] test: giab dummy --- .test/config/config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 7ba2a75..b8132ec 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -1,7 +1,7 @@ variant-calls: - dummy-giab: - path: "resources/variants/na12878/all.truth.bcf" - benchmark: "giab-na12878-exome" + # dummy-giab: + # path: "resources/variants/na12878/all.truth.bcf" + # benchmark: "giab-na12878-exome" dummy-chm: path: "resources/variants/chm-eval/all.truth.bcf" benchmark: "chm-eval" From e66c82e4b135e974cd783141efd03a42d9f0dc76 Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Wed, 17 Apr 2024 13:09:17 +0200 Subject: [PATCH 10/18] test: target instead of confidence regions --- workflow/rules/eval.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 65bfa80..846fa99 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -65,7 +65,7 @@ rule normalize_calls: bcf="results/filtered-variants/{callset}.bcf", ref="resources/reference/genome.fasta", ref_index="resources/reference/genome.fasta.fai", - regions=get_confidence_regions, + regions=get_target_regions, output: "results/normalized-variants/{callset}.vcf.gz", log: From 
d495233c328978981698cde4cc1eee018d3fa082 Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Wed, 17 Apr 2024 13:35:32 +0200 Subject: [PATCH 11/18] fix: remove output from view --- workflow/rules/eval.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 846fa99..002472c 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -74,7 +74,7 @@ rule normalize_calls: "../envs/tools.yaml" shell: "(bedtools intersect -b {input.regions} -a " - "<(bcftools view -Oz {input.bcf}) -wa -f 1.0 -header | " + "<(bcftools view {input.bcf}) -wa -f 1.0 -header | " "bcftools norm --atomize --check-ref s --fasta-ref {input.ref} --rm-dup exact | " "bcftools view -Oz > {output}) 2> {log}" From 9320bdef064f13d9dedb2241709ab1600d49553f Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Wed, 17 Apr 2024 14:07:45 +0200 Subject: [PATCH 12/18] fix: add giab dummy again --- .test/config/config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index b8132ec..7ba2a75 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -1,7 +1,7 @@ variant-calls: - # dummy-giab: - # path: "resources/variants/na12878/all.truth.bcf" - # benchmark: "giab-na12878-exome" + dummy-giab: + path: "resources/variants/na12878/all.truth.bcf" + benchmark: "giab-na12878-exome" dummy-chm: path: "resources/variants/chm-eval/all.truth.bcf" benchmark: "chm-eval" From 50a3fe4e6dbfcf27196a5b8f270957b1471235ad Mon Sep 17 00:00:00 2001 From: BiancaStoecker Date: Wed, 17 Apr 2024 20:20:42 +0000 Subject: [PATCH 13/18] fix: added get_norm_params because of extra param for limit reads in test cases --- workflow/rules/eval.smk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 002472c..997b296 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -41,7 +41,7 @@ rule 
add_format_field: "../envs/vatools.yaml" shell: # TODO: Optional - Check first if FORMAT field is present for example with - # TODO: bcftools view -h out.vcf.gz | grep FORMAT oder bcftools query -l all.bcf + # TODO: bcftools view -h out.vcf.gz | grep FORMAT oder bcftools query -l all.bcf # bcftools convert makes sure that input for vcf-genotype-annotator is in vcf format # adds FORMAT field with GT field and sample name 'truth' "vcf-genotype-annotator <(bcftools convert -Ov {input}) truth 0/1 -o {output} &> {log}" @@ -68,6 +68,8 @@ rule normalize_calls: regions=get_target_regions, output: "results/normalized-variants/{callset}.vcf.gz", + params: + extra=get_norm_params, log: "logs/normalize-calls/{callset}.log", conda: @@ -75,7 +77,7 @@ rule normalize_calls: shell: "(bedtools intersect -b {input.regions} -a " "<(bcftools view {input.bcf}) -wa -f 1.0 -header | " - "bcftools norm --atomize --check-ref s --fasta-ref {input.ref} --rm-dup exact | " + "bcftools norm {params.extra} --fasta-ref {input.ref} | " "bcftools view -Oz > {output}) 2> {log}" From 9800da9aae747505e57ac55ced65b1d190820c7c Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Thu, 18 Apr 2024 10:37:48 +0200 Subject: [PATCH 14/18] fix: remove unused checks for confidence regions --- workflow/rules/common.smk | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index a962329..f0d4483 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -301,13 +301,8 @@ def get_stratified_truth(suffix=""): def get_confidence_regions(wildcards): - if hasattr(wildcards, "benchmark"): - benchmark = get_benchmark(wildcards.benchmark) - return f"resources/regions/{benchmark['genome']}.confidence-regions.bed" - else: - benchmark_name = config["variant-calls"][wildcards.callset]["benchmark"] - benchmark = get_benchmark(benchmark_name) - return f"resources/regions/{benchmark['genome']}.confidence-regions.bed" + benchmark = 
get_benchmark(wildcards.benchmark) + return f"resources/regions/{benchmark['genome']}.confidence-regions.bed" def get_test_regions(wildcards): From 866e281d9df531485dc607be4ae8fbf03a51b0e8 Mon Sep 17 00:00:00 2001 From: Famke Baeuerle Date: Mon, 22 Apr 2024 11:39:39 +0200 Subject: [PATCH 15/18] fix: f String use --- workflow/rules/common.smk | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index f0d4483..57d5168 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -230,10 +230,7 @@ def get_target_regions(wildcards): benchmark_name = config["variant-calls"][wildcards.callset]["benchmark"] benchmark = get_benchmark(benchmark_name) if "target-regions" in benchmark: - # TODO use f-string when this is fixed: https://github.com/snakemake/snakefmt/issues/215 - return "resources/regions/{benchmark_name}/target-regions.bed".format( - benchmark_name=benchmark_name - ) + return "resources/regions/{benchmark_name}/target-regions.bed" else: return [] @@ -292,10 +289,7 @@ def get_benchmark_truth(wildcards): def get_stratified_truth(suffix=""): def inner(wildcards): benchmark = config["variant-calls"][wildcards.callset]["benchmark"] - # TODO use f-string when this is fixed: https://github.com/snakemake/snakefmt/issues/215 - return "results/variants/{benchmark}.truth.cov-{{cov}}.vcf.gz{suffix}".format( - benchmark=benchmark, suffix=suffix - ) + return f"results/variants/{benchmark}.truth.cov-{{cov}}.vcf.gz{suffix}" return inner @@ -307,10 +301,7 @@ def get_confidence_regions(wildcards): def get_test_regions(wildcards): benchmark = config["variant-calls"][wildcards.callset]["benchmark"] - # TODO use f-string when this is fixed: https://github.com/snakemake/snakefmt/issues/215 - return "resources/regions/{benchmark}/test-regions.cov-{{cov}}.bed".format( - benchmark=benchmark - ) + return f"resources/regions/{benchmark}/test-regions.cov-{{cov}}.bed" def 
get_rename_contig_file(wildcards): @@ -379,10 +370,8 @@ def get_somatic_flag(wildcards): sample_name_callset = config["variant-calls"][wildcards.callset][ "tumor_sample_name" ] # get name tumor via config -> from dict - # TODO use f-string when this is fixed: https://github.com/snakemake/snakefmt/issues/215 - somatic_flag = "--squash-ploidy --sample {sample_name_baseline},{sample_name_callset}".format( - sample_name_baseline=sample_name_baseline, - sample_name_callset=sample_name_callset, + somatic_flag = ( + f"--squash-ploidy --sample {sample_name_baseline},{sample_name_callset}" ) else: somatic_flag = "" From 0ce5adb08fcf89a92d1c6343a94b042f4c7de6b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20B=C3=A4uerle?= <45968370+famosab@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:03:28 +0200 Subject: [PATCH 16/18] fix: f string use for benchmark_name --- workflow/rules/common.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 57d5168..918a542 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -223,14 +223,14 @@ def get_target_regions(wildcards): if hasattr(wildcards, "benchmark"): benchmark = get_benchmark(wildcards.benchmark) if "target-regions" in benchmark: - return "resources/regions/{benchmark}/target-regions.bed" + return f"resources/regions/{benchmark}/target-regions.bed" else: return [] else: benchmark_name = config["variant-calls"][wildcards.callset]["benchmark"] benchmark = get_benchmark(benchmark_name) if "target-regions" in benchmark: - return "resources/regions/{benchmark_name}/target-regions.bed" + return f"resources/regions/{benchmark_name}/target-regions.bed" else: return [] From 78508ed93deb899e27bc6dc9450b9cbebd4b7926 Mon Sep 17 00:00:00 2001 From: BiancaStoecker Date: Mon, 22 Apr 2024 10:25:13 +0000 Subject: [PATCH 17/18] fix: fix benchmark name --- workflow/rules/common.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/workflow/rules/common.smk b/workflow/rules/common.smk index 918a542..f76add7 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -221,8 +221,8 @@ def get_target_bed_statement(wildcards): def get_target_regions(wildcards): if hasattr(wildcards, "benchmark"): - benchmark = get_benchmark(wildcards.benchmark) - if "target-regions" in benchmark: + benchmark_dict = get_benchmark(wildcards.benchmark) + if "target-regions" in benchmark_dict: return f"resources/regions/{benchmark}/target-regions.bed" else: return [] From 4379e55ca92b1b9092f067b298235616c0b86ca1 Mon Sep 17 00:00:00 2001 From: BiancaStoecker Date: Mon, 22 Apr 2024 10:32:19 +0000 Subject: [PATCH 18/18] fix: fix previous commit --- workflow/rules/common.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index f76add7..5328c9e 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -223,7 +223,7 @@ def get_target_regions(wildcards): if hasattr(wildcards, "benchmark"): benchmark_dict = get_benchmark(wildcards.benchmark) if "target-regions" in benchmark_dict: - return f"resources/regions/{benchmark}/target-regions.bed" + return f"resources/regions/{wildcards.benchmark}/target-regions.bed" else: return [] else: