diff --git a/bcbio/chipseq/antibodies.py b/bcbio/chipseq/antibodies.py new file mode 100644 index 000000000..6d2699c7a --- /dev/null +++ b/bcbio/chipseq/antibodies.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass + +VALID_PEAKTYPES = ["narrow", "broad"] + +@dataclass +class Antibody: + """ + ChIP-seq antibody + """ + name: str + # call narrow or broad peaks + peaktype: str + # remove duplicates? + rmdup: bool = True + + def __post_init__(self): + if self.peaktype not in VALID_PEAKTYPES: + raise TypeError(f"peaktype {self.peatktype} is not one of {VALID_PEAKTYPES}") + +_ANTIBODIES = [ + Antibody("h3f3a", "broad"), + Antibody("h3k27me3", "broad"), + Antibody("h3k36me3", "broad"), + Antibody("h3k4me1", "broad"), + Antibody("h3k79me2", "broad"), + Antibody("h3k79me3", "broad"), + Antibody("h3k9me1", "broad"), + Antibody("h3k9me2", "broad"), + Antibody("h4k20me1", "broad"), + Antibody("h2afz", "narrow"), + Antibody("h3ac", "narrow"), + Antibody("h3k27ac", "narrow"), + Antibody("h3k4me2", "narrow"), + Antibody("h3k4me3", "narrow"), + Antibody("h3k9ac", "narrow"), + Antibody("h3k9me3", "broad", False) +] + +ANTIBODIES = {x.name: x for x in _ANTIBODIES} +SUPPORTED_ANTIBODIES = {x.name for x in _ANTIBODIES} diff --git a/bcbio/chipseq/macs2.py b/bcbio/chipseq/macs2.py index d73c2553c..ef8350a24 100644 --- a/bcbio/chipseq/macs2.py +++ b/bcbio/chipseq/macs2.py @@ -7,6 +7,8 @@ from bcbio.pipeline import config_utils from bcbio.pipeline import datadict as dd from bcbio import bam +from bcbio.chipseq import antibodies +from bcbio.log import logger def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data): @@ -22,12 +24,19 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, dat _compress_bdg_files(out_dir) return _get_output_files(out_dir) macs2 = config_utils.get_program("macs2", config) + antibody = antibodies.ANTIBODIES.get(dd.get_chipseq_antibody(data).lower(), None) + if antibody: + logger.info(f"{antibody.name} specified, using {antibody.peaktype} peak settings.") + peaksettings = select_peak_parameters(antibody) + else: + peaksettings = "" options = " ".join(resources.get("macs2", {}).get("options", "")) genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data)) genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size paired = "-f BAMPE" if bam.is_paired(chip_bam) else "" with utils.chdir(out_dir): cmd = _macs2_cmd(method) + cmd += peaksettings try: do.run(cmd.format(**locals()), "macs2 for %s" % name) utils.move_safe(macs2_file, out_file) @@ -70,3 +79,11 @@ def _macs2_cmd(method="chip"): else: raise ValueError("chip_method should be chip or atac.") return cmd + +def select_peak_parameters(antibody): + if antibody.peaktype == "broad": + return " --broad --broad-cutoff 0.05" + elif antibody.peaktype == "narrow": + return "" + else: + raise ValueError(f"{antibody.peaktype} not recognized.") diff --git a/bcbio/pipeline/datadict.py b/bcbio/pipeline/datadict.py index 20d197b75..208f264eb 100644 --- a/bcbio/pipeline/datadict.py +++ b/bcbio/pipeline/datadict.py @@ -208,6 +208,8 @@ "disc_bam": {"keys": ["work_bam_plus", "disc"]}, "sr_bam": {"keys": ["work_bam_plus", "sr"]}, "peddy_report": {"keys": ["peddy_report"]}, + "chipseq_antibody": {"keys": ["config", "algorithm", "chipseq", "antibody"]}, + "peaktype": {"keys": ["config", "algorithm", "chipseq", "peaktype"]}, "tools_off": {"keys": ["config", "algorithm", "tools_off"], "default": [], "always_list": True}, "tools_on": {"keys": ["config", "algorithm", "tools_on"], "default": [], "always_list": True}, "cwl_reporting": {"keys": ["config", "algorithm", "cwl_reporting"]},