references.bib

% {GWAS} is braced so sentence-casing styles keep the acronym upper-case.
% The publisher's city is kept in `address`, not fused into `publisher`.
@article{power2017microbial,
  author    = {Power, Robert A and Parkhill, Julian and De Oliveira, Tulio},
  title     = {Microbial genome-wide association studies: lessons from human {GWAS}},
  journal   = {Nature Reviews Genetics},
  volume    = {18},
  number    = {1},
  pages     = {41--50},
  year      = {2017},
  publisher = {Nature Publishing Group},
  address   = {London, UK}
}
% `pages` holds the article number (this journal has no page ranges).
% The publisher's city is kept in `address`, not fused into `publisher`.
@article{uffelmann2021genome,
  author    = {Uffelmann, Emil and Huang, Qin Qin and Munung, Nchangwi Syntia and De Vries, Jantina and Okada, Yukinori and Martin, Alicia R and Martin, Hilary C and Lappalainen, Tuuli and Posthuma, Danielle},
  title     = {Genome-wide association studies},
  journal   = {Nature Reviews Methods Primers},
  volume    = {1},
  number    = {1},
  pages     = {59},
  year      = {2021},
  publisher = {Nature Publishing Group},
  address   = {London, UK}
}
% Journal name is stored in its full, capitalised form; styles do not re-case `journal`.
@article{van2020bioinformatics,
  author    = {Van Camp, Pieter-Jan and Haslam, David B and Porollo, Aleksey},
  title     = {Bioinformatics approaches to the understanding of molecular mechanisms in antimicrobial resistance},
  journal   = {International Journal of Molecular Sciences},
  volume    = {21},
  number    = {4},
  pages     = {1363},
  year      = {2020},
  publisher = {MDPI}
}

% Journal article; fields ordered author, title, venue, then locators.
@article{ren2022prediction,
  author    = {Ren, Yunxiao and Chakraborty, Trinad and Doijad, Swapnil and Falgenhauer, Linda and Falgenhauer, Jane and Goesmann, Alexander and Hauschild, Anne-Christin and Schwengers, Oliver and Heider, Dominik},
  title     = {Prediction of antimicrobial resistance based on whole-genome sequencing and machine learning},
  journal   = {Bioinformatics},
  publisher = {Oxford University Press},
  year      = {2022},
  volume    = {38},
  number    = {2},
  pages     = {325--334}
}

% `pages` holds the electronic article identifier.
% Publisher location is split out of `publisher` into `address`.
@article{hyun2020machine,
  author    = {Hyun, Jason C and Kavvas, Erol S and Monk, Jonathan M and Palsson, Bernhard O},
  title     = {Machine learning with random subspace ensembles identifies antimicrobial resistance determinants from pan-genomes of three pathogens},
  journal   = {PLoS Computational Biology},
  volume    = {16},
  number    = {3},
  pages     = {e1007608},
  year      = {2020},
  publisher = {Public Library of Science},
  address   = {San Francisco, CA, USA}
}

% PATRIC database
% {PATRIC} is braced: without it, sentence-casing styles render the acronym as "Patric".
@article{wattam2014patric,
  author    = {Wattam, Alice R and Abraham, David and Dalay, Oral and Disz, Terry L and Driscoll, Timothy and Gabbard, Joseph L and Gillespie, Joseph J and Gough, Roger and Hix, Deborah and Kenyon, Ronald and others},
  title     = {{PATRIC}, the bacterial bioinformatics database and analysis resource},
  journal   = {Nucleic Acids Research},
  volume    = {42},
  number    = {D1},
  pages     = {D581--D591},
  year      = {2014},
  publisher = {Oxford University Press}
}

% CARD database
% The acronym and the database's proper name are braced so sentence-casing styles do not lowercase them.
@article{alcock2023card,
  author    = {Alcock, Brian P and Huynh, William and Chalil, Romeo and Smith, Keaton W and Raphenya, Amogelang R and Wlodarski, Mateusz A and Edalatmand, Arman and Petkau, Aaron and Syed, Sohaib A and Tsang, Kara K and others},
  title     = {{CARD} 2023: expanded curation, support for machine learning, and resistome prediction at the {Comprehensive Antibiotic Resistance Database}},
  journal   = {Nucleic Acids Research},
  volume    = {51},
  number    = {D1},
  pages     = {D690--D699},
  year      = {2023},
  publisher = {Oxford University Press}
}

% Chapter of a single-author book, so it is typed as a book chapter with the
% book title in `booktitle` (it was previously in `journal`). The author was
% listed twice; only one occurrence is kept.
@incollection{kramer2016scikit,
  author    = {Kramer, Oliver},
  title     = {Scikit-learn},
  booktitle = {Machine Learning for Evolution Strategies},
  pages     = {45--53},
  year      = {2016},
  publisher = {Springer}
}

% Conference paper (EMNLP-CoNLL 2007). The citation key is kept for
% compatibility with existing citations even though Rosenberg is first author.
% NOTE(review): venue, pages and author order come from the published
% proceedings record -- confirm against the ACL Anthology entry.
@inproceedings{hirschberg2007v,
  author    = {Rosenberg, Andrew and Hirschberg, Julia Bell},
  title     = {{V-Measure}: a conditional entropy-based external cluster evaluation measure},
  booktitle = {Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning ({EMNLP-CoNLL})},
  pages     = {410--420},
  year      = {2007},
  publisher = {Association for Computational Linguistics}
}


% Publisher name written without the stray space ("Lulu. com").
@book{molnar2020interpretable,
  author    = {Molnar, Christoph},
  title     = {Interpretable machine learning},
  year      = {2020},
  publisher = {Lulu.com}
}


% Conference paper from the SciPy 2010 proceedings, not a journal article.
% The previous journal={SciPy}, volume={7}, pages={1} were export artefacts.
% NOTE(review): pages and DOI are taken from the proceedings record -- confirm.
@inproceedings{seabold2010statsmodels,
  author    = {Seabold, Skipper and Perktold, Josef},
  title     = {Statsmodels: econometric and statistical modeling with {Python}},
  booktitle = {Proceedings of the 9th {Python} in Science Conference},
  pages     = {92--96},
  year      = {2010},
  doi       = {10.25080/Majora-92bf1922-011}
}


% Proceedings title is stored capitalised, with the {AAAI} acronym protected.
@inproceedings{bastian2009gephi,
  author    = {Bastian, Mathieu and Heymann, Sebastien and Jacomy, Mathieu},
  title     = {{Gephi}: an open source software for exploring and manipulating networks},
  booktitle = {Proceedings of the International {AAAI} Conference on Web and Social Media},
  volume    = {3},
  number    = {1},
  pages     = {361--362},
  year      = {2009}
}

% Third author's surname is "Schult"; the export had split it into "S Chult".
% {NetworkX} is braced to keep its camel-case under sentence-casing styles.
% Institution location moved to `address`.
@techreport{hagberg2008exploring,
  author      = {Hagberg, Aric and Swart, Pieter and Schult, Daniel A.},
  title       = {Exploring network structure, dynamics, and function using {NetworkX}},
  year        = {2008},
  institution = {Los Alamos National Laboratory (LANL)},
  address     = {Los Alamos, NM, United States}
}


% CD-HIT
% {CD-HIT} is braced: sentence-casing styles would otherwise render it "Cd-hit".
@article{fu2012cd,
  author    = {Fu, Limin and Niu, Beifang and Zhu, Zhengwei and Wu, Sitao and Li, Weizhong},
  title     = {{CD-HIT}: accelerated for clustering the next-generation sequencing data},
  journal   = {Bioinformatics},
  volume    = {28},
  number    = {23},
  pages     = {3150--3152},
  year      = {2012},
  publisher = {Oxford University Press}
}
% Journal article; fields ordered author, title, venue, then locators.
@article{li2006cd,
  author    = {Li, Weizhong and Godzik, Adam},
  title     = {Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences},
  journal   = {Bioinformatics},
  publisher = {Oxford University Press},
  year      = {2006},
  volume    = {22},
  number    = {13},
  pages     = {1658--1659}
}

% Author is given in "Last, First" form with accents as BibTeX special characters.
% The empty isbn field (which triggers an "empty field" warning) is filled in.
% NOTE(review): ISBN is for the 2016 hardback -- confirm against the copy cited.
@book{netSciBible,
   author =    {Barab{\'a}si, Albert-L{\'a}szl{\'o}},
   title =     {Network Science},
   publisher = {Cambridge University Press},
   isbn =      {978-1-107-07626-6},
   year =      {2016}
}

% The publisher's city is kept in `address`, not fused into `publisher`.
@article{libbrecht2015machine,
  author    = {Libbrecht, Maxwell W and Noble, William Stafford},
  title     = {Machine learning applications in genetics and genomics},
  journal   = {Nature Reviews Genetics},
  volume    = {16},
  number    = {6},
  pages     = {321--332},
  year      = {2015},
  publisher = {Nature Publishing Group},
  address   = {London, UK}
}


% The genus name is braced so sentence-casing styles keep its capital letter.
% The publisher's city is kept in `address`, not fused into `publisher`.
@article{schubert2019genome,
  author    = {Schubert, Benjamin and Maddamsetti, Rohan and Nyman, Jackson and Farhat, Maha R and Marks, Debora S},
  title     = {Genome-wide discovery of epistatic loci affecting antibiotic resistance in {Neisseria} gonorrhoeae using evolutionary couplings},
  journal   = {Nature Microbiology},
  volume    = {4},
  number    = {2},
  pages     = {328--338},
  year      = {2019},
  publisher = {Nature Publishing Group},
  address   = {London, UK}
}


% {GWAS} is braced so sentence-casing styles keep the acronym upper-case.
% `pages` holds the article number.
@article{mosquera2023genome,
  author    = {Mosquera-Rend{\'o}n, Jeanneth and Moreno-Herrera, Claudia Ximena and Robledo, Jaime and Hurtado-P{\'a}ez, Uriel},
  title     = {Genome-Wide Association Studies ({GWAS}) Approaches for the Detection of Genetic Variants Associated with Antibiotic Resistance: A Systematic Review},
  journal   = {Microorganisms},
  volume    = {11},
  number    = {12},
  pages     = {2866},
  year      = {2023},
  publisher = {MDPI}
}


% The previous pages={10--1128} was the DOI prefix (10.1128) mis-read as a page
% range. This article is identified by an e-locator; the hyphen in it is part
% of the identifier, not a range.
% NOTE(review): e-locator and DOI come from the journal record -- confirm.
@article{su2019genome,
  author    = {Su, Michelle and Satola, Sarah W and Read, Timothy D},
  title     = {Genome-based prediction of bacterial antibiotic resistance},
  journal   = {Journal of Clinical Microbiology},
  volume    = {57},
  number    = {3},
  pages     = {e01405-18},
  year      = {2019},
  publisher = {American Society for Microbiology},
  doi       = {10.1128/JCM.01405-18}
}


% Resource acronyms are braced so sentence-casing styles keep their exact capitalisation.
@article{olson2023introducing,
  author    = {Olson, Robert D and Assaf, Rida and Brettin, Thomas and Conrad, Neal and Cucinell, Clark and Davis, James J and Dempsey, Donald M and Dickerman, Allan and Dietrich, Emily M and Kenyon, Ronald W and others},
  title     = {Introducing the bacterial and viral bioinformatics resource center ({BV-BRC}): a resource combining {PATRIC}, {IRD} and {ViPR}},
  journal   = {Nucleic Acids Research},
  volume    = {51},
  number    = {D1},
  pages     = {D678--D689},
  year      = {2023},
  publisher = {Oxford University Press}
}


% bioRxiv preprint. The previous pages={2024--01} was the date part of the
% preprint identifier mis-read as a page range, so it is dropped.
% NOTE(review): add the preprint DOI once verified.
@article{wiatrak2024sequence,
  author    = {Wiatrak, Maciej and Weimann, Aaron and Dinan, Adam M and Brbi{\'c}, Maria and Floto, R Andres},
  title     = {Sequence-based modelling of bacterial genomes enables accurate antibiotic resistance prediction},
  journal   = {bioRxiv},
  year      = {2024},
  publisher = {Cold Spring Harbor Laboratory},
  note      = {Preprint}
}


% Acronyms and the genus name are braced so sentence-casing styles do not lowercase them.
@article{eyre2017wgs,
  author    = {Eyre, David W and De Silva, Dilrini and Cole, Kevin and Peters, Joanna and Cole, Michelle J and Grad, Yonatan H and Demczuk, Walter and Martin, Irene and Mulvey, Michael R and Crook, Derrick W and others},
  title     = {{WGS} to predict antibiotic {MICs} for {Neisseria} gonorrhoeae},
  journal   = {Journal of Antimicrobial Chemotherapy},
  volume    = {72},
  number    = {7},
  pages     = {1937--1947},
  year      = {2017},
  publisher = {Oxford University Press}
}


% The DOI previously pointed at a supplementary spreadsheet; it now names the
% article itself. `issue` is renamed to the standard `number` field, `month`
% uses the built-in macro, and the compound surname "Do Valle" is given in
% comma form so it is not split.
@article{Gysi2021,
   abstract = {The COVID-19 pandemic has highlighted the need to quickly and reliably prioritize clinically approved compounds for their potential effectiveness for severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) infections. Here, we deployed algorithms relying on artificial intelligence, network diffusion, and network proximity, tasking each of them to rank 6,340 drugs for their expected efficacy against SARS-CoV-2. To test the predictions, we used as ground truth 918 drugs experimentally screened in VeroE6 cells, as well as the list of drugs in clinical trials that capture the medical community's assessment of drugs with potential COVID-19 efficacy. We find that no single predictive algorithm offers consistently reliable outcomes across all datasets and metrics. This outcome prompted us to develop a multimodal technology that fuses the predictions of all algorithms, finding that a consensus among the different predictive methods consistently exceeds the performance of the best individual pipelines. We screened in human cells the top-ranked drugs, obtaining a 62% success rate, in contrast to the 0.8% hit rate of nonguided screenings. Of the six drugs that reduced viral infection, four could be directly repurposed to treat COVID-19, proposing novel treatments for COVID-19. We also found that 76 of the 77 drugs that successfully reduced viral infection do not bind the proteins targeted by SARS-CoV-2, indicating that these network drugs rely on network-based mechanisms that cannot be identified using docking-based strategies. These advances offer a methodological pathway to identify repurposable drugs for future pathogens and neglected diseases underserved by the costs and extended timeline of de novo drug development.},
   author = {Deisy Morselli Gysi and Do Valle, {\'I}talo and Marinka Zitnik and Asher Ameli and Xiao Gan and Onur Varol and Susan Dina Ghiassian and J. J. Patten and Robert A. Davey and Joseph Loscalzo and Barab{\'a}si, Albert-L{\'a}szl{\'o}},
   doi = {10.1073/pnas.2025581118},
   issn = {10916490},
   number = {19},
   journal = {Proceedings of the National Academy of Sciences of the United States of America},
   keywords = {Drug repurposing,Infectious diseases,Network medicine,Systems biology},
   month = may,
   pages = {e2025581118},
   pmid = {33906951},
   publisher = {National Academy of Sciences},
   title = {Network medicine framework for identifying drug-repurposing opportunities for {COVID-19}},
   volume = {118},
   url = {https://www.pnas.org/doi/abs/10.1073/pnas.2025581118},
   year = {2021},
}
% Journal name no longer carries the "2019 20:6" year/volume residue.
% Page range uses "--", `issue` is renamed to `number`, `month` uses the macro,
% and the keyword "Next-generation sequencing" is rejoined.
@article{Boolchandani2019,
   abstract = {Antimicrobial resistance extracts high morbidity, mortality and economic costs yearly by rendering bacteria immune to antibiotics. Identifying and understanding antimicrobial resistance are imperative for clinical practice to treat resistant infections and for public health efforts to limit the spread of resistance. Technologies such as next-generation sequencing are expanding our abilities to detect and study antimicrobial resistance. This Review provides a detailed overview of antimicrobial resistance identification and characterization methods, from traditional antimicrobial susceptibility testing to recent deep-learning methods. We focus on sequencing-based resistance discovery and discuss tools and databases used in antimicrobial resistance studies. Next-generation sequencing has improved the identification and characterization of antimicrobial resistance. Focusing on sequence-based discovery of antibiotic resistance genes, this Review discusses computational strategies and resources for resistance gene identification in genomic and metagenomic samples, including recent deep-learning approaches.},
   author = {Manish Boolchandani and D'Souza, Alaric W. and Gautam Dantas},
   doi = {10.1038/s41576-019-0108-4},
   issn = {1471-0064},
   number = {6},
   journal = {Nature Reviews Genetics},
   keywords = {Antimicrobial resistance,Infectious diseases,Metagenomics,Microbial genetics,Next-generation sequencing},
   month = mar,
   pages = {356--370},
   pmid = {30886350},
   publisher = {Nature Publishing Group},
   title = {Sequencing-based methods and resources to study antimicrobial resistance},
   volume = {20},
   url = {https://www.nature.com/articles/s41576-019-0108-4},
   year = {2019},
}
% Repairs the garbled given name "Lu{\'\i}za", spaces glued initials, and puts
% the compound surname "Gibertoni Cruz" in comma form.
% `publisher` previously named the indexing service rather than the publisher.
% NOTE(review): this is a consortium paper and the exported author list looks
% truncated, hence "and others" -- confirm against the journal record.
@article{Crook2022,
   abstract = {The Comprehensive Resistance Prediction for Tuberculosis: an International Consortium (CRyPTIC) presents here a data compendium of 12,289 Mycobacterium tuberculosis global clinical isolates, all of which have undergone whole-genome sequencing and have had their minimum inhibitory concentrations to 13 antitubercular drugs measured in a single assay. It is the largest matched phenotypic and genotypic dataset for M. tuberculosis to date. Here, we provide a summary detailing the breadth of data collected, along with a description of how the isolates were selected, collected, and uniformly processed in CRyPTIC partner laboratories across 23 countries. The compendium contains 6,814 isolates resistant to at least 1 drug, including 2,129 samples that fully satisfy the clinical definitions of rifampicin resistant (RR), multidrug resistant (MDR), pre-extensively drug resistant (pre-XDR), or extensively drug resistant (XDR). The data are enriched for rare resistance-associated variants, and the current limits of genotypic prediction of resistance status (sensitive/resistant) are presented by using a genetic mutation catalogue, along with the presence of suspected resistance-conferring mutations for isolates resistant to the newly introduced drugs bedaquiline, clofazimine, delamanid, and linezolid. Finally, a case study of rifampicin monoresistance demonstrates how this compendium could be used to advance our genetic understanding of rare resistance phenotypes. The data compendium is fully open source and it is hoped that it will facilitate and inspire future research for years to come.},
   author = {Derrick W. Crook and Timothy E. A. Peto and Sarah J. Hoosdally and Gibertoni Cruz, Ana Lu{\'\i}za and A. Sarah Walker and Timothy M. Walker and Philip W. Fowler and Zamin Iqbal and Daniela Maria Cirillo and Alice Brankin and Kerri M. Malone and Martin Hunt and Jeff Knaggs and Nerges Mistry and Camilla Rodrigues and David Moore and Nazir Ahmed Ismail and Stefan Niemann and Aysha Roohi and Brice Letcher and Paola M. V. Rancoita and Emanuele Borroni and Clara Grazian and others},
   doi = {10.1371/journal.pbio.3001721},
   issn = {15457885},
   number = {8},
   journal = {PLoS Biology},
   month = aug,
   pages = {e3001721},
   pmid = {35944069},
   publisher = {Public Library of Science},
   title = {A data compendium associating the genomes of 12,289 {Mycobacterium} tuberculosis isolates with quantitative resistance phenotypes to 13 antibiotics},
   volume = {20},
   year = {2022},
}
% This entry had an empty citation key, so it could not be cited; it is now
% keyed SanchezLengeling2021. The exported abstract described character-level
% neural machine translation and did not belong to this article, so it is
% dropped rather than kept as misleading metadata.
@article{SanchezLengeling2021,
   author = {Benjamin Sanchez-Lengeling and Emily Reif and Adam Pearce and Alexander B. Wiltschko},
   doi = {10.23915/distill.00033},
   issn = {2476-0757},
   number = {9},
   journal = {Distill},
   month = sep,
   pages = {e33},
   publisher = {Distill Working Group},
   title = {A Gentle Introduction to Graph Neural Networks},
   volume = {6},
   url = {https://distill.pub/2021/gnn-intro},
   year = {2021},
}
% arXiv preprint: typed as a generic entry (an article entry requires a
% journal) with the arXiv identifier in the eprint fields. The identifier and
% URL are derived from the URL already present, without the version suffix.
@misc{Fey2019,
   abstract = {We introduce PyTorch Geometric, a library for deep learning on irregularly structured input data such as graphs, point clouds and manifolds, built upon PyTorch. In addition to general graph data structures and processing methods, it contains a variety of recently published methods from the domains of relational learning and 3D data processing. PyTorch Geometric achieves high data throughput by leveraging sparse GPU acceleration, by providing dedicated CUDA kernels and by introducing efficient mini-batch handling for input examples of different size. In this work, we present the library in detail and perform a comprehensive comparative study of the implemented methods in homogeneous evaluation scenarios.},
   author = {Matthias Fey and Jan Eric Lenssen},
   month = mar,
   title = {Fast Graph Representation Learning with {PyTorch Geometric}},
   eprint = {1903.02428},
   archiveprefix = {arXiv},
   url = {https://arxiv.org/abs/1903.02428},
   year = {2019},
}
% The DOI no longer carries the "/FIGURES/3" suffix, and `url` holds a single
% URL (the field previously held two, separated by a space).
% `issue` is renamed to `number`, `month` uses the macro, page range uses "--".
% NOTE(review): month and pages are as exported -- confirm against the journal record.
@article{Hunt2022,
   abstract = {There are many short-read variant-calling tools, with different strengths and weaknesses. We present a tool, Minos, which combines outputs from arbitrary variant callers, increasing recall without loss of precision. We benchmark on 62 samples from three bacterial species and an outbreak of 385 Mycobacterium tuberculosis samples. Minos also enables joint genotyping; we demonstrate on a large (N=13k) M. tuberculosis cohort, building a map of non-synonymous SNPs and indels in a region where all such variants are assumed to cause rifampicin resistance. We quantify the correlation with phenotypic resistance and then replicate in a second cohort (N=10k).},
   author = {Martin Hunt and Brice Letcher and Kerri M. Malone and Giang Nguyen and Michael B. Hall and Rachel M. Colquhoun and Leandro Lima and Michael C. Schatz and Srividya Ramakrishnan and Zamin Iqbal},
   doi = {10.1186/s13059-022-02714-x},
   issn = {1474760X},
   number = {1},
   journal = {Genome Biology},
   keywords = {Animal Genetics and Genomics,Bioinformatics,Evolutionary Biology,Human Genetics,Microbial Genetics and Genomics,Plant Genetics and Genomics},
   month = dec,
   pages = {1--23},
   pmid = {35791022},
   publisher = {BioMed Central Ltd},
   title = {{Minos}: variant adjudication and joint genotyping of cohorts of bacterial genomes},
   volume = {23},
   url = {https://link.springer.com/article/10.1186/s13059-022-02714-x},
   year = {2022},
}
% This entry had an empty citation key, so it could not be cited; it is now
% keyed VanCamp2020. The DOI no longer carries the "/BIBTEX" suffix, and the
% surname "Van Camp" is in comma form so it is not parsed as "Camp".
% NOTE(review): pages now holds the article number taken from the DOI
% (the exported value 530987 was an internal id) -- confirm.
@article{VanCamp2020,
   abstract = {Background: Early detection of antimicrobial resistance in pathogens and prescription of more effective antibiotics is a fast-emerging need in clinical practice. High-throughput sequencing technology, such as whole genome sequencing (WGS), may have the capacity to rapidly guide the clinical decision-making process. The prediction of antimicrobial resistance in Gram-negative bacteria, often the cause of serious systemic infections, is more challenging as genotype-to-phenotype (drug resistance) relationship is more complex than for most Gram-positive organisms. Methods and Findings: We have used NCBI BioSample database to train and cross-validate eight XGBoost-based machine learning models to predict drug resistance to cefepime, cefotaxime, ceftriaxone, ciprofloxacin, gentamicin, levofloxacin, meropenem, and tobramycin tested in Acinetobacter baumannii, Escherichia coli, Enterobacter cloacae, Klebsiella aerogenes, and Klebsiella pneumoniae. The input is the WGS data in terms of the coverage of known antibiotic resistance genes by shotgun sequencing reads. Models demonstrate high performance and robustness to class imbalanced datasets. Conclusion: Whole genome sequencing enables the prediction of antimicrobial resistance in Gram-negative bacteria. We present a tool that provides an in silico antibiogram for eight drugs. Predictions are accompanied with a reliability index that may further facilitate the decision making process. The demo version of the tool with pre-processed samples is available at https://vancampn.shinyapps.io/wgs2amr/. The stand-alone version of the predictor is available at https://github.com/pieterjanvc/wgs2amr/.},
   author = {Van Camp, Pieter-Jan and David B. Haslam and Aleksey Porollo},
   doi = {10.3389/fmicb.2020.01013},
   issn = {1664302X},
   journal = {Frontiers in Microbiology},
   keywords = {antibiotic resistance,antimicrobial resistance,genotype-phenotype relationship,machine learning,prediction,whole-genome sequencing},
   month = may,
   pages = {1013},
   publisher = {Frontiers Media S.A.},
   title = {Prediction of Antimicrobial Resistance in {Gram-Negative} Bacteria From Whole-Genome Sequencing Data},
   volume = {11},
   year = {2020},
}
% Page range uses "--", `month` uses the built-in macro, and the DOI is in its
% usual lowercase form (DOIs are case-insensitive).
@article{Zhou2020,
   abstract = {Lots of learning tasks require dealing with graph data which contains rich relation information among elements. Modeling physics systems, learning molecular fingerprints, predicting protein interface, and classifying diseases demand a model to learn from graph inputs. In other domains such as learning from non-structural data like texts and images, reasoning on extracted structures (like the dependency trees of sentences and the scene graphs of images) is an important research topic which also needs graph reasoning models. Graph neural networks (GNNs) are neural models that capture the dependence of graphs via message passing between the nodes of graphs. In recent years, variants of GNNs such as graph convolutional network (GCN), graph attention network (GAT), graph recurrent network (GRN) have demonstrated ground-breaking performances on many deep learning tasks. In this survey, we propose a general design pipeline for GNN models and discuss the variants of each component, systematically categorize the applications, and propose four open problems for future research.},
   author = {Jie Zhou and Ganqu Cui and Shengding Hu and Zhengyan Zhang and Cheng Yang and Zhiyuan Liu and Lifeng Wang and Changcheng Li and Maosong Sun},
   doi = {10.1016/j.aiopen.2021.01.001},
   issn = {2666-6510},
   journal = {AI Open},
   keywords = {Deep learning,Graph neural network},
   month = jan,
   pages = {57--81},
   publisher = {Elsevier},
   title = {Graph neural networks: A review of methods and applications},
   volume = {1},
   year = {2020},
}
% This entry had an empty citation key, so it could not be cited; it is now
% keyed Velickovic2018. The garbled first author and "Li{\`o}" are repaired,
% the arXiv identifier moves from `isbn` to the eprint fields, and the
% all-caps title is stored in normal title case.
% NOTE(review): venue and year (ICLR 2018) are not in the export -- confirm.
@inproceedings{Velickovic2018,
   abstract = {We present graph attention networks (GATs), novel neural network architectures that operate on graph-structured data, leveraging masked self-attentional layers to address the shortcomings of prior methods based on graph convolutions or their approximations. By stacking layers in which nodes are able to attend over their neighborhoods' features, we enable (implicitly) specifying different weights to different nodes in a neighborhood, without requiring any kind of costly matrix operation (such as inversion) or depending on knowing the graph structure upfront. In this way, we address several key challenges of spectral-based graph neural networks simultaneously, and make our model readily applicable to inductive as well as transductive problems. Our GAT models have achieved or matched state-of-the-art results across four established transductive and inductive graph benchmarks: the Cora, Citeseer and Pubmed citation network datasets, as well as a protein-protein interaction dataset (wherein test graphs remain unseen during training).},
   author = {Petar Veli{\v{c}}kovi{\'c} and Guillem Cucurull and Arantxa Casanova and Adriana Romero and Pietro Li{\`o} and Yoshua Bengio},
   booktitle = {International Conference on Learning Representations ({ICLR})},
   eprint = {1710.10903},
   archiveprefix = {arXiv},
   title = {Graph Attention Networks},
   url = {https://arxiv.org/abs/1710.10903},
   year = {2018},
}
% This entry had an empty citation key and lacked the required journal and
% year fields. It describes the same work as the entry keyed ren2022prediction
% earlier in this file; the venue data below is copied from that entry.
% Prefer citing one key consistently. The `url` points to the paper's source
% code repository. PDF hyphenation artefacts in the abstract are removed.
@article{Ren2022,
   abstract = {Motivation: Antimicrobial resistance (AMR) is one of the biggest global problems threatening human and animal health. Rapid and accurate AMR diagnostic methods are thus very urgently needed. However, traditional antimicrobial susceptibility testing (AST) is time-consuming, low throughput and viable only for cultivable bacteria. Machine learning methods may pave the way for automated AMR prediction based on genomic data of the bacteria. However, comparing different machine learning methods for the prediction of AMR based on different encodings and whole-genome sequencing data without previously known knowledge remains to be done. Results: In this study, we evaluated logistic regression (LR), support vector machine (SVM), random forest (RF) and convolutional neural network (CNN) for the prediction of AMR for the antibiotics ciprofloxacin, cefotaxime, ceftazidime and gentamicin. We could demonstrate that these models can effectively predict AMR with label encoding, one-hot encoding and frequency matrix chaos game representation (FCGR encoding) on whole-genome sequencing data. We trained these models on a large AMR dataset and evaluated them on an independent public dataset. Generally, RFs and CNNs perform better than LR and SVM with AUCs up to 0.96. Furthermore, we were able to identify mutations that are associated with AMR for each antibiotic. Availability and implementation: Source code in data preparation and model training are provided at GitHub website (https://github.com/YunxiaoRen/ML-iAMR).},
   author = {Yunxiao Ren and Trinad Chakraborty and Swapnil Doijad and Linda Falgenhauer and Jane Falgenhauer and Alexander Goesmann and Anne-Christin Hauschild and Oliver Schwengers and Dominik Heider},
   doi = {10.1093/bioinformatics/btab681},
   journal = {Bioinformatics},
   number = {2},
   pages = {325--334},
   publisher = {Oxford University Press},
   title = {Prediction of antimicrobial resistance based on whole-genome sequencing and machine learning},
   volume = {38},
   url = {https://github.com/YunxiaoRen/ML-iAMR},
   year = {2022},
}
% NOTE(review): the exported author list appears to have been scraped from the
% article's front page: the first two names look like the handling editor and
% a reviewer, and the corresponding author was pulled ahead of the first
% author. The list below follows the order in the publication record --
% confirm against the DOI before relying on it. The citation key is unchanged
% so existing citations keep working.
% The journal name and URL no longer contain the page-footer residue
% ("| www.frontiersin.org"); the URL now resolves the DOI.
@article{Maruyama2020,
   abstract = {Antimicrobial resistance (AMR) in the nosocomial pathogen, Acinetobacter baumannii, is becoming a serious public health threat. While some mechanisms of AMR have been reported, understanding novel mechanisms of resistance is critical for identifying emerging resistance. One of the first steps in identifying novel AMR mechanisms is performing genotype/phenotype association studies; however, performing these studies is complicated by the plastic nature of the A. baumannii pan-genome. In this study, we compared the antibiograms of 12 antimicrobials associated with multiple drug families for 84 A. baumannii isolates, many isolated in Arizona, USA. in silico screening of these genomes for known AMR mechanisms failed to identify clear correlations for most drugs. We then performed a bacterial genome wide association study (bGWAS) looking for associations between all possible 21-mers; this approach generally failed to identify mechanisms that explained the resistance phenotype. In order to decrease the genomic noise associated with population stratification, we compared four phylogenetically-related pairs of isolates with differing susceptibility profiles. RNA-Sequencing (RNA-Seq) was performed on paired isolates and differentially-expressed genes were identified. In these isolate pairs, five different potential mechanisms were identified, highlighting the difficulty of broad AMR surveillance in this species. To verify and validate differential expression, amplicon sequencing was performed. These results suggest that a diagnostic platform based on gene expression rather than genomics alone may be beneficial in certain surveillance efforts. The implementation of such advanced diagnostics coupled with increased AMR surveillance will potentially improve A. baumannii infection treatment and patient outcomes.},
   author = {Chandler Roe and Charles H D Williamson and Adam J Vazquez and Kristen Kyger and Michael Valentine and Jolene R Bowers and Paul D Phillips and Veronica Harrison and Elizabeth Driebe and David M Engelthaler and Jason W Sahl},
   doi = {10.3389/fpubh.2020.00451},
   journal = {Frontiers in Public Health},
   keywords = {AMR,acinetobacter,bioinformatics,genomics,transcriptomics},
   pages = {451},
   title = {Bacterial Genome Wide Association Studies ({bGWAS}) and Transcriptomics Identifies Cryptic Antimicrobial Resistance Mechanisms in {Acinetobacter} baumannii},
   volume = {8},
   url = {https://doi.org/10.3389/fpubh.2020.00451},
   year = {2020},
}
% Authors written with trailing initials ("Prashantha CN", "George Priya Doss C")
% are put in comma form so the initials are not parsed as the surname.
% The genus name is braced against sentence-casing, `month` uses the macro, and
% the DOI is in its usual lowercase form.
@article{Dey2022,
   abstract = {Antimicrobial resistance (AMR) among microorganisms has become one of the worldwide concerns of this century and continues to challenge us. To properly understand this problem, it is essential to know the genes that cause AMR and their resistance mechanisms. Our present study focused on Klebsiella pneumoniae, which possesses AMR genes conferring resistance against multiple antibiotics. A gene interaction network of 42 functional partners was constructed and analyzed to broaden our understanding. Three closely related clusters (C1–C3) having an association with multi-drug resistance mechanisms were identified by clustering analysis. The enrichment analysis illustrated 30 genes in biological processes, 24 genes in molecular function, and 25 genes in cellular components having a significant role. The analysis of the gene interaction network revealed genes birA2, folP, pabC, folA, gyrB, glmM, gyrA, thyA_2 had maximum no. of interactions with their functional partners viz. 26, 25, 25, 24, 23, 23, 23, 23 respectively and can be considered as hub genes. Analyzing the enriched pathways and Gene Ontologies provides insight into AMR's molecular basis. In addition, the proposed study could aid the researchers in developing new treatment options to combat multi-drug resistant K. pneumoniae.},
   author = {Hrituraj Dey and Karthick Vasudevan and K. R. Dasegowda and Majji Rambabu and Prashantha, C. N. and {George Priya Doss}, C.},
   doi = {10.1016/j.micpath.2022.105878},
   issn = {0882-4010},
   journal = {Microbial Pathogenesis},
   keywords = {AMR,Cytoscape,Gene interaction network,Klebsiella pneumoniae,Multi-drug resistant (MDR),birA2,folP},
   month = dec,
   pages = {105878},
   pmid = {36372206},
   publisher = {Academic Press},
   title = {An integrated gene network analysis to decode the multi-drug resistance mechanism in {Klebsiella} pneumoniae},
   volume = {173},
   year = {2022},
}
% Her and Wu (2018), Bioinformatics 34(13):i89--i95 -- pan-genome-based machine learning for AMR prediction in E. coli.
% The species name is brace-protected in place of the HTML tags the export left in the title.
@article{Her2018,
   abstract = {Motivation: Antimicrobial resistance (AMR) is becoming a huge problem in both developed and developing countries, and identifying strains resistant or susceptible to certain antibiotics is essential in fighting against antibiotic-resistant pathogens. Whole-genome sequences have been collected for different microbial strains in order to identify crucial characteristics that allow certain strains to become resistant to antibiotics; however, a global inspection of the gene content responsible for AMR activities remains to be done. Results: We propose a pan-genome-based approach to characterize antibiotic-resistant microbial strains and test this approach on the bacterial model organism Escherichia coli. By identifying core and accessory gene clusters and predicting AMR genes for the E. coli pan-genome, we not only showed that certain classes of genes are unevenly distributed between the core and accessory parts of the pan-genome but also demonstrated that only a portion of the identified AMR genes belong to the accessory genome. Application of machine learning algorithms to predict whether specific strains were resistant to antibiotic drugs yielded the best prediction accuracy for the set of AMR genes within the accessory part of the pan-genome, suggesting that these gene clusters were most crucial to AMR activities in E. coli. Selecting subsets of AMR genes for different antibiotic drugs based on a genetic algorithm (GA) achieved better prediction performances than the gene sets established in the literature, hinting that the gene sets selected by the GA may warrant further analysis in investigating more details about how E. coli fight against antibiotics.},
   author = {Her, Hsuan-Lin and Wu, Yu-Wei},
   doi = {10.1093/bioinformatics/bty276},
   issn = {1367-4803},
   journal = {Bioinformatics},
   month = jul,
   number = {13},
   pages = {i89--i95},
   title = {A pan-genome-based machine learning approach for predicting antimicrobial resistance activities of the {Escherichia coli} strains},
   volume = {34},
   url = {https://academic.oup.com/bioinformatics/article/34/13/i89/5045729},
   year = {2018},
}
% Aytan-Aktug et al. (2020), mSystems 5(1) -- neural-network AMR prediction across multiple bacterial species.
% The entry had an empty citation key and could not be cited; the key follows the AuthorYear scheme of the neighbouring entries.
% The DOI is the article DOI (the export pointed at a supplementary spreadsheet); the page field holds the article id that appears in the DOI and URL.
@article{AytanAktug2020,
   abstract = {Machine learning is a proven method to predict AMR; however, the performance of any machine learning model depends on the quality of the input data. Therefore, we evaluated different methods of representing information about mutations as well as mobilizable genes, so that the information can serve as input for a robust model. We combined data from multiple bacterial species in order to develop species-independent machine learning models that can predict resistance profiles for multiple antimicrobials and species with high performance. Machine learning has proven to be a powerful method to predict antimicrobial resistance (AMR) without using prior knowledge for selected bacterial species-antimicrobial combinations. To date, only species-specific machine learning models have been developed, and to the best of our knowledge, the inclusion of information from multiple species has not been attempted. The aim of this study was to determine the feasibility of including information from multiple bacterial species to predict AMR for an individual species, since this may make it easier to train and update resistance predictions for multiple species and may lead to improved predictions. Whole-genome sequence data and susceptibility profiles from 3,528 Mycobacterium tuberculosis , 1,694 Escherichia coli , 658 Salmonella enterica , and 1,236 Staphylococcus aureus isolates were included. We developed machine learning models trained by the features of the PointFinder and ResFinder programs detected to predict binary (susceptible/resistant) AMR profiles. We tested four feature representation methods to determine the most efficient way for introducing features into the models. When training the model only on the Mycobacterium tuberculosis isolates, high prediction performances were obtained for the six AMR profiles included.
By adding information on ciprofloxacin from the additional 3,588 isolates, there was no reduction in performance for the other antimicrobials but an increased performance for ciprofloxacin AMR profile prediction for Mycobacterium tuberculosis and Escherichia coli . In conclusion, the species-independent models can predict multi-AMR profiles for multiple species without losing any robustness.  IMPORTANCE Machine learning is a proven method to predict AMR; however, the performance of any machine learning model depends on the quality of the input data. Therefore, we evaluated different methods of representing information about mutations as well as mobilizable genes, so that the information can serve as input for a robust model. We combined data from multiple bacterial species in order to develop species-independent machine learning models that can predict resistance profiles for multiple antimicrobials and species with high performance.},
   author = {Aytan-Aktug, D. and Clausen, P. T. L. C. and Bortolaia, V. and Aarestrup, F. M. and Lund, O.},
   doi = {10.1128/mSystems.00774-19},
   issn = {2379-5077},
   journal = {mSystems},
   keywords = {AMR,antimicrobial resistance,machine learning,neural networks},
   month = feb,
   number = {1},
   pages = {e00774-19},
   pmid = {31964771},
   publisher = {American Society for Microbiology},
   title = {Prediction of Acquired Antimicrobial Resistance for Multiple Bacterial Species Using Neural Networks},
   volume = {5},
   url = {https://journals.asm.org/doi/10.1128/msystems.00774-19},
   year = {2020},
}
% Muzio et al. (2023), Bioinformatics 39(6) -- networkGWAS, a network-based genome-wide association method.
% The tool name is brace-protected so sentence-casing styles keep its capitalisation; the page field holds the article id from the DOI.
@article{Muzio2023,
   abstract = {Motivation: While the search for associations between genetic markers and complex traits has led to the discovery of tens of thousands of trait-related genetic variants, the vast majority of these only explain a small fraction of the observed phenotypic variation. One possible strategy to overcome this while leveraging biological prior is to aggregate the effects of several genetic markers and to test entire genes, pathways or (sub)networks of genes for association to a phenotype. The latter, network-based genome-wide association studies, in particular suffer from a vast search space and an inherent multiple testing problem. As a consequence, current approaches are either based on greedy feature selection, thereby risking that they miss relevant associations, or neglect doing a multiple testing correction, which can lead to an abundance of false positive findings. Results: To address the shortcomings of current approaches of network-based genome-wide association studies, we propose networkGWAS, a computationally efficient and statistically sound approach to network-based genome-wide association studies using mixed models and neighborhood aggregation. It allows for population structure correction and for well-calibrated P-values, which are obtained through circular and degree-preserving network permutations. networkGWAS successfully detects known associations on diverse synthetic phenotypes, as well as known and novel genes in phenotypes from Saccharomyces cerevisiae and Homo sapiens. It thereby enables the systematic combination of gene-based genome-wide association studies with biological network information.},
   author = {Muzio, Giulia and O'Bray, Leslie and Meng-Papaxanthos, Laetitia and Klatt, Juliane and Fischer, Krista and Borgwardt, Karsten},
   doi = {10.1093/bioinformatics/btad370},
   issn = {1367-4811},
   journal = {Bioinformatics},
   month = jun,
   number = {6},
   pages = {btad370},
   pmid = {37285313},
   publisher = {Oxford University Press},
   title = {{networkGWAS}: a network-based approach to discover genetic associations},
   volume = {39},
   url = {https://doi.org/10.1093/bioinformatics/btad370},
   year = {2023},
}
% Yang and Wu (2022), BMC Bioinformatics 23(Suppl 4):131 -- pan-genome-based feature selection for AMR prediction.
% Same work as the entry keyed yang2022enhancing in this file; both keys are kept so existing citations keep resolving.
% The DOI drops the "/FIGURES/4" suffix left by the export; the issue and article number match the sibling entry.
@article{Yang2022,
   abstract = {Background: Predicting which pathogens might exhibit antimicrobial resistance (AMR) based on genomics data is one of the promising ways to swiftly and precisely identify AMR pathogens. Currently, the most widely used genomics approach is through identifying known AMR genes from genomic information in order to predict whether a pathogen might be resistant to certain antibiotic drugs. The list of known AMR genes, however, is still far from comprehensive and may result in inaccurate AMR pathogen predictions. We thus felt the need to expand the AMR gene set and proposed a pan-genome-based feature selection method to identify potential gene sets for AMR prediction purposes. Results: By building pan-genome datasets and extracting gene presence/absence patterns from four bacterial species, each with more than 2000 strains, we showed that machine learning models built from pan-genome data can be very promising for predicting AMR pathogens. The gene set selected by the eXtreme Gradient Boosting (XGBoost) feature selection approach further improved prediction outcomes, and an incremental approach selecting subsets of XGBoost-selected features brought the machine learning model performance to the next level. Investigating selected gene sets revealed that on average about 50\% of genes had no known function and very few of them were known AMR genes, indicating the potential of the selected gene sets to expand resistance gene repertoires. Conclusions: We demonstrated that a pan-genome-based feature selection approach is suitable for building machine learning models for predicting AMR pathogens. The extracted gene sets may provide future clues to expand our knowledge of known AMR genes and provide novel hypotheses for inferring bacterial AMR mechanisms.},
   author = {Yang, Ming-Ren and Wu, Yu-Wei},
   doi = {10.1186/s12859-022-04666-2},
   issn = {1471-2105},
   journal = {BMC Bioinformatics},
   keywords = {Antimicrobial resistance,Feature selection,Hypothetical proteins,Pan-genome,XGBoost,eXtreme gradient boosting},
   month = apr,
   number = {Suppl 4},
   pages = {131},
   pmid = {35428201},
   publisher = {BioMed Central Ltd},
   title = {Enhancing predictions of antimicrobial resistance of pathogens by expanding the potential resistance gene repertoire using a pan-genome-based feature selection approach},
   volume = {23},
   url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-022-04666-2},
   year = {2022},
}
% Li et al. (2020), Frontiers in Microbiology 11 -- PARMAP, a pan-genome-based framework for AMR prediction.
% The page field holds the article number taken from the DOI suffix; the acronym is brace-protected against style recasing.
@article{Li2020,
   abstract = {Antimicrobial resistance (AMR) has emerged as one of the most urgent global threats to public health. Accurate detection of AMR phenotypes is critical for reducing the spread of AMR strains. Here, we developed PARMAP (Prediction of Antimicrobial Resistance by MAPping genetic alterations in pan-genome) to predict AMR phenotypes and to identify AMR-associated genetic alterations based on the pan-genome of bacteria by utilizing machine learning algorithms. When we applied PARMAP to 1,597 Neisseria gonorrhoeae strains, it successfully predicted their AMR phenotypes based on a pan-genome analysis. Furthermore, it identified 328 genetic alterations in 23 known AMR genes and discovered many new AMR-associated genetic alterations in ciprofloxacin-resistant N. gonorrhoeae, and it clearly indicated the genetic heterogeneity of AMR genes in different subtypes of resistant N. gonorrhoeae. Additionally, PARMAP performed well in predicting the AMR phenotypes of Mycobacterium tuberculosis and Escherichia coli, indicating the robustness of the PARMAP framework. In conclusion, PARMAP not only precisely predicts the AMR of a population of strains of a given species but also uses whole-genome sequencing data to prioritize candidate AMR-associated genetic alterations based on their likelihood of contributing to AMR. Thus, we believe that PARMAP will accelerate investigations into AMR mechanisms in other human pathogens.},
   author = {Li, Xuefei and Lin, Jingxia and Hu, Yongfei and Zhou, Jiajian},
   doi = {10.3389/fmicb.2020.578795},
   issn = {1664-302X},
   journal = {Frontiers in Microbiology},
   keywords = {AMR prediction,Neisseria gonorrhoeae,antibiotic resistance genes,antimicrobial resistance (AMR),machine learning (ML),pan-genome},
   month = oct,
   pages = {578795},
   publisher = {Frontiers Media S.A.},
   title = {{PARMAP}: A Pan-Genome-Based Computational Framework for Predicting Antimicrobial Resistance},
   volume = {11},
   year = {2020},
}
% Moradigaravand et al. (2018), PLOS Computational Biology 14(12):e1006258 -- machine-learning prediction of antibiotic resistance in E. coli from pan-genome data.
% The placeholder ISBN from the export is dropped: this is a journal article and the ISSN already identifies the venue.
@article{Moradigaravand2018,
   abstract = {The emergence of microbial antibiotic resistance is a global health threat. In clinical settings, the key to controlling spread of resistant strains is accurate and rapid detection. As traditional culture-based methods are time consuming, genetic approaches have recently been developed for this task. The detection of antibiotic resistance is typically made by measuring a few known determinants previously identified from genome sequencing, and thus requires the prior knowledge of its biological mechanisms. To overcome this limitation, we employed machine learning models to predict resistance to 11 compounds across four classes of antibiotics from existing and novel whole genome sequences of 1936 E. coli strains. We considered a range of methods, and examined population structure, isolation year, gene content, and polymorphism information as predictors. Gradient boosted decision trees consistently outperformed alternative models with an average accuracy of 0.91 on held-out data (range 0.81–0.97). While the best models most frequently employed gene content, an average accuracy score of 0.79 could be obtained using population structure information alone. Single nucleotide variation data were less useful, and significantly improved prediction only for two antibiotics, including ciprofloxacin. These results demonstrate that antibiotic resistance in E. coli can be accurately predicted from whole genome sequences without a priori knowledge of mechanisms, and that both genomic and epidemiological data can be informative. This paves way to integrating machine learning approaches into diagnostic tools in the clinic.},
   author = {Moradigaravand, Danesh and Palm, Martin and Farewell, Anne and Mustonen, Ville and Warringer, Jonas and Parts, Leopold},
   doi = {10.1371/journal.pcbi.1006258},
   issn = {1553-7358},
   journal = {PLOS Computational Biology},
   keywords = {Antibiotic resistance,Antibiotics,Decision tree learning,Forecasting,Genetics,Genomics,Machine learning,Single nucleotide polymorphisms},
   month = dec,
   number = {12},
   pages = {e1006258},
   pmid = {30550564},
   publisher = {Public Library of Science},
   title = {Prediction of antibiotic resistance in {Escherichia coli} from large-scale pan-genome data},
   volume = {14},
   url = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1006258},
   year = {2018},
}
% Pei et al. (2024), Microbiome 12(1) -- ARGNet, deep neural networks for identifying antibiotic resistance genes.
% The tool name is brace-protected so sentence-casing styles do not render it as "Argnet".
@article{pei2024argnet,
  title={{ARGNet}: using deep neural networks for robust identification and classification of antibiotic resistance genes from sequences},
  author={Pei, Yao and Shum, Marcus Ho-Hin and Liao, Yunshi and Leung, Vivian W and Gong, Yu-Nong and Smith, David K and Yin, Xiaole and Guan, Yi and Luo, Ruibang and Zhang, Tong and others},
  journal={Microbiome},
  volume={12},
  number={1},
  pages={1--17},
  year={2024},
  publisher={BioMed Central}
}
% Jaillard et al. (2018), PLoS Genetics 14(11):e1007758 -- k-mer based bacterial GWAS method.
% Publisher name and location are separate fields; the export had fused them into publisher.
@article{jaillard2018fast,
  title={A fast and agnostic method for bacterial genome-wide association studies: Bridging the gap between k-mers and genetic events},
  author={Jaillard, Magali and Lima, Leandro and Tournoud, Maud and Mah{\'e}, Pierre and Van Belkum, Alex and Lacroix, Vincent and Jacob, Laurent},
  journal={PLoS Genetics},
  volume={14},
  number={11},
  pages={e1007758},
  year={2018},
  publisher={Public Library of Science},
  address={San Francisco, CA, USA}
}
% Brynildsrud et al. (2016), Genome Biology 17 -- Scoary, gene scoring for microbial pan-genome-wide association studies.
% The tool name is brace-protected so sentence-casing styles keep the capital.
@article{brynildsrud2016rapid,
  title={Rapid scoring of genes in microbial pan-genome-wide association studies with {Scoary}},
  author={Brynildsrud, Ola and Bohlin, Jon and Scheffer, Lonneke and Eldholm, Vegard},
  journal={Genome Biology},
  volume={17},
  pages={1--9},
  year={2016},
  publisher={Springer}
}

% Charitou, Bryan and Lynn (2016), Genetics Selection Evolution 48 -- biological networks for genomics data.
@article{charitou2016using,
  author    = {Charitou, Theodosia and Bryan, Kenneth and Lynn, David J},
  title     = {Using biological networks to integrate, visualize and analyze genomics data},
  journal   = {Genetics Selection Evolution},
  year      = {2016},
  volume    = {48},
  pages     = {1--12},
  publisher = {Springer},
}

% Schlitt and Brazma (2007), BMC Bioinformatics 8 -- approaches to gene regulatory network modelling.
% The journal name uses its proper capitalisation, matching the "BMC Bioinformatics" spelling used by the Yang2022 entry in this file.
@article{schlitt2007current,
  title={Current approaches to gene regulatory network modelling},
  author={Schlitt, Thomas and Brazma, Alvis},
  journal={BMC Bioinformatics},
  volume={8},
  pages={1--22},
  year={2007},
  publisher={Springer}
}

% Vernikos et al. (2015), Current Opinion in Microbiology 23:148--154 -- ten years of pan-genome analyses.
% Tettelin's given name carries the accent, as in the medini2005microbial entry of this file.
@article{vernikos2015ten,
  title={Ten years of pan-genome analyses},
  author={Vernikos, George and Medini, Duccio and Riley, David R and Tettelin, Herv{\'e}},
  journal={Current Opinion in Microbiology},
  volume={23},
  pages={148--154},
  year={2015},
  publisher={Elsevier}
}

% Smoot et al. (2011), Bioinformatics 27(3):431--432 -- Cytoscape 2.8 release paper.
@article{smoot2011cytoscape,
  author    = {Smoot, Michael E and Ono, Keiichiro and Ruscheinski, Johannes and Wang, Peng-Liang and Ideker, Trey},
  title     = {Cytoscape 2.8: new features for data integration and network visualization},
  journal   = {Bioinformatics},
  year      = {2011},
  volume    = {27},
  number    = {3},
  pages     = {431--432},
  publisher = {Oxford University Press},
}


% Piper et al. (2024), mSphere -- accessory genome dynamics of Staphylococcus aureus.
% The page field is an article id, not a range, so it takes a single hyphen; glued initials are split so styles abbreviate them correctly.
@article{piper2024evolutionary,
  title={Evolutionary dynamics of the accessory genomes of {Staphylococcus aureus}},
  author={Piper, Kathryn R and Ikhimiukor, Odion O and Souza, Stephanie S. R. and Garcia-Aroca, Teddy and Andam, Cheryl P},
  journal={mSphere},
  pages={e00751-23},
  year={2024},
  publisher={American Society for Microbiology}
}

% Her, Lin and Wu (2021), BMC Bioinformatics 22 -- PangenomeNet, a pan-genome-based network of the E. coli resistome.
% Tool and species names are brace-protected against sentence-casing styles.
@article{her2021pangenomenet,
  title={{PangenomeNet}: a pan-genome-based network reveals functional modules on antimicrobial resistome for {Escherichia coli} strains},
  author={Her, Hsuan-Lin and Lin, Po-Ting and Wu, Yu-Wei},
  journal={BMC Bioinformatics},
  volume={22},
  pages={1--19},
  year={2021},
  publisher={Springer}
}


% Beavan, Domingo-Sananes and McInerney (2024), PNAS 121(1):e2304934120 -- predictability in prokaryotic pangenome evolution.
% The first author's initials are separated ("J. S.") so name-abbreviating styles do not drop the second one.
@article{beavan2024contingency,
  title={Contingency, repeatability, and predictability in the evolution of a prokaryotic pangenome},
  author={Beavan, Alan J. S. and Domingo-Sananes, Maria Rosa and McInerney, James O},
  journal={Proceedings of the National Academy of Sciences},
  volume={121},
  number={1},
  pages={e2304934120},
  year={2024},
  publisher={National Academy of Sciences}
}

% Yang and Wu (2022), BMC Bioinformatics 23(Suppl 4):131 -- pan-genome-based feature selection for AMR prediction.
% Same work as the entry keyed Yang2022 in this file; both keys are kept so existing citations keep resolving.
@article{yang2022enhancing,
  title={Enhancing predictions of antimicrobial resistance of pathogens by expanding the potential resistance gene repertoire using a pan-genome-based feature selection approach},
  author={Yang, Ming-Ren and Wu, Yu-Wei},
  journal={BMC Bioinformatics},
  volume={23},
  number={Suppl 4},
  pages={131},
  year={2022},
  publisher={Springer}
}

% Nadeau (2001), Nature Reviews Genetics 2(3):165--174 -- modifier genes in mice and humans.
% Publisher name and location are separate fields; the export had fused them into publisher.
@article{nadeau2001modifier,
  title={Modifier genes in mice and humans},
  author={Nadeau, Joseph H},
  journal={Nature Reviews Genetics},
  volume={2},
  number={3},
  pages={165--174},
  year={2001},
  publisher={Nature Publishing Group},
  address={London, UK}
}


% James, Witten, Hastie and Tibshirani (2013), Springer -- An Introduction to Statistical Learning.
% The language name R is brace-protected and the trailing space in the title removed; the BibSonomy bookkeeping fields are kept as exported.
@book{James2013,
  added-at = {2019-10-12T20:03:56.000+0200},
  author = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert},
  biburl = {https://www.bibsonomy.org/bibtex/2444186c86d18bddb4433c12fa126f6be/lopusz_kdd},
  interhash = {b3febabdc45a8629023cee7323dfbd86},
  intrahash = {444186c86d18bddb4433c12fa126f6be},
  keywords = {general_machine_learning},
  publisher = {Springer},
  timestamp = {2019-10-12T23:45:37.000+0200},
  title = {An Introduction to Statistical Learning: with Applications in {R}},
  url = {https://faculty.marshall.usc.edu/gareth-james/ISL/},
  year = 2013
}

% Horvath (2011), Springer, New York -- Weighted Network Analysis (book).
% The exported series value was a library-catalogue label rather than a book series, so it is dropped; the city moves from publisher to address.
% The given name is spelled out as in the zhang2005general entry of this file.
@book{horvath2011weighted,
  title={Weighted Network Analysis: Applications in Genomics and Systems Biology},
  author={Horvath, Steve},
  isbn={9781441988195},
  lccn={2011925163},
  url={https://books.google.com.lb/books?id=ZCh06NgMFesC},
  year={2011},
  publisher={Springer},
  address={New York}
}


% Second definition of ren2022prediction removed here: an identical entry with the same key appears earlier in this file, and BibTeX/Biber report repeated keys as errors.


% Nsubuga et al. (2024), BMC Genomics 25(1):287 -- generalizability of ML-based AMR prediction in E. coli across African countries.
% Proper nouns in the title are brace-protected against sentence-casing styles.
@article{nsubuga2024generalizability,
  title={Generalizability of machine learning in predicting antimicrobial resistance in {E. coli}: a multi-country case study in {Africa}},
  author={Nsubuga, Mike and Galiwango, Ronald and Jjingo, Daudi and Mboowa, Gerald},
  journal={BMC Genomics},
  volume={25},
  number={1},
  pages={287},
  year={2024},
  publisher={Springer}
}


% Ren et al. (2022), Antibiotics 11(11):1611 -- deep transfer learning for AMR prediction.
@article{ren2022deep,
  author    = {Ren, Yunxiao and Chakraborty, Trinad and Doijad, Swapnil and Falgenhauer, Linda and Falgenhauer, Jane and Goesmann, Alexander and Schwengers, Oliver and Heider, Dominik},
  title     = {Deep transfer learning enables robust prediction of antimicrobial resistance for novel antibiotics},
  journal   = {Antibiotics},
  year      = {2022},
  volume    = {11},
  number    = {11},
  pages     = {1611},
  publisher = {MDPI},
}


% Tang et al. (2022), International Journal of Antimicrobial Agents 60(5-6):106684 -- systematic review and meta-analysis of ML for AMR prediction.
@article{tang2022machine,
  author    = {Tang, Rui and Luo, Rui and Tang, Shiwei and Song, Haoxin and Chen, Xiujuan},
  title     = {Machine learning in predicting antimicrobial resistance: a systematic review and meta-analysis},
  journal   = {International Journal of Antimicrobial Agents},
  year      = {2022},
  volume    = {60},
  number    = {5-6},
  pages     = {106684},
  publisher = {Elsevier},
}


% Provost (2000), AAAI 2000 workshop on imbalanced data sets -- "Machine learning from imbalanced data sets 101".
% The typographic apostrophe in booktitle is replaced by ASCII for classic BibTeX; the number field only repeated the year
% and makes standard styles warn that volume and number cannot both be used, so it is dropped.
@inproceedings{provost2000machine,
  title={Machine learning from imbalanced data sets 101},
  author={Provost, Foster},
  booktitle={Proceedings of the AAAI'2000 workshop on imbalanced data sets},
  volume={68},
  pages={1--3},
  year={2000},
  organization={AAAI Press}
}


% ValizadehAslani et al. (2020), Biology 9(11):365 -- amino-acid k-mer features for quantitative AMR prediction.
% The acronym is brace-protected so sentence-casing styles do not lowercase it.
@article{valizadehaslani2020amino,
  title={Amino acid K-mer feature extraction for quantitative antimicrobial resistance ({AMR}) prediction by machine learning and model interpretation for biological insights},
  author={ValizadehAslani, Taha and Zhao, Zhengqiao and Sokhansanj, Bahrad A and Rosen, Gail L},
  journal={Biology},
  volume={9},
  number={11},
  pages={365},
  year={2020},
  publisher={MDPI}
}


% Yang and Wu (2023), Computational and Structural Biotechnology Journal 21:769--779 -- cross-validated feature selection for AMR biomarkers.
% Acronyms are brace-protected so sentence-casing styles do not lowercase them.
@article{yang2023cross,
  title={A Cross-Validated Feature Selection ({CVFS}) approach for extracting the most parsimonious feature sets and discovering potential antimicrobial resistance ({AMR}) biomarkers},
  author={Yang, Ming-Ren and Wu, Yu-Wei},
  journal={Computational and Structural Biotechnology Journal},
  volume={21},
  pages={769--779},
  year={2023},
  publisher={Elsevier}
}


% Shannon, "A mathematical theory of communication", as published in ACM SIGMOBILE Mobile Computing and Communications Review 5(1):3--55 (2001).
% Publisher name and location are separate fields; the journal title uses its proper capitalisation.
@article{shannon2001mathematical,
  title={A mathematical theory of communication},
  author={Shannon, Claude Elwood},
  journal={ACM SIGMOBILE Mobile Computing and Communications Review},
  volume={5},
  number={1},
  pages={3--55},
  year={2001},
  publisher={ACM},
  address={New York, NY, USA}
}


% Peng et al. (2018), GigaScience 7(11):giy121 -- MetaPGN pipeline for annotated pangenome networks.
% The tool name is brace-protected so sentence-casing styles do not render it as "Metapgn".
@article{peng2018metapgn,
  title={{MetaPGN}: a pipeline for construction and graphical visualization of annotated pangenome networks},
  author={Peng, Ye and Tang, Shanmei and Wang, Dan and Zhong, Huanzi and Jia, Huijue and Cai, Xianghang and Zhang, Zhaoxi and Xiao, Minfeng and Yang, Huanming and Wang, Jian and others},
  journal={GigaScience},
  volume={7},
  number={11},
  pages={giy121},
  year={2018},
  publisher={Oxford University Press}
}


% Medini et al. (2005), Current Opinion in Genetics & Development 15(6):589--594 -- the microbial pan-genome.
% The journal title uses its proper capitalisation; styles print journal names as stored.
@article{medini2005microbial,
  title={The microbial pan-genome},
  author={Medini, Duccio and Donati, Claudio and Tettelin, Herv{\'e} and Masignani, Vega and Rappuoli, Rino},
  journal={Current Opinion in Genetics \& Development},
  volume={15},
  number={6},
  pages={589--594},
  year={2005},
  publisher={Elsevier}
}


% Zhang and Horvath (2005), Statistical Applications in Genetics and Molecular Biology 4(1) -- weighted gene co-expression network analysis framework.
% The journal title uses its proper capitalisation. NOTE(review): no pages or article number is recorded -- confirm against the publisher record.
@article{zhang2005general,
  title={A general framework for weighted gene co-expression network analysis},
  author={Zhang, Bin and Horvath, Steve},
  journal={Statistical Applications in Genetics and Molecular Biology},
  volume={4},
  number={1},
  year={2005},
  publisher={De Gruyter}
}

% Aguilar-Vega et al. (2023), Scientific Reports 13(1):2931 -- similarity networks for AMR patterns across farms.
% Publisher name and location are separate fields; the export had fused them into publisher.
@article{aguilar2023tool,
  title={A tool to enhance antimicrobial stewardship using similarity networks to identify antimicrobial resistance patterns across farms},
  author={Aguilar-Vega, Cecilia and Scoglio, Caterina and Clavijo, Mar{\'\i}a J and Robbins, Rebecca and Karriker, Locke and Liu, Xin and Mart{\'\i}nez-L{\'o}pez, Beatriz},
  journal={Scientific Reports},
  volume={13},
  number={1},
  pages={2931},
  year={2023},
  publisher={Nature Publishing Group},
  address={London, UK}
}


% Barabasi and Oltvai (2004), Nature Reviews Genetics 5(2):101--113 -- network biology.
% Author names carry their accents as BibTeX special characters; publisher and location are separate fields.
@article{barabasi2004network,
  title={Network biology: understanding the cell's functional organization},
  author={Barab{\'a}si, Albert-L{\'a}szl{\'o} and Oltvai, Zolt{\'a}n N},
  journal={Nature Reviews Genetics},
  volume={5},
  number={2},
  pages={101--113},
  year={2004},
  publisher={Nature Publishing Group},
  address={London, UK}
}


% Muzio, O'Bray and Borgwardt (2021), Briefings in Bioinformatics 22(2):1515--1530 -- deep learning for biological network analysis.
% The surname uses an ASCII apostrophe; the typographic one is non-ASCII and unsafe for classic BibTeX.
@article{muzio2021biological,
  title={Biological network analysis with deep learning},
  author={Muzio, Giulia and O'Bray, Leslie and Borgwardt, Karsten},
  journal={Briefings in Bioinformatics},
  volume={22},
  number={2},
  pages={1515--1530},
  year={2021},
  publisher={Oxford University Press}
}


% Liu et al. (2020), Physics Reports 846:1--66 -- computational network biology.
@article{liu2020computational,
  author    = {Liu, Chuang and Ma, Yifang and Zhao, Jing and Nussinov, Ruth and Zhang, Yi-Cheng and Cheng, Feixiong and Zhang, Zi-Ke},
  title     = {Computational network biology: data, models, and applications},
  journal   = {Physics Reports},
  year      = {2020},
  volume    = {846},
  pages     = {1--66},
  publisher = {Elsevier},
}


% Lees et al. (2018), Bioinformatics 34(24):4310--4312 -- pyseer tool for microbial pangenome-wide association studies.
@article{lees2018pyseer,
  author    = {Lees, John A and Galardini, Marco and Bentley, Stephen D and Weiser, Jeffrey N and Corander, Jukka},
  title     = {Pyseer: a comprehensive tool for microbial pangenome-wide association studies},
  journal   = {Bioinformatics},
  year      = {2018},
  volume    = {34},
  number    = {24},
  pages     = {4310--4312},
  publisher = {Oxford University Press},
}


% Arango-Argoty et al. (2018), Microbiome 6 -- DeepARG, deep learning for antibiotic resistance gene prediction from metagenomic data.
% The tool name is brace-protected so sentence-casing styles do not render it as "Deeparg".
@article{arango2018deeparg,
  title={{DeepARG}: a deep learning approach for predicting antibiotic resistance genes from metagenomic data},
  author={Arango-Argoty, Gustavo and Garner, Emily and Pruden, Amy and Heath, Lenwood S and Vikesland, Peter and Zhang, Liqing},
  journal={Microbiome},
  volume={6},
  pages={1--15},
  year={2018},
  publisher={Springer}
}


% Kim et al. (2022), Clinical Microbiology Reviews 35(3) -- machine learning for AMR prediction: practice, limitations, clinical perspective.
% Typed as an article because it has journal, volume and issue; standard styles do not print the journal field for misc entries.
% The page field holds the article id taken from the DOI suffix.
@article{Kim2022,
   abstract = {SUMMARY Antimicrobial resistance (AMR) is a global health crisis that poses a great threat to modern medicine. Effective prevention strategies are urgently required to slow the emergence and further dissemination of AMR. Given the availability of data sets encompassing hundreds or thousands of pathogen genomes, machine learning (ML) is increasingly being used to predict resistance to different antibiotics in pathogens based on gene content and genome composition. A key objective of this work is to advocate for the incorporation of ML into front-line settings but also highlight the further refinements that are necessary to safely and confidently incorporate these methods. The question of what to predict is not trivial given the existence of different quantitative and qualitative laboratory measures of AMR. ML models typically treat genes as independent predictors, with no consideration of structural and functional linkages; they also may not be accurate when new mutational variants of known AMR genes emerge. Finally, to have the technology trusted by end users in public health settings, ML models need to be transparent and explainable to ensure that the basis for prediction is clear. We strongly advocate that the next set of AMR-ML studies should focus on the refinement of these limitations to be able to bridge the gap to diagnostic implementation.},
   author = {Kim, Jee In and Maguire, Finlay and Tsang, Kara K. and Gouliouris, Theodore and Peacock, Sharon J. and McAllister, Tim A. and McArthur, Andrew G. and Beiko, Robert G.},
   doi = {10.1128/cmr.00179-21},
   issn = {1098-6618},
   journal = {Clinical Microbiology Reviews},
   keywords = {antimicrobial resistance,machine learning},
   month = sep,
   number = {3},
   pages = {e00179-21},
   pmid = {35612324},
   publisher = {American Society for Microbiology},
   title = {Machine Learning for Antimicrobial Resistance Prediction: Current Practice, Limitations, and Clinical Perspective},
   volume = {35},
   year = {2022},
}
% Mosquera-Rendon et al. (2023), Microorganisms 11(12):2866 -- systematic review of GWAS approaches for antibiotic resistance.
% The entry had an empty citation key and could not be cited; the key follows the AuthorYear scheme of the neighbouring entries.
% Accented letters in author names use LaTeX escapes, as elsewhere in this file; HTML paragraph tags are stripped from the abstract.
@article{MosqueraRendon2023,
   abstract = {Antibiotic resistance is a significant threat to public health worldwide. Genome-wide association studies (GWAS) have emerged as a powerful tool to identify genetic variants associated with this antibiotic resistance. By analyzing large datasets of bacterial genomes, GWAS can provide valuable insights into the resistance mechanisms and facilitate the discovery of new drug targets. The present study aimed to undertake a systematic review of different GWAS approaches used for detecting genetic variants associated with antibiotic resistance. We comprehensively searched the PubMed and Scopus databases to identify relevant studies published from 2013 to February 2023. A total of 40 studies met our inclusion criteria. These studies explored a wide range of bacterial species, antibiotics, and study designs. Notably, most of the studies were centered around human pathogens such as Mycobacterium tuberculosis, Escherichia coli, Neisseria gonorrhoeae, and Staphylococcus aureus. The review seeks to explore the several GWAS approaches utilized to investigate the genetic mechanisms associated with antibiotic resistance. Furthermore, it examines the contributions of GWAS approaches in identifying resistance-associated genetic variants through binary and continuous phenotypes. Overall, GWAS holds great potential to enhance our understanding of bacterial resistance and improve strategies to combat infectious diseases.},
   author = {Mosquera-Rend{\'o}n, Jeanneth and Moreno-Herrera, Claudia Ximena and Robledo, Jaime and Hurtado-P{\'a}ez, Uriel},
   doi = {10.3390/microorganisms11122866},
   issn = {2076-2607},
   journal = {Microorganisms},
   keywords = {antimicrobial resistance,bacteria,genetic variants,genome-wide association study},
   month = nov,
   number = {12},
   pages = {2866},
   publisher = {Multidisciplinary Digital Publishing Institute (MDPI)},
   title = {Genome-Wide Association Studies ({GWAS}) Approaches for the Detection of Genetic Variants Associated with Antibiotic Resistance: A Systematic Review},
   volume = {11},
   url = {https://www.mdpi.com/2076-2607/11/12/2866},
   year = {2023},
}
@article{Davis2016,
   abstract = {The emergence and spread of antimicrobial resistance (AMR) mechanisms in bacterial pathogens, coupled with the dwindling number of effective antibiotics, has created a global health crisis. Being able to identify the genetic mechanisms of AMR and predict the resistance phenotypes of bacterial pathogens prior to culturing could inform clinical decision-making and improve reaction time. At PATRIC (http://patricbrc.org/), we have been collecting bacterial genomes with AMR metadata for several years. In order to advance phenotype prediction and the identification of genomic regions relating to AMR, we have updated the PATRIC FTP server to enable access to genomes that are binned by their AMR phenotypes, as well as metadata including minimum inhibitory concentrations. Using this infrastructure, we custom built AdaBoost (adaptive boosting) machine learning classifiers for identifying carbapenem resistance in Acinetobacter baumannii, methicillin resistance in Staphylococcus aureus, and beta-lactam and co-trimoxazole resistance in Streptococcus pneumoniae with accuracies ranging from 88-99\%. We also did this for isoniazid, kanamycin, ofloxacin, rifampicin, and streptomycin resistance in Mycobacterium tuberculosis, achieving accuracies ranging from 71-88\%. This set of classifiers has been used to provide an initial framework for species-specific AMR phenotype and genomic feature prediction in the RAST and PATRIC annotation services.},
   author = {Davis, James J. and Boisvert, S{\'e}bastien and Brettin, Thomas and Kenyon, Ronald W. and Mao, Chunhong and Olson, Robert and Overbeek, Ross and Santerre, John and Shukla, Maulik and Wattam, Alice R. and Will, Rebecca and Xia, Fangfang and Stevens, Rick},
   doi = {10.1038/srep27930},
   issn = {2045-2322},
   journal = {Scientific Reports},
   month = jun,
   pmid = {27297683},
   publisher = {Nature Publishing Group},
   title = {Antimicrobial Resistance Prediction in {PATRIC} and {RAST}},
   volume = {6},
   year = {2016},
}
@article{Lv2021,
   abstract = {The wide use and abuse of antibiotics could make antimicrobial resistance (AMR) an increasingly serious issue that threatens global health and imposes an enormous burden on society and the economy. To avoid the crisis of AMR, we have to fundamentally change our approach. Artificial intelligence (AI) represents a new paradigm to combat AMR. Thus, various AI approaches to this problem have sprung up, some of which may be considered successful cases of domain-specific AI applications in AMR. However, to the best of our knowledge, there is no systematic review illustrating the use of these AI-based applications for AMR. Therefore, this review briefly introduces how to employ AI technology against AMR by using the predictive AMR model, the rational use of antibiotics, antimicrobial peptides (AMPs) and antibiotic combinations, as well as future research directions.},
   author = {Lv, Ji and Deng, Senyi and Zhang, Le},
   doi = {10.1016/j.bsheal.2020.08.003},
   issn = {2590-0536},
   number = {1},
   journal = {Biosafety and Health},
   keywords = {Antimicrobial resistance,Artificial intelligence,Clinical decision support systems,Drug combinations,Whole-genome sequencing},
   month = feb,
   pages = {22--31},
   publisher = {Elsevier B.V.},
   title = {A review of artificial intelligence applications for antimicrobial resistance},
   volume = {3},
   year = {2021},
}
@article{Hyun2023,
   abstract = {Surveillance programs for managing antimicrobial resistance (AMR) have yielded thousands of genomes suited for data-driven mechanism discovery. We present a workflow integrating pangenomics, gene annotation, and machine learning to identify AMR genes at scale. When applied to 12 species, 27,155 genomes, and 69 drugs, we 1) find AMR gene transfer mostly confined within related species, with 925 genes in multiple species but just eight in multiple phylogenetic classes, 2) demonstrate that discovery-oriented support vector machines outperform contemporary methods at recovering known AMR genes, recovering 263 genes compared to 145 by Pyseer, and 3) identify 142 AMR gene candidates. Validation of two candidates in E. coli BW25113 reveals cases of conditional resistance: ΔcycA confers ciprofloxacin resistance in minimal media with D-serine, and frdD V111D confers ampicillin resistance in the presence of ampC by modifying the overlapping promoter. We expect this approach to be adaptable to other species and phenotypes.},
   author = {Hyun, Jason C. and Monk, Jonathan M. and Szubin, Richard and Hefner, Ying and Palsson, Bernhard O.},
   doi = {10.1038/s41467-023-43549-9},
   issn = {2041-1723},
   number = {1},
   journal = {Nature Communications},
   month = dec,
   pmid = {38001096},
   publisher = {Nature Research},
   title = {Global pathogenomic analysis identifies known and candidate genetic antimicrobial resistance determinants in twelve species},
   volume = {14},
   year = {2023},
}
@article{Power2016,
   abstract = {The reduced costs of sequencing have led to whole-genome sequences for a large number of microorganisms, enabling the application of microbial genome-wide association studies (GWAS). Given the successes of human GWAS in understanding disease aetiology and identifying potential drug targets, microbial GWAS are likely to further advance our understanding of infectious diseases. These advances include insights into pressing global health problems, such as antibiotic resistance and disease transmission. In this Review, we outline the methodologies of GWAS, the current state of the field of microbial GWAS, and how lessons from human GWAS can direct the future of the field.},
   author = {Power, Robert A. and Parkhill, Julian and De Oliveira, Tulio},
   doi = {10.1038/nrg.2016.132},
   issn = {1471-0064},
   number = {1},
   journal = {Nature Reviews Genetics},
   month = dec,
   pages = {41--50},
   pmid = {27840430},
   publisher = {Nature Publishing Group},
   title = {Microbial genome-wide association studies: lessons from human {GWAS}},
   volume = {18},
   year = {2016},
}
@article{Anusha2023,
   abstract = {Antimicrobial resistance has caused chaos worldwide due to the depiction of multidrug-resistant (MDR) infective microorganisms. A thorough examination of antimicrobial resistance (AMR) genes and associated resistant mechanisms is vital to solving this problem. Clostridium difficile (C. difficile) is an opportunistic nosocomial bacterial strain that has acquired exogenous AMR genes that confer resistance to antimicrobials such as erythromycin, azithromycin, clarithromycin, rifampicin, moxifloxacin, fluoroquinolones, vancomycin, and others. A network of interactions, including 20 AMR genes, was created and analyzed. In functional enrichment analysis, Cellular components (CC), Molecular Functions (MF), and Biological Processes (BP) were discovered to have substantial involvement. Mutations in the rpl genes, which encode ribosomal proteins, confer resistance in Gram-positive bacteria. Full erythromycin and azithromycin cross-resistance can be conferred if more than one of the abovementioned genes is present. In the enriched BP, rps genes related to transcriptional regulation and biosynthesis were found. The genes belong to the rpoB gene family, which has previously been related to rifampicin resistance. The genes rpoB, gyrA, gyrB, rpoS, rpl genes, rps genes, and Van genes are thought to be the hub genes implicated in resistance in C. difficile. As a result, new medications could be developed using these genes. Overall, our observations provide a thorough understanding of C. difficile AMR mechanisms.},
   author = {Anusha, M. and Tejaswini, V. and Udhaya Kumar, S. and Prashantha, C. N. and Vasudevan, Karthick and George Priya Doss, C.},
   doi = {10.1016/j.micpath.2023.106083},
   issn = {0882-4010},
   journal = {Microbial Pathogenesis},
   keywords = {Clostridium difficile,Clustering analysis,Functional enrichment analysis,Gene ontology,Topological parameters},
   month = may,
   pages = {106083},
   publisher = {Academic Press},
   title = {Gene network interaction analysis to elucidate the antimicrobial resistance mechanisms in the {Clostridium difficile}},
   volume = {178},
   url = {https://linkinghub.elsevier.com/retrieve/pii/S088240102300116X},
   year = {2023},
}
@article{Kim2020,
   abstract = {Biological knowledge accumulated over the decades and advances in computational methods have facilitated the implementation of pan-genome analysis that aims at better understanding of genotype-phenotype associations of a specific group of organisms. Pan-genome analysis has been shown to be an effective approach to better understand a clade of pathogenic bacteria because it helps developing various and tailored therapeutic strategies on the basis of their biological similarities and differences. Here, we review recent progress in the pan-genome analysis of pathogenic bacteria. In particular, we focus on computational tools that allow streamlined pan-genome analysis. Also, various applications of pan-genome analysis including those relevant to devising strategies for the prevention and treatment of pathogenic bacteria are reviewed.},
   author = {Kim, Yeji and Gu, Changdai and Kim, Hyun Uk and Lee, Sang Yup},
   doi = {10.1016/j.copbio.2019.12.001},
   issn = {1879-0429},
   journal = {Current Opinion in Biotechnology},
   month = jun,
   pages = {54--62},
   pmid = {31891864},
   publisher = {Elsevier Ltd},
   title = {Current status of pan-genome analysis for pathogenic bacteria},
   volume = {63},
   year = {2020},
}
% Removed: exact duplicate of the Kim2020 entry defined immediately above (BibTeX reports "repeated entry" for a reused key).
@article{Kavvas2018,
   abstract = {Mycobacterium tuberculosis is a serious human pathogen threat exhibiting complex evolution of antimicrobial resistance (AMR). Accordingly, the many publicly available datasets describing its AMR characteristics demand disparate data-type analyses. Here, we develop a reference strain-agnostic computational platform that uses machine learning approaches, complemented by both genetic interaction analysis and 3D structural mutation-mapping, to identify signatures of AMR evolution to 13 antibiotics. This platform is applied to 1595 sequenced strains to yield four key results. First, a pan-genome analysis shows that M. tuberculosis is highly conserved with sequenced variation concentrated in PE/PPE/PGRS genes. Second, the platform corroborates 33 genes known to confer resistance and identifies 24 new genetic signatures of AMR. Third, 97 epistatic interactions across 10 resistance classes are revealed. Fourth, detailed structural analysis of these genes yields mechanistic bases for their selection. The platform can be used to study other human pathogens.},
   author = {Kavvas, Erol S. and Catoiu, Edward and Mih, Nathan and Yurkovich, James T. and Seif, Yara and Dillon, Nicholas and Heckmann, David and Anand, Amitesh and Yang, Laurence and Nizet, Victor and Monk, Jonathan M. and Palsson, Bernhard O.},
   doi = {10.1038/s41467-018-06634-y},
   issn = {2041-1723},
   number = {1},
   journal = {Nature Communications},
   month = oct,
   pages = {4306},
   publisher = {Nature Publishing Group},
   title = {Machine learning and structural analysis of {Mycobacterium tuberculosis} pan-genome identifies genetic signatures of antibiotic resistance},
   volume = {9},
   url = {https://www.nature.com/articles/s41467-018-06634-y},
   year = {2018},
}
@article{Sun2021,
   abstract = {Over the last decade, genome-wide association studies (GWAS) have discovered thousands of genetic variants underlying complex human diseases and agriculturally important traits. These findings have been utilized to dissect the biological basis of diseases, to develop new drugs, to advance precision medicine and to boost breeding. However, the potential of GWAS is still underexploited due to methodological limitations. Many challenges have emerged, including detecting epistasis and single-nucleotide polymorphisms (SNPs) with small effects and distinguishing causal variants from other SNPs associated through linkage disequilibrium. These issues have motivated advancements in GWAS analyses in two contrasting cultures—statistical modelling and machine learning. In this review, we systematically present the basic concepts and the benefits and limitations in both methods. We further discuss recent efforts to mitigate their weaknesses. Additionally, we summarize the state-of-the-art tools for detecting the missed signals, ultrarare mutations and gene–gene interactions and for prioritizing SNPs. Our work can offer both theoretical and practical guidelines for performing GWAS analyses and for developing further new robust methods to fully exploit the potential of GWAS.},
   author = {Sun, Shanwen and Dong, Benzhi and Zou, Quan},
   doi = {10.1093/bib/bbaa263},
   issn = {1467-5463},
   number = {4},
   journal = {Briefings in Bioinformatics},
   keywords = {Bayesian,GWAS,SNPs,epistasis,fine-map,machine learning,regression},
   month = jul,
   publisher = {Oxford University Press},
   title = {Revisiting genome-wide association studies from statistical modelling to machine learning},
   volume = {22},
   url = {https://academic.oup.com/bib/article/doi/10.1093/bib/bbaa263/5943789},
   year = {2021},
}
@article{Gola2016,
   abstract = {Complex diseases are defined to be determined by multiple genetic and environmental factors alone as well as in interactions. To analyze interactions in genetic data, many statistical methods have been suggested, with most of them relying on statistical regression models. Given the known limitations of classical methods, approaches from the machine-learning community have also become attractive. From this latter family, a fast-growing collection of methods emerged that are based on the Multifactor Dimensionality Reduction (MDR) approach. Since its first introduction, MDR has enjoyed great popularity in applications and has been extended and modified multiple times. Based on a literature search, we here provide a systematic and comprehensive overview of these suggested methods. The methods are described in detail, and the availability of implementations is listed. Most recent approaches offer to deal with large-scale data sets and rare variants, which is why we expect these methods to even gain in popularity.},
   author = {Gola, Damian and Mahachie John, Jestinah M. and Van Steen, Kristel and K{\"o}nig, Inke R.},
   doi = {10.1093/bib/bbv038},
   issn = {1477-4054},
   number = {2},
   journal = {Briefings in Bioinformatics},
   keywords = {Data mining,Epistasis,Interaction,Machine learning,Multifactor dimensionality reduction},
   month = mar,
   pages = {293--308},
   pmid = {26108231},
   publisher = {Oxford University Press},
   title = {A roadmap to multifactor dimensionality reduction methods},
   volume = {17},
   year = {2016},
}
@article{Jiang2011,
   abstract = {Background: Gene-gene epistatic interactions likely play an important role in the genetic basis of many common diseases. Recently, machine-learning and data mining methods have been developed for learning epistatic relationships from data. A well-known combinatorial method that has been successfully applied for detecting epistasis is Multifactor Dimensionality Reduction (MDR). Jiang et al. created a combinatorial epistasis learning method called BNMBL to learn Bayesian network (BN) epistatic models. They compared BNMBL to MDR using simulated data sets. Each of these data sets was generated from a model that associates two SNPs with a disease and includes 18 unrelated SNPs. For each data set, BNMBL and MDR were used to score all 2-SNP models, and BNMBL learned significantly more correct models. In real data sets, we ordinarily do not know the number of SNPs that influence phenotype. BNMBL may not perform as well if we also scored models containing more than two SNPs. Furthermore, a number of other BN scoring criteria have been developed. They may detect epistatic interactions even better than BNMBL. Although BNs are a promising tool for learning epistatic relationships from data, we cannot confidently use them in this domain until we determine which scoring criteria work best or even well when we try learning the correct model without knowledge of the number of SNPs in that model. Results: We evaluated the performance of 22 BN scoring criteria using 28,000 simulated data sets and a real Alzheimer's GWAS data set. Our results were surprising in that the Bayesian scoring criterion with large values of a hyperparameter called α performed best. This score performed better than other BN scoring criteria and MDR at recall using simulated data sets, at detecting the hardest-to-detect models using simulated data sets, and at substantiating previous results using the real Alzheimer's data set. Conclusions: We conclude that representing epistatic interactions using BN models and scoring them using a BN scoring criterion holds promise for identifying epistatic genetic variants in data. In particular, the Bayesian scoring criterion with large values of a hyperparameter α appears more promising than a number of alternatives. © 2011 Jiang et al; licensee BioMed Central Ltd.},
   author = {Jiang, Xia and Neapolitan, Richard E. and Barmada, M. Michael and Visweswaran, Shyam},
   doi = {10.1186/1471-2105-12-89},
   issn = {1471-2105},
   journal = {BMC Bioinformatics},
   month = apr,
   pmid = {21453508},
   title = {Learning genetic epistasis using {Bayesian} network scoring criteria},
   volume = {12},
   year = {2011},
}
@article{Leem2014,
   abstract = {There are many algorithms for detecting epistatic interactions in GWAS. However, most of these algorithms are applicable only for detecting two-locus interactions. Some algorithms are designed to detect only two-locus interactions from the beginning. Others do not have limits to the order of interactions, but in practice take very long time to detect higher order interactions in real data of GWAS. Even the better ones take days to detect higher order interactions in WTCCC data. We propose a fast algorithm for detection of high order epistatic interactions in GWAS. It runs k-means clustering algorithm on the set of all SNPs. Then candidates are selected from each cluster. These candidates are examined to find the causative SNPs of k-locus interactions. We use mutual information from information theory as the measure of association between genotypes and phenotypes. We tested the power and speed of our method on extensive sets of simulated data. The results show that our method has more or equal power, and runs much faster than previously reported methods. We also applied our algorithm on each of seven diseases in WTCCC data to analyze up to 5-locus interactions. It takes only a few hours to analyze 5-locus interactions in one dataset. From the results we make some interesting and meaningful observations on each disease in WTCCC data. In this study, a simple yet powerful two-step approach is proposed for fast detection of high order epistatic interaction. Our algorithm makes it possible to detect high order epistatic interactions in GWAS in a matter of hours on a PC. © 2014 Elsevier Ltd.},
   author = {Leem, Sangseob and Jeong, Hyun Hwan and Lee, Jungseob and Wee, Kyubum and Sohn, Kyung Ah},
   doi = {10.1016/j.compbiolchem.2014.01.005},
   issn = {1476-9271},
   journal = {Computational Biology and Chemistry},
   keywords = {Genome-wide association studies,High-order epistatic interactions,K-means clustering,Mutual information,WTCCC},
   pages = {19--28},
   pmid = {24581733},
   publisher = {Elsevier Ltd},
   title = {Fast detection of high-order epistatic interactions in genome-wide association studies using information theoretic measure},
   volume = {50},
   year = {2014},
}
@article{Wong2017,
   abstract = {The fitness effects of a mutation can depend, sometimes dramatically, on genetic background; this phenomenon is often referred to as "epistasis." Epistasis can have important practical consequences in the context of antimicrobial resistance (AMR). For example, genetic background plays an important role in determining the costs of resistance, and hence in whether resistance will persist in the absence of antibiotic pressure. Furthermore, interactions between resistance mutations can have important implications for the evolution of multi-drug resistance. I argue that there is a need to better characterize the extent and nature of epistasis for mutations and horizontally transferred elements conferring AMR, particularly in clinical contexts. Furthermore, I suggest that epistasis should be an important consideration in attempts to slow or limit the evolution of AMR.},
   author = {Wong, Alex},
   doi = {10.3389/fmicb.2017.00246},
   issn = {1664-302X},
   journal = {Frontiers in Microbiology},
   keywords = {Antimicrobial resistance,Compensatory evolution,Epistasis,Genetic interaction,Multi-drug resistance},
   month = feb,
   publisher = {Frontiers Research Foundation},
   title = {Epistasis and the evolution of antimicrobial resistance},
   volume = {8},
   year = {2017},
}
@article{Walakira2022,
   abstract = {Genes and gene products do not function in isolation but as components of complex networks of macromolecules through physical or biochemical interactions. Dependencies of gene mutations on genetic background (i.e., epistasis) are believed to play a role in understanding molecular underpinnings of complex diseases such as inflammatory bowel disease (IBD). However, the process of identifying such interactions is complex due to for instance the curse of high dimensionality, dependencies in the data and non-linearity. Here, we propose a novel approach for robust and computationally efficient epistasis detection. We do so by first reducing dimensionality, per gene via diffusion kernel principal components (kpc). Subsequently, kpc gene summaries are used for downstream analysis including the construction of a gene-based epistasis network. We show that our approach is not only able to recover known IBD associated genes but also additional genes of interest linked to this difficult gastrointestinal disease.},
   author = {Walakira, Andrew and Ocira, Junior and Duroux, Diane and Fouladi, Ramouna and Mo{\v{s}}kon, Miha and Rozman, Damjana and Van Steen, Kristel},
   doi = {10.1186/s12859-022-04580-7},
   issn = {1471-2105},
   number = {1},
   journal = {BMC Bioinformatics},
   keywords = {Bivariate synergy,Diffusion kernel principal components,Gene epistasis network,Inflammatory bowel disease,Spike and slab priors},
   month = dec,
   pmid = {35105309},
   publisher = {BioMed Central Ltd},
   title = {Detecting gene--gene interactions from {GWAS} using diffusion kernel principal components},
   volume = {23},
   year = {2022},
}
% Removed: exact duplicate of the Walakira2022 entry defined immediately above (BibTeX reports "repeated entry" for a reused key).
@article{Visona2024,
   abstract = {Motivation: Genome-wide association studies (GWAS) have enabled large-scale analysis of the role of genetic variants in human disease. Despite impressive methodological advances, subsequent clinical interpretation and application remains challenging when GWAS suffer from a lack of statistical power. In recent years, however, the use of information diffusion algorithms with molecular networks has led to fruitful insights on disease genes. Results: We present an overview of the design choices and pitfalls that prove crucial in the application of network propagation methods to GWAS summary statistics. We highlight general trends from the literature, and present benchmark experiments to expand on these insights selecting as case study three diseases and five molecular networks. We verify that the use of gene-level scores based on GWAS P-values offers advantages over the selection of a set of ‘seed’ disease genes not weighted by the associated P-values if the GWAS summary statistics are of sufficient quality. Beyond that, the size and the density of the networks prove to be important factors for consideration. Finally, we explore several ensemble methods and show that combining multiple networks may improve the network propagation approach.},
   author = {Vison{\`a}, Giovanni and Bouzigon, Emmanuelle and Demenais, Florence and Schweikert, Gabriele},
   doi = {10.1093/bib/bbae014},
   issn = {1477-4054},
   number = {2},
   journal = {Briefings in Bioinformatics},
   keywords = {GWAS,disease gene,molecular network,network propagation},
   month = mar,
   pmid = {38340090},
   publisher = {Oxford University Press},
   title = {Network propagation for {GWAS} analysis: a practical guide to leveraging molecular networks for disease gene discovery},
   volume = {25},
   year = {2024},
}
@article{Yip2018,
   abstract = {Motivation Individual genetic variants explain only a small fraction of heritability in some diseases. Some variants have weak marginal effects on disease risk, but their joint effects are significantly stronger when occurring together. Most studies on such epistatic interactions have focused on methods for identifying the interactions and interpreting individual cases, but few have explored their general functional basis. This was due to the lack of a comprehensive list of epistatic interactions and uncertainties in associating variants to genes. Results We conducted a large-scale survey of published research articles to compile the first comprehensive list of epistatic interactions in human diseases with detailed annotations. We used various methods to associate these variants to genes to ensure robustness. We found that these genes are significantly more connected in protein interaction networks, are more co-expressed and participate more often in the same pathways. We demonstrate using the list to discover novel disease pathways.},
   author = {Yip, Danny Kit Sang and Chan, Landon L. and Pang, Iris K. and Jiang, Wei and Tang, Nelson L. S. and Yu, Weichuan and Yip, Kevin Y.},
   doi = {10.1093/bioinformatics/bty005},
   issn = {1460-2059},
   number = {10},
   journal = {Bioinformatics},
   month = may,
   pages = {1741--1749},
   pmid = {29329369},
   publisher = {Oxford University Press},
   title = {A network approach to exploring the functional basis of gene-gene epistatic interactions in disease susceptibility},
   volume = {34},
   year = {2018},
}
@article{Chang2020,
   abstract = {Background: Genome-wide association studies (GWAS) provide a powerful means to identify associations between genetic variants and phenotypes. However, GWAS techniques for detecting epistasis, the interactions between genetic variants associated with phenotypes, are still limited. We believe that developing an efficient and effective GWAS method to detect epistasis will be a key for discovering sophisticated pathogenesis, which is especially important for complex diseases such as Alzheimer's disease (AD). Results: In this regard, this study presents GenEpi, a computational package to uncover epistasis associated with phenotypes by the proposed machine learning approach. GenEpi identifies both within-gene and cross-gene epistasis through a two-stage modeling workflow. In both stages, GenEpi adopts two-element combinatorial encoding when producing features and constructs the prediction models by L1-regularized regression with stability selection. The simulated data showed that GenEpi outperforms other widely-used methods on detecting the ground-Truth epistasis. As real data is concerned, this study uses AD as an example to reveal the capability of GenEpi in finding disease-related variants and variant interactions that show both biological meanings and predictive power. Conclusions: The results on simulation data and AD demonstrated that GenEpi has the ability to detect the epistasis associated with phenotypes effectively and efficiently. The released package can be generalized to largely facilitate the studies of many complex diseases in the near future.},
   author = {Chang, Yu Chuan and Wu, June Tai and Hong, Ming Yi and Tung, Yi An and Hsieh, Ping Han and Yee, Sook Wah and Giacomini, Kathleen M. and Oyang, Yen Jen and Chen, Chien Yu},
   doi = {10.1186/s12859-020-3368-2},
   issn = {1471-2105},
   number = {1},
   journal = {BMC Bioinformatics},
   keywords = {Epistasis,GWAS,Machine learning},
   month = feb,
   pmid = {32093643},
   publisher = {BioMed Central Ltd.},
   title = {{GenEpi}: Gene-based epistasis discovery using machine learning},
   volume = {21},
   year = {2020},
}
@article{Sabik2021,
   abstract = {This protocol describes the application of the “omnigenic” model of the genetic architecture of complex traits to identify novel “core” genes influencing a disease-associated phenotype. Core genes are hypothesized to directly regulate disease and may serve as therapeutic targets. This protocol leverages GWAS data, a co-expression network, and publicly available data, including the GTEx database and the International Mouse Phenotyping Consortium Database, to identify modules enriched for genes with “core-like” characteristics. For complete details on the use and execution of this protocol, please refer to Sabik et al. (2020).},
   author = {Sabik, Olivia L. and Ackert-Bicknell, Cheryl L. and Farber, Charles R.},
   doi = {10.1016/j.xpro.2021.100768},
   issn = {2666-1667},
   journal = {STAR Protocols},
   keywords = {Bioinformatics,Genetics,Genomics,RNAseq,Systems biology},
   month = sep,
   number = {3},
   pages = {100768},
   pmid = {34467232},
   publisher = {Cell Press},
   title = {A computational approach for identification of core modules from a co-expression network and {GWAS} data},
   volume = {2},
   year = {2021},
}
@article{Boolchandani2019,
   abstract = {Antimicrobial resistance extracts high morbidity, mortality and economic costs yearly by rendering bacteria immune to antibiotics. Identifying and understanding antimicrobial resistance are imperative for clinical practice to treat resistant infections and for public health efforts to limit the spread of resistance. Technologies such as next-generation sequencing are expanding our abilities to detect and study antimicrobial resistance. This Review provides a detailed overview of antimicrobial resistance identification and characterization methods, from traditional antimicrobial susceptibility testing to recent deep-learning methods. We focus on sequencing-based resistance discovery and discuss tools and databases used in antimicrobial resistance studies.},
   author = {Boolchandani, Manish and D'Souza, Alaric W. and Dantas, Gautam},
   doi = {10.1038/s41576-019-0108-4},
   issn = {1471-0064},
   journal = {Nature Reviews Genetics},
   month = jun,
   number = {6},
   pages = {356--370},
   pmid = {30886350},
   publisher = {Nature Publishing Group},
   title = {Sequencing-based methods and resources to study antimicrobial resistance},
   volume = {20},
   year = {2019},
}
@article{Phillips2008,
   abstract = {Epistasis, or interactions between genes, has long been recognized as fundamentally important to understanding the structure and function of genetic pathways and the evolutionary dynamics of complex genetic systems. With the advent of high-throughput functional genomics and the emergence of systems approaches to biology, as well as a new-found ability to pursue the genetic basis of evolution down to specific molecular changes, there is a renewed appreciation both for the importance of studying gene interactions and for addressing these questions in a unified, quantitative manner. © 2008 Macmillan Publishers Limited. All rights reserved.},
   author = {Phillips, Patrick C.},
   doi = {10.1038/nrg2452},
   issn = {1471-0056},
   journal = {Nature Reviews Genetics},
   month = nov,
   number = {11},
   pages = {855--867},
   pmid = {18852697},
   title = {Epistasis --- the essential role of gene interactions in the structure and evolution of genetic systems},
   volume = {9},
   year = {2008},
}
@article{McArthur2015,
   abstract = {Antimicrobial resistance is a global health challenge and has an evolutionary trajectory ranging from proto-resistance in the environment to untreatable clinical pathogens. Resistance is not static, as pathogenic strains can move among patient populations and individual resistance genes can move among pathogens. Effective treatment of resistant infections, antimicrobial stewardship, and new drug discovery increasingly rely upon genotype information, powered by decreasing costs of DNA sequencing. These new approaches will require advances in microbial informatics, particularly in development of reference databases of molecular determinants such as our Comprehensive Antibiotic Resistance Database and clinical metadata, new algorithms for prediction of resistome and resistance phenotype from genotype, and new protocols for global collection and sharing of high-throughput molecular epidemiology data.},
   author = {McArthur, Andrew G. and Wright, Gerard D.},
   doi = {10.1016/j.mib.2015.07.004},
   issn = {1879-0364},
   journal = {Current Opinion in Microbiology},
   month = oct,
   pages = {45--50},
   pmid = {26241506},
   publisher = {Elsevier Ltd},
   title = {Bioinformatics of antimicrobial resistance in the age of molecular epidemiology},
   volume = {27},
   year = {2015},
}
@inproceedings{Tsang2018,
   abstract = {Interpreting neural networks is a crucial and challenging task in machine learning. In this paper, we develop a novel framework for detecting statistical interactions captured by a feedforward multilayer neural network by directly interpreting its learned weights. Depending on the desired interactions, our method can achieve significantly better or similar interaction detection performance compared to the state-of-the-art without searching an exponential solution space of possible interactions. We obtain this accuracy and efficiency by observing that interactions between input features are created by the non-additive effect of nonlinear activation functions, and that interacting paths are encoded in weight matrices. We demonstrate the performance of our method and the importance of discovered interactions via experimental results on both synthetic datasets and real-world application datasets.},
   author = {Tsang, Michael and Cheng, Dehua and Liu, Yan},
   booktitle = {International Conference on Learning Representations},
   title = {Detecting Statistical Interactions from Neural Network Weights},
   year = {2018},
}
@article{AlAamri2019,
   abstract = {Background: Understanding the genetic networks and their role in chronic diseases (e.g., cancer) is one of the important objectives of biological researchers. In this work, we present a text mining system that constructs a gene-gene-interaction network for the entire human genome and then performs network analysis to identify disease-related genes. We recognize the interacting genes based on their co-occurrence frequency within the biomedical literature and by employing linear and non-linear rare-event classification models. We analyze the constructed network of genes by using different network centrality measures to decide on the importance of each gene. Specifically, we apply betweenness, closeness, eigenvector, and degree centrality metrics to rank the central genes of the network and to identify possible cancer-related genes. Results: We evaluated the top 15 ranked genes for different cancer types (i.e., Prostate, Breast, and Lung Cancer). The average precisions for identifying breast, prostate, and lung cancer genes vary between 80-100%. On a prostate case study, the system predicted an average of 80% prostate-related genes. Conclusions: The results show that our system has the potential for improving the prediction accuracy of identifying gene-gene interaction and disease-gene associations. We also conduct a prostate cancer case study by using the threshold property in logistic regression, and we compare our approach with some of the state-of-the-art methods.},
   author = {Al-Aamri, Amira and Taha, Kamal and Al-Hammadi, Yousof and Maalouf, Maher and Homouz, Dirar},
   doi = {10.1186/s12859-019-2634-7},
   issn = {1471-2105},
   journal = {BMC Bioinformatics},
   keywords = {Biological NLP,Biomedical literature,Disease-gene association,Genetic network,Text mining},
   month = dec,
   number = {1},
   pages = {70},
   publisher = {BioMed Central Ltd.},
   title = {Analyzing a co-occurrence gene-interaction network to identify disease-gene association},
   url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2634-7},
   volume = {20},
   year = {2019},
}
@article{Mckinney2006,
   abstract = {disease aetiology. There is a growing body of evidence to suggest that complex interactions are 'the norm' and, rather than amounting to a small perturbation to classical Mendelian genetics, interactions may be the predominant effect. Traditional statistical methods are not well suited for detecting such interactions, especially when the data are high dimensional (many attributes or independent variables) or when interactions occur between more than two polymorphisms. In this review, we discuss machine-learning models and algorithms for identifying and characterising susceptibility genes in common, complex, multifactorial human diseases. We focus on the following machine-learning methods that have been used to detect gene-gene interactions: neural networks, cellular automata, random forests, and multifactor dimensionality reduction. We conclude with some ideas about how these methods and others can be integrated into a comprehensive and flexible framework for data mining and knowledge discovery in human genetics.},
   author = {McKinney, Brett A. and Reif, David M. and Ritchie, Marylyn D. and Moore, Jason H.},
   journal = {Applied Bioinformatics},
   number = {2},
   pages = {77--88},
   title = {Machine Learning for Detecting Gene-Gene Interactions: A Review},
   volume = {5},
   year = {2006},
}
@article{Peng2020,
   abstract = {Background: Drug-target interaction prediction is of great significance for narrowing down the scope of candidate medications, and thus is a vital step in drug discovery. Because of the particularity of biochemical experiments, the development of new drugs is not only costly, but also time-consuming. Therefore, the computational prediction of drug target interactions has become an essential way in the process of drug discovery, aiming to greatly reducing the experimental cost and time. Results: We propose a learning-based method based on feature representation learning and deep neural network named DTI-CNN to predict the drug-target interactions. We first extract the relevant features of drugs and proteins from heterogeneous networks by using the Jaccard similarity coefficient and restart random walk model. Then, we adopt a denoising autoencoder model to reduce the dimension and identify the essential features. Third, based on the features obtained from last step, we constructed a convolutional neural network model to predict the interaction between drugs and proteins. The evaluation results show that the average AUROC score and AUPR score of DTI-CNN were 0.9416 and 0.9499, which obtains better performance than the other three existing state-of-the-art methods. Conclusions: All the experimental results show that the performance of DTI-CNN is better than that of the three existing methods and the proposed method is appropriately designed.},
   author = {Peng, Jiajie and Li, Jingyi and Shang, Xuequn},
   doi = {10.1186/s12859-020-03677-1},
   issn = {1471-2105},
   journal = {BMC Bioinformatics},
   keywords = {Convolutional neural network,DTIs prediction,Feature representation learning},
   month = sep,
   pages = {394},
   pmid = {32938374},
   publisher = {BioMed Central Ltd},
   title = {A learning-based method for drug-target interaction prediction based on feature representation learning and deep neural network},
   volume = {21},
   year = {2020},
}
@inproceedings{Liu2020,
   abstract = {Detecting statistical interactions between input features is a crucial and challenging task. Recent advances demonstrate that it is possible to extract learned interactions from trained neural networks. It has also been observed that, in neural networks, any interacting features must follow a strongly weighted connection to common hidden units. Motivated by the observation, in this paper, we propose to investigate the interaction detection problem from a novel topological perspective by analyzing the connectivity in neural networks. Specially, we propose a new measure for quantifying interaction strength, based upon the well-received theory of persistent homology. Based on this measure, a Persistence Interaction Detection (PID) algorithm is developed to efficiently detect interactions. Our proposed algorithm is evaluated across a number of interaction detection tasks on several synthetic and real world datasets with different hyperparameters. Experimental results validate that the PID algorithm outperforms the state-of-the-art baselines.},
   author = {Liu, Zirui and Song, Qingquan and Zhou, Kaixiong and Wang, Ting-Hsiang and Shan, Ying and Hu, Xia},
   booktitle = {Advances in Neural Information Processing Systems},
   title = {Towards Interaction Detection Using Topological Analysis on Neural Networks},
   volume = {33},
   year = {2020},
}
@article{Wei2014,
   abstract = {Genome-wide association studies (GWASs) have become the focus of the statistical analysis of complex traits in humans, successfully shedding light on several aspects of genetic architecture and biological aetiology. Single-nucleotide polymorphisms (SNPs) are usually modelled as having additive, cumulative and independent effects on the phenotype. Although evidently a useful approach, it is often argued that this is not a realistic biological model and that epistasis (that is, the statistical interaction between SNPs) should be included. The purpose of this Review is to summarize recent directions in methodology for detecting epistasis and to discuss evidence of the role of epistasis in human complex trait variation. We also discuss the relevance of epistasis in the context of GWASs and potential hazards in the interpretation of statistical interaction terms.},
   author = {Wei, Wen Hua and Hemani, Gibran and Haley, Chris S.},
   doi = {10.1038/nrg3747},
   issn = {1471-0064},
   journal = {Nature Reviews Genetics},
   month = nov,
   number = {11},
   pages = {722--733},
   pmid = {25200660},
   publisher = {Nature Publishing Group},
   title = {Detecting epistasis in human complex traits},
   volume = {15},
   year = {2014},
}
@article{Id2020,
   abstract = {The evolution of antimicrobial resistance (AMR) poses a persistent threat to global public health. Sequencing efforts have already yielded genome sequences for thousands of resistant microbial isolates and require robust computational tools to systematically elucidate the genetic basis for AMR. Here, we present a generalizable machine learning workflow for identifying genetic features driving AMR based on constructing reference strain-agnostic pan-genomes and training random subspace ensembles (RSEs). This workflow was applied to the resistance profiles of 14 antimicrobials across three urgent threat pathogens encompassing 288 Staphylococcus aureus, 456 Pseudomonas aeruginosa, and 1588 Escherichia coli genomes. We find that feature selection by RSE detects known AMR associations more reliably than common statistical tests and previous ensemble approaches, identifying a total of 45 known AMR-conferring genes and alleles across the three organisms, as well as 25 candidate associations backed by domain-level annotations. Furthermore, we find that results from the RSE approach are consistent with existing understanding of fluoroquinolone (FQ) resistance due to mutations in the main drug targets, gyrA and parC, in all three organisms, and suggest the mutational landscape of those genes with respect to FQ resistance is simple. As larger datasets become available, we expect this approach to more reliably predict AMR determinants for a wider range of microbial pathogens.},
   author = {Hyun, Jason C. and Kavvas, Erol S. and Monk, Jonathan M. and Palsson, Bernhard O.},
   doi = {10.1371/journal.pcbi.1007608},
   journal = {PLoS Computational Biology},
   number = {3},
   pages = {e1007608},
   publisher = {Public Library of Science},
   title = {Machine learning with random subspace ensembles identifies antimicrobial resistance determinants from pan-genomes of three pathogens},
   url = {https://doi.org/10.1371/journal.pcbi.1007608},
   volume = {16},
   year = {2020},
}