#! /usr/bin/env python3
#
# Copyright 2023 Marcel Bollmann
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Usage: make_journals_explicit.py [--importdir=DIR] [-c | --commit] [--debug]

Options:
  --importdir=DIR          Directory to import XML files from.
                             [default: {scriptdir}/../data/]
  -c, --commit             Commit (=write) the changes to the XML files;
                             will only do a dry run otherwise.
  --debug                  Output debug-level log messages.
  -h, --help               Display this helpful text.
"""

import logging as log
import os
import re
from pathlib import Path
from typing import Optional

import lxml.etree as ET
from docopt import docopt

from anthology import Anthology
from anthology.formatter import MarkupFormatter
from anthology.utils import (
    SeverityTracker,
    build_anthology_id,
    make_simple_element,
    indent,
    is_newstyle_id,
)


### Copy-pasted from data.py and utils.py

JOURNAL_IDS = ("cl", "tacl", "tal", "lilt", "ijclclp")


def is_journal(anthology_id):
    """Return True if the given Anthology ID belongs to a journal.

    :param anthology_id: A full Anthology ID or a bare collection ID.
    :return: Whether the ID's venue (new-style IDs) or its leading letter
        (old-style IDs) marks it as a journal.
    """
    if is_newstyle_id(anthology_id):
        # TODO: this function is sometimes called with "full_id", sometimes with
        # "collection_id", so we're not using `deconstruct_anthology_id` here at
        # the moment
        venue = anthology_id.split("-")[0].split(".")[-1]
        # TODO: this is currently hard-coded, but should be moved to the XML representation
        return venue in JOURNAL_IDS
    else:
        return anthology_id[0] in ("J", "Q")


def match_volume_and_issue(booktitle) -> tuple[Optional[str], Optional[str]]:
    """Parses a volume name and issue name from a title.

    Examples:
    - Computational Linguistics, Volume 26, Number 1, March 2000
    - Traitement Automatique des Langues 2011 Volume 52 Numéro 1

    :param booktitle: The booktitle
    :return: the volume and issue numbers; each is None when the title does
        not contain a matching "Volume ..." / "Number|Numéro|Issue ..." part
    """
    volume_match = re.search(r"Volume\s*(\d+)", booktitle, flags=re.IGNORECASE)
    volume_no = volume_match.group(1) if volume_match is not None else None

    issue_match = re.search(
        r"(Number|Numéro|Issue)\s*(\d+-?\d*)", booktitle, flags=re.IGNORECASE
    )
    issue_no = issue_match.group(2) if issue_match is not None else None

    return volume_no, issue_no


def get_journal_info(
    top_level_id, volume_title
) -> tuple[Optional[str], Optional[str], Optional[str]]:
    """Returns info about the journal: title, volume no., and issue no.
    Currently (Feb 2023), this information is parsed from the booktitle!
    We should move instead to an explicit representation. See

        https://github.com/acl-org/acl-anthology/issues/2379

    :param top_level_id: The collection ID (currently unused, see note below)
    :param volume_title: The text from the booktitle
    :return: The journal title, volume number, and issue number; any of them
        may be None
    """
    # NOTE(review): this copy contains no per-journal title lookup, so the
    # journal title is always None here and `top_level_id` is not consulted.
    # Restore the lookup from the original function if explicit titles are
    # needed -- TODO confirm.
    journal_title = None

    # The function previously fell off the end without returning, which made
    # the caller's three-way unpacking fail; parse what the title offers and
    # always hand back a 3-tuple.
    volume_no, issue_no = match_volume_and_issue(volume_title)

    return journal_title, volume_no, issue_no


### End copy-paste


formatter = MarkupFormatter()


def fix_journals(srcdir, commit=False):
    """Add an explicit ``type`` attribute (and journal metadata) to volumes.

    Every ``<volume>`` under ``{srcdir}/xml/*.xml`` that has a ``<meta>``
    child and no ``type`` attribute yet is marked as "journal" or
    "proceedings".  For journal volumes, the volume and issue numbers parsed
    from the booktitle are added to ``<meta>`` as ``<journal-volume>`` and
    ``<journal-issue>``.

    :param srcdir: Data directory that contains the ``xml/`` subdirectory.
    :param commit: If True, write the modified trees back to their files;
        otherwise nothing is written (dry run).
    """
    for xml_file in sorted(Path(srcdir).glob("xml/*.xml")):
        tree = ET.parse(str(xml_file))
        root = tree.getroot()
        collection_id = root.get("id")
        journal = is_journal(collection_id)
        volume_type = "journal" if journal else "proceedings"

        for meta in root.findall("./volume/meta"):
            volume = meta.getparent()
            if volume.get("type") is not None:
                # Already explicit; leave hand-curated values alone.
                continue

            volume.set("type", volume_type)
            if not journal:
                continue

            volume_id = build_anthology_id(collection_id, volume.get("id"))
            xml_booktitle = meta.find("booktitle")
            if xml_booktitle is None:
                # Logged as an error so the severity tracker flags the run.
                log.error(f"{volume_id}: journal volume has no <booktitle>")
                continue
            booktitle = formatter.as_text(xml_booktitle)

            title, volume_no, issue_no = get_journal_info(collection_id, booktitle)
            log.debug(
                f"{volume_id}: title={title!r} volume={volume_no!r} issue={issue_no!r}"
            )
            if volume_no is not None:
                make_simple_element("journal-volume", text=volume_no, parent=meta)
            if issue_no is not None:
                make_simple_element("journal-issue", text=issue_no, parent=meta)
            # Only this title is written explicitly.
            if title == "American Journal of Computational Linguistics":
                make_simple_element("journal-title", text=title, parent=meta)

        if commit:
            indent(root)
            tree.write(str(xml_file), encoding="UTF-8", xml_declaration=True)


if __name__ == "__main__":
    args = docopt(__doc__)
    scriptdir = os.path.dirname(os.path.abspath(__file__))
    # The default in the usage text carries a placeholder for the script's
    # own directory; resolve it to an absolute path.
    if "{scriptdir}" in args["--importdir"]:
        args["--importdir"] = os.path.abspath(
            args["--importdir"].format(scriptdir=scriptdir)
        )

    log_level = log.DEBUG if args["--debug"] else log.INFO
    log.basicConfig(format="%(levelname)-8s %(message)s", level=log_level)
    tracker = SeverityTracker()
    log.getLogger().addHandler(tracker)

    fix_journals(args["--importdir"], commit=bool(args["--commit"]))

    if not args["--commit"]:
        if tracker.highest >= log.ERROR:
            log.warning(
                "There were errors! Please check them carefully before re-running this script with -c/--commit."
            )
        else:
            log.info(
                "Re-run this script with -c/--commit to save these changes to the XML files."
            )

    if tracker.highest >= log.ERROR:
        raise SystemExit(1)
NLP for Building Educational Applications (BEA 2023) EkaterinaKochmar diff --git a/data/xml/2023.bionlp.xml b/data/xml/2023.bionlp.xml index 0fb36dd4ca..4cfd4e94fb 100644 --- a/data/xml/2023.bionlp.xml +++ b/data/xml/2023.bionlp.xml @@ -1,6 +1,6 @@ - + The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks DinaDemner-fushman diff --git a/data/xml/2023.codi.xml b/data/xml/2023.codi.xml index 8cc112165d..35a5db7e82 100644 --- a/data/xml/2023.codi.xml +++ b/data/xml/2023.codi.xml @@ -1,6 +1,6 @@ - + Proceedings of the 4th Workshop on Computational Approaches to Discourse (CODI 2023) MichaelStrube diff --git a/data/xml/2023.disrpt.xml b/data/xml/2023.disrpt.xml index 4292f16313..acfadfabf0 100644 --- a/data/xml/2023.disrpt.xml +++ b/data/xml/2023.disrpt.xml @@ -1,6 +1,6 @@ - + Proceedings of the 3rd Shared Task on Discourse Relation Parsing and Treebanking (DISRPT 2023) ChloéBraudIrit, Cnrs diff --git a/data/xml/2023.findings.xml b/data/xml/2023.findings.xml index 61b8f4877a..2711bbe9aa 100644 --- a/data/xml/2023.findings.xml +++ b/data/xml/2023.findings.xml @@ -2257,7 +2257,7 @@ vijjini-etal-2023-curricular - + Findings of the Association for Computational Linguistics: ACL 2023 AnnaRogers diff --git a/data/xml/2023.iwslt.xml b/data/xml/2023.iwslt.xml index 12ebc88e65..6400151d5e 100644 --- a/data/xml/2023.iwslt.xml +++ b/data/xml/2023.iwslt.xml @@ -1,6 +1,6 @@ - + Proceedings of the 20th International Conference on Spoken Language Translation (IWSLT 2023) ElizabethSalesky diff --git a/data/xml/2023.law.xml b/data/xml/2023.law.xml index f4d21ac4d2..31eb1201e4 100644 --- a/data/xml/2023.law.xml +++ b/data/xml/2023.law.xml @@ -1,6 +1,6 @@ - + Proceedings of the 17th Linguistic Annotation Workshop (LAW-XVII) JakobPrange diff --git a/data/xml/2023.semeval.xml b/data/xml/2023.semeval.xml index ddb13649de..b824f4c5a5 100644 --- a/data/xml/2023.semeval.xml +++ b/data/xml/2023.semeval.xml @@ -1,6 +1,6 @@ - + Proceedings of the The 17th 
International Workshop on Semantic Evaluation (SemEval-2023) Atul Kr.Ojha diff --git a/data/xml/2023.wnu.xml b/data/xml/2023.wnu.xml index 5d75d44a28..adc5034363 100644 --- a/data/xml/2023.wnu.xml +++ b/data/xml/2023.wnu.xml @@ -1,6 +1,6 @@ - + Proceedings of the The 5th Workshop on Narrative Understanding NaderAkoury diff --git a/data/xml/2023.woah.xml b/data/xml/2023.woah.xml index 4fa5acbb47..d942581b96 100644 --- a/data/xml/2023.woah.xml +++ b/data/xml/2023.woah.xml @@ -1,6 +1,6 @@ - + The 7th Workshop on Online Abuse and Harms (WOAH) Yi-lingChung