Skip to content

Commit

Permalink
Add type attribute for new proceedings, + script for adding this
Browse files Browse the repository at this point in the history
  • Loading branch information
mbollmann committed Jul 12, 2023
1 parent a356539 commit f89fd4c
Show file tree
Hide file tree
Showing 12 changed files with 185 additions and 16 deletions.
169 changes: 169 additions & 0 deletions bin/make_journals_explicit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#! /usr/bin/env python3
#
# Copyright 2023 Marcel Bollmann <marcel@bollmann.me>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Usage: _make_journals_explicit.py [--importdir=DIR] [-c | --commit] [--debug]
Options:
--importdir=DIR Directory to import XML files from.
[default: {scriptdir}/../data/]
-c, --commit Commit (=write) the changes to the XML files;
will only do a dry run otherwise.
--debug Output debug-level log messages.
-h, --help Display this helpful text.
"""

import logging as log
import os
import re
from pathlib import Path
from typing import Optional

import lxml.etree as ET
from docopt import docopt

from anthology import Anthology
from anthology.formatter import MarkupFormatter
from anthology.utils import SeverityTracker, build_anthology_id, make_simple_element, indent, is_newstyle_id


### Copy-pasted from data.py and utils.py

# Venue parts of new-style collection IDs that `is_journal` treats as journals;
# every other new-style venue is considered proceedings.
JOURNAL_IDS = ("cl", "tacl", "tal", "lilt", "ijclclp")

def is_journal(anthology_id):
    """Tell whether an Anthology ID belongs to a journal.

    :param anthology_id: A collection ID or a full ID, in old or new style
    :return: True if the ID's venue is one of the known journals
    """
    if not is_newstyle_id(anthology_id):
        # Old-style IDs are recognised as journals by their first letter.
        return anthology_id[0] in ("J", "Q")

    # TODO: this function is sometimes called with "full_id", sometimes with
    # "collection_id", so we're not using `deconstruct_anthology_id` here at
    # the moment
    collection_part, _, _ = anthology_id.partition("-")
    venue = collection_part.rsplit(".", 1)[-1]
    # TODO: this is currently hard-coded, but should be moved to the XML representation
    return venue in JOURNAL_IDS


def match_volume_and_issue(booktitle) -> tuple[Optional[str], Optional[str]]:
    """Parses a volume number and issue number from a title.

    Matching is case-insensitive. The volume is the first run of digits after
    "Volume"; the issue is the first run of digits (optionally a range such as
    "2-3") after "Number", "Numéro", or "Issue".

    Examples:
    - <booktitle>Computational Linguistics, Volume 26, Number 1, March 2000</booktitle>
    - <booktitle>Traitement Automatique des Langues 2011 Volume 52 Numéro 1</booktitle>

    :param booktitle: The booktitle
    :return: the volume and issue numbers as strings; each one is None if the
        title does not contain it
    """
    volume_match = re.search(r"Volume\s*(\d+)", booktitle, flags=re.IGNORECASE)
    issue_match = re.search(
        r"(Number|Numéro|Issue)\s*(\d+-?\d*)", booktitle, flags=re.IGNORECASE
    )

    volume_no = volume_match.group(1) if volume_match is not None else None
    # Group 1 is the keyword itself; the number is captured by group 2.
    issue_no = issue_match.group(2) if issue_match is not None else None

    return volume_no, issue_no


def get_journal_info(top_level_id, volume_title) -> tuple[str, str, str]:
    """Returns info about the journal: title, volume no., and issue no.

    Currently (Feb 2023), this information is parsed from the <booktitle> tag!
    We should move instead to an explicit representation. See
    https://github.com/acl-org/acl-anthology/issues/2379

    :param top_level_id: The collection ID
    :param volume_title: The text from the <booktitle> tag
    :return: The journal title, volume number, and issue number
    """
    # NOTE(review): as visible here, the body ends after initialising the three
    # result variables -- nothing is parsed from `volume_title` and there is no
    # `return`, so a call yields None instead of the documented 3-tuple.
    # Presumably the remainder was lost when this was copied; confirm against
    # the original implementation before relying on this function.

    # TODO: consider moving this from code to data (perhaps
    # under <booktitle> in the volume metadata

    top_level_id = top_level_id.split(".")[-1]  # for new-style IDs; is a no-op otherwise

    journal_title = None
    volume_no = None
    issue_no = None

### End copy-paste


# Shared formatter instance; `fix_journals` passes each <booktitle> element
# through its `as_text` method before handing it to `get_journal_info`.
formatter = MarkupFormatter()


def fix_journals(srcdir, commit=False):
    """Make the volume type (and journal metadata) explicit in the XML files.

    Every ``xml/*.xml`` file below *srcdir* is parsed. Each ``<volume>`` that
    has a ``<meta>`` child and no ``type`` attribute yet gets ``type`` set to
    ``"journal"`` or ``"proceedings"``, depending on what `is_journal` says
    about the collection ID. Volumes that already carry a ``type`` are left
    untouched, so re-running the script does not add duplicate elements.

    For journal collections, the volume and issue numbers returned by
    `get_journal_info` are appended to ``<meta>`` as ``<journal-volume>`` and
    ``<journal-issue>`` when they are not None. A ``<journal-title>`` is only
    appended when the title is "American Journal of Computational
    Linguistics".

    :param srcdir: Directory that contains the ``xml/`` folder
    :param commit: If True, each parsed file is re-indented and written back;
        otherwise the changes are only made in memory (dry run)
    """
    for xml_file in Path(srcdir).glob("xml/*.xml"):
        tree = ET.parse(xml_file)
        root = tree.getroot()
        id_ = root.get("id")

        # The type is derived from the collection ID alone, so it is the same
        # for every volume in this file and only needs to be computed once.
        journal = is_journal(id_)
        volume_type = "journal" if journal else "proceedings"

        for meta in root.findall("./volume/meta"):
            volume = meta.getparent()
            if volume.get("type") is not None:
                # Already explicit; skip so nothing is added twice.
                continue

            volume.set("type", volume_type)

            if not journal:
                continue

            booktitle = formatter.as_text(meta.find("booktitle"))
            title, volume_no, issue_no = get_journal_info(id_, booktitle)

            if volume_no is not None:
                make_simple_element("journal-volume", text=volume_no, parent=meta)
            if issue_no is not None:
                make_simple_element("journal-issue", text=issue_no, parent=meta)
            if title == "American Journal of Computational Linguistics":
                make_simple_element("journal-title", text=title, parent=meta)

        if commit:
            indent(root)
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)


if __name__ == "__main__":
    args = docopt(__doc__)

    # The documented default for --importdir contains a "{scriptdir}"
    # placeholder; resolve it relative to the location of this script.
    scriptdir = os.path.dirname(os.path.abspath(__file__))
    if "{scriptdir}" in args["--importdir"]:
        args["--importdir"] = os.path.abspath(
            args["--importdir"].format(scriptdir=scriptdir)
        )

    log_level = log.DEBUG if args["--debug"] else log.INFO
    log.basicConfig(format="%(levelname)-8s %(message)s", level=log_level)
    # The tracker is attached as a log handler; its `highest` attribute is
    # compared against log.ERROR below to decide on the exit status.
    tracker = SeverityTracker()
    log.getLogger().addHandler(tracker)

    fix_journals(args["--importdir"], commit=bool(args["--commit"]))

    if not args["--commit"]:
        if tracker.highest >= log.ERROR:
            log.warning(
                "There were errors! Please check them carefully before re-running this script with -c/--commit."
            )
        else:
            log.info(
                "Re-run this script with -c/--commit to save these changes to the XML files."
            )

    if tracker.highest >= log.ERROR:
        # The bare `exit` name is injected by the `site` module and is missing
        # under `python -S` or in frozen builds; SystemExit is always available.
        raise SystemExit(1)
12 changes: 6 additions & 6 deletions data/xml/2023.acl.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<collection id="2023.acl">
<volume id="long" ingest-date="2023-07-08">
<volume id="long" ingest-date="2023-07-08" type="proceedings">
<meta>
<booktitle>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</booktitle>
<editor><first>Anna</first><last>Rogers</last></editor>
Expand Down Expand Up @@ -11258,7 +11258,7 @@
<bibkey>gao-etal-2023-rarr</bibkey>
</paper>
</volume>
<volume id="short" ingest-date="2023-07-08">
<volume id="short" ingest-date="2023-07-08" type="proceedings">
<meta>
<booktitle>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</booktitle>
<editor><first>Anna</first><last>Rogers</last></editor>
Expand Down Expand Up @@ -13138,7 +13138,7 @@
<bibkey>ai-etal-2023-tecs</bibkey>
</paper>
</volume>
<volume id="demo" ingest-date="2023-07-08">
<volume id="demo" ingest-date="2023-07-08" type="proceedings">
<meta>
<booktitle>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)</booktitle>
<editor><first>Danushka</first><last>Bollegala</last></editor>
Expand Down Expand Up @@ -13943,7 +13943,7 @@
<bibkey>zharikova-etal-2023-deeppavlov</bibkey>
</paper>
</volume>
<volume id="srw" ingest-date="2023-07-10">
<volume id="srw" ingest-date="2023-07-10" type="proceedings">
<meta>
<booktitle>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</booktitle>
<editor><first>Vishakh</first><last>Padmakumar</last></editor>
Expand Down Expand Up @@ -14328,7 +14328,7 @@
<bibkey>raiyan-etal-2023-math</bibkey>
</paper>
</volume>
<volume id="industry" ingest-date="2023-07-08">
<volume id="industry" ingest-date="2023-07-08" type="proceedings">
<meta>
<booktitle>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)</booktitle>
<editor><first>Sunayana</first><last>Sitaram</last></editor>
Expand Down Expand Up @@ -15286,7 +15286,7 @@
<bibkey>sharma-etal-2023-automated</bibkey>
</paper>
</volume>
<volume id="tutorials" ingest-date="2023-07-08">
<volume id="tutorials" ingest-date="2023-07-08" type="proceedings">
<meta>
<booktitle>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 6: Tutorial Abstracts)</booktitle>
<editor><first>Yun-Nung (Vivian)</first><last>Chen</last></editor>
Expand Down
2 changes: 1 addition & 1 deletion data/xml/2023.bea.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<collection id="2023.bea">
<volume id="1" ingest-date="2023-07-08">
<volume id="1" ingest-date="2023-07-08" type="proceedings">
<meta>
<booktitle>Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023)</booktitle>
<editor><first>Ekaterina</first><last>Kochmar</last></editor>
Expand Down
2 changes: 1 addition & 1 deletion data/xml/2023.bionlp.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<collection id="2023.bionlp">
<volume id="1" ingest-date="2023-07-08">
<volume id="1" ingest-date="2023-07-08" type="proceedings">
<meta>
<booktitle>The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks</booktitle>
<editor><first>Dina</first><last>Demner-fushman</last></editor>
Expand Down
2 changes: 1 addition & 1 deletion data/xml/2023.codi.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<collection id="2023.codi">
<volume id="1" ingest-date="2023-07-08">
<volume id="1" ingest-date="2023-07-08" type="proceedings">
<meta>
<booktitle>Proceedings of the 4th Workshop on Computational Approaches to Discourse (CODI 2023)</booktitle>
<editor><first>Michael</first><last>Strube</last></editor>
Expand Down
2 changes: 1 addition & 1 deletion data/xml/2023.disrpt.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<collection id="2023.disrpt">
<volume id="1" ingest-date="2023-07-08">
<volume id="1" ingest-date="2023-07-08" type="proceedings">
<meta>
<booktitle>Proceedings of the 3rd Shared Task on Discourse Relation Parsing and Treebanking (DISRPT 2023)</booktitle>
<editor><first>Chloé</first><last>Braud</last><affiliation>Irit, Cnrs</affiliation></editor>
Expand Down
2 changes: 1 addition & 1 deletion data/xml/2023.findings.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2257,7 +2257,7 @@
<bibkey>vijjini-etal-2023-curricular</bibkey>
</paper>
</volume>
<volume id="acl" ingest-date="2023-07-09">
<volume id="acl" ingest-date="2023-07-09" type="proceedings">
<meta>
<booktitle>Findings of the Association for Computational Linguistics: ACL 2023</booktitle>
<editor><first>Anna</first><last>Rogers</last></editor>
Expand Down
2 changes: 1 addition & 1 deletion data/xml/2023.iwslt.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<collection id="2023.iwslt">
<volume id="1" ingest-date="2023-07-08">
<volume id="1" ingest-date="2023-07-08" type="proceedings">
<meta>
<booktitle>Proceedings of the 20th International Conference on Spoken Language Translation (IWSLT 2023)</booktitle>
<editor><first>Elizabeth</first><last>Salesky</last></editor>
Expand Down
2 changes: 1 addition & 1 deletion data/xml/2023.law.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<collection id="2023.law">
<volume id="1" ingest-date="2023-07-09">
<volume id="1" ingest-date="2023-07-09" type="proceedings">
<meta>
<booktitle>Proceedings of the 17th Linguistic Annotation Workshop (LAW-XVII)</booktitle>
<editor><first>Jakob</first><last>Prange</last></editor>
Expand Down
2 changes: 1 addition & 1 deletion data/xml/2023.semeval.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<collection id="2023.semeval">
<volume id="1" ingest-date="2023-07-09">
<volume id="1" ingest-date="2023-07-09" type="proceedings">
<meta>
<booktitle>Proceedings of the The 17th International Workshop on Semantic Evaluation (SemEval-2023)</booktitle>
<editor><first>Atul Kr.</first><last>Ojha</last></editor>
Expand Down
2 changes: 1 addition & 1 deletion data/xml/2023.wnu.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<collection id="2023.wnu">
<volume id="1" ingest-date="2023-07-09">
<volume id="1" ingest-date="2023-07-09" type="proceedings">
<meta>
<booktitle>Proceedings of the The 5th Workshop on Narrative Understanding</booktitle>
<editor><first>Nader</first><last>Akoury</last></editor>
Expand Down
2 changes: 1 addition & 1 deletion data/xml/2023.woah.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version='1.0' encoding='UTF-8'?>
<collection id="2023.woah">
<volume id="1" ingest-date="2023-07-09">
<volume id="1" ingest-date="2023-07-09" type="proceedings">
<meta>
<booktitle>The 7th Workshop on Online Abuse and Harms (WOAH)</booktitle>
<editor><first>Yi-ling</first><last>Chung</last></editor>
Expand Down

0 comments on commit f89fd4c

Please sign in to comment.