Refactor Python lib to use new XML fields, remove hard-coded data

acl-org · Jul 12, 2023 · fdbfb2c · fdbfb2c
1 parent 10c37ca
commit fdbfb2c
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 121 deletions.
diff --git a/bin/anthology/data.py b/bin/anthology/data.py
@@ -83,9 +83,6 @@
     "booktitle",
 )
 
-# New-style IDs that should be handled as journals
-JOURNAL_IDS = ("cl", "tacl", "tal", "lilt", "ijclclp")
-
 # Constants associated with DOI assignation
 DOI_URL_PREFIX = "https://dx.doi.org/"
 DOI_PREFIX = "10.18653/v1/"
@@ -95,91 +92,3 @@
 
 # The venue format must match this pattern
 VENUE_FORMAT = r"^[a-z\d]+$"
-
-
-def match_volume_and_issue(booktitle) -> Tuple[str, str]:
-    """Parses a volume name and issue name from a title.
-
-    Examples:
-    - <booktitle>Computational Linguistics, Volume 26, Number 1, March 2000</booktitle>
-    - <booktitle>Traitement Automatique des Langues 2011 Volume 52 Numéro 1</booktitle>
-    - <booktitle>Computational Linguistics, Volume 26, Number 1, March 2000</booktitle>
-
-    :param booktitle: The booktitle
-    :return: the volume and issue numbers
-    """
-    volume_no = re.search(r"Volume\s*(\d+)", booktitle, flags=re.IGNORECASE)
-    if volume_no is not None:
-        volume_no = volume_no.group(1)
-
-    issue_no = re.search(
-        r"(Number|Numéro|Issue)\s*(\d+-?\d*)", booktitle, flags=re.IGNORECASE
-    )
-    if issue_no is not None:
-        issue_no = issue_no.group(2)
-
-    return volume_no, issue_no
-
-
-def get_journal_info(top_level_id, volume_title) -> Tuple[str, str, str]:
-    """Returns info about the journal: title, volume no., and issue no.
-    Currently (Feb 2023), this information is parsed from the <booktitle> tag!
-    We should move instead to an explicit representation. See
-
-        https://github.com/acl-org/acl-anthology/issues/2379
-
-    :param top_level_id: The collection ID
-    :param volume_title: The text from the <booktitle> tag
-    :return: The journal title, volume number, and issue number
-    """
-
-    # TODO: consider moving this from code to data (perhaps
-    # under <booktitle> in the volume metadata
-
-    top_level_id = top_level_id.split(".")[-1]  # for new-style IDs; is a no-op otherwise
-
-    journal_title = None
-    volume_no = None
-    issue_no = None
-
-    if top_level_id == "cl":
-        # <booktitle>Computational Linguistics, Volume 26, Number 1, March 2000</booktitle>
-        journal_title = "Computational Linguistics"
-        volume_no, issue_no = match_volume_and_issue(volume_title)
-
-    elif top_level_id == "lilt":
-        # <booktitle>Linguistic Issues in Language Technology, Volume 10, 2015</booktitle>
-        journal_title = "Linguistic Issues in Language Technology"
-        volume_no, _ = match_volume_and_issue(volume_title)
-
-    elif top_level_id == "tal":
-        # <booktitle>Traitement Automatique des Langues 2011 Volume 52 Numéro 1</booktitle>
-        journal_title = "Traitement Automatique des Langues"
-        volume_no, issue_no = match_volume_and_issue(volume_title)
-
-    elif top_level_id == "ijclclp":
-        journal_title = "International Journal of Computational Linguistics & Chinese Language Processing"
-        volume_no, issue_no = match_volume_and_issue(volume_title)
-
-    elif top_level_id == "nejlt":
-        journal_title = "Northern European Journal of Language Technology"
-        volume_no, _ = match_volume_and_issue(volume_title)
-
-    elif top_level_id[0] == "J":
-        # <booktitle>Computational Linguistics, Volume 26, Number 1, March 2000</booktitle>
-        year = int(top_level_id[1:3])
-        if year >= 65 and year <= 83:
-            journal_title = "American Journal of Computational Linguistics"
-        else:
-            journal_title = "Computational Linguistics"
-
-        volume_no, issue_no = match_volume_and_issue(volume_title)
-
-    elif top_level_id[0] == "Q" or top_level_id == "tacl":
-        journal_title = "Transactions of the Association for Computational Linguistics"
-        volume_no, _ = match_volume_and_issue(volume_title)
-
-    else:
-        journal_title = volume_title
-
-    return journal_title, volume_no, issue_no
diff --git a/bin/anthology/papers.py b/bin/anthology/papers.py
@@ -23,7 +23,6 @@
     parse_element,
     infer_url,
     infer_attachment_url,
-    is_journal,
 )
 from . import data
 
@@ -154,7 +153,7 @@ def from_xml(xml_element, *args):
             paper.attrib["xml_title"].tag = "title"
 
         # Remove booktitle for frontmatter and journals
-        if paper.is_volume or is_journal(paper.full_id):
+        if paper.is_volume or paper.parent_volume.is_journal:
             del paper.attrib["xml_booktitle"]
 
         if "editor" in paper.attrib:
@@ -248,7 +247,7 @@ def bibtype(self):
         """Return the BibTeX entry type for this paper."""
         if self.is_volume:
             return "proceedings"
-        elif is_journal(self.full_id):
+        elif self.parent_volume.is_journal:
             return "article"
         else:
             return "inproceedings"
@@ -259,7 +258,7 @@ def csltype(self):
 
         cf. https://docs.citationstyles.org/en/stable/specification.html#appendix-iii-types
         """
-        if is_journal(self.full_id):
+        if self.parent_volume.is_journal:
             return "article-journal"
         elif self.is_volume:
             return "book"
@@ -348,7 +347,7 @@ def as_bibtex(self, concise=False):
                 entries.append(
                     (people, "  and  ".join(p.as_bibtex() for p, _ in self.get(people)))
                 )
-        if is_journal(self.full_id):
+        if self.parent_volume.is_journal:
             entries.append(
                 ("journal", bibtex_encode(self.parent_volume.get("meta_journal_title")))
             )
@@ -402,7 +401,7 @@ def as_citeproc_json(self):
             if "editor" in self.attrib:
                 # or should this be "container-author"/"collection-editor" here?
                 data["editor"] = [p.as_citeproc_json() for p, _ in self.get("editor")]
-            if is_journal(self.full_id):
+            if self.parent_volume.is_journal:
                 data["container-title"] = self.parent_volume.get("meta_journal_title")
                 journal_volume = self.parent_volume.get(
                     "meta_volume", self.parent_volume.get("volume")

diff --git a/bin/anthology/utils.py b/bin/anthology/utils.py
@@ -40,18 +40,6 @@ def is_newstyle_id(anthology_id):
     return anthology_id[0].isdigit()  # New-style IDs are year-first
 
 
-def is_journal(anthology_id):
-    if is_newstyle_id(anthology_id):
-        # TODO: this function is sometimes called with "full_id", sometimes with
-        # "collection_id", so we're not using `deconstruct_anthology_id` here at
-        # the moment
-        venue = anthology_id.split("-")[0].split(".")[-1]
-        # TODO: this is currently hard-coded, but should be moved to the XML representation
-        return venue in data.JOURNAL_IDS
-    else:
-        return anthology_id[0] in ("J", "Q")
-
-
 def is_volume_id(anthology_id):
     collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)
     return paper_id == "0"

diff --git a/bin/anthology/volumes.py b/bin/anthology/volumes.py
@@ -23,7 +23,6 @@
 from .utils import (
     build_anthology_id,
     parse_element,
-    is_journal,
     month_str2num,
     infer_url,
 )
@@ -34,6 +33,7 @@ def __init__(
         self,
         collection_id,
         volume_id,
+        volume_type,
         ingest_date,
         meta_data,
         venue_index: VenueIndex,
@@ -47,10 +47,11 @@ def __init__(
         """
         self.collection_id = collection_id
         self._id = volume_id
+        self._type = volume_type
         self.ingest_date = ingest_date
         self.formatter = formatter
-        self._set_meta_info(meta_data)
         self.venue_index = venue_index
+        self._set_meta_info(meta_data)
         self.attrib["venues"] = meta_data.get("venue", [])
         self.attrib["events"] = meta_data.get("event", [])
         self.attrib["sigs"] = sig_index.get_associated_sigs(self.full_id)
@@ -71,6 +72,7 @@ def from_xml(
         volume_xml, collection_id, venue_index: VenueIndex, sig_index: SIGIndex, formatter
     ):
         volume_id = volume_xml.attrib["id"]
+        volume_type = volume_xml.attrib["type"]
         # The date of publication, defaulting to earlier than anything we'll encounter
         ingest_date = volume_xml.attrib.get("ingest-date", data.UNKNOWN_INGEST_DATE)
         meta_data = parse_element(volume_xml.find("meta"))
@@ -81,6 +83,7 @@ def from_xml(
         volume = Volume(
             collection_id,
             volume_id,
+            volume_type,
             ingest_date,
             meta_data,
             venue_index,
@@ -108,6 +111,10 @@ def pdf(self):
             return infer_url(url, template=data.PDF_LOCATION_TEMPLATE)
         return None
 
+    @property
+    def is_journal(self):
+        return self._type == "journal"
+
     def _set_meta_info(self, meta_data):
         """Derive journal title, volume, and issue no. used in metadata.
 
@@ -120,18 +127,25 @@ def _set_meta_info(self, meta_data):
             month = month_str2num(self.get("month"))
             if month is not None:
                 self.attrib["meta_date"] = f"{self.get('year')}/{month}"
-        if is_journal(self.collection_id):
-            # TODO: This should be explicitly represented in the XML instead of
-            # hardcoding and parsing it.
-            journal_title, volume_no, issue_no = data.get_journal_info(
-                self.collection_id, self.attrib["title"]
-            )
+        if self.is_journal:
+            if "journal-title" in self.attrib:
+                journal_title = self.attrib["journal-title"]
+            else:
+                venues = meta_data.get("venue", [])
+                if len(venues) != 1:
+                    raise ValueError(f"{self.full_id}: journal volume must have exactly one venue or an explicit <journal-title>")
+                journal_title = self.venue_index.get_venue(venues[0])["name"]
 
             self.attrib["meta_journal_title"] = journal_title
-            if volume_no is not None:
-                self.attrib["meta_volume"] = volume_no
-            if issue_no is not None:
-                self.attrib["meta_issue"] = issue_no
+
+            # For compatibility with existing readers of meta_volume/meta_issue,
+            # the XML-derived attributes are renamed here; this may change later.
+            if "journal-volume" in self.attrib:
+                self.attrib["meta_volume"] = self.attrib["journal-volume"]
+                del self.attrib["journal-volume"]
+            if "journal-issue" in self.attrib:
+                self.attrib["meta_issue"] = self.attrib["journal-issue"]
+                del self.attrib["journal-issue"]
 
     @property
     def volume_id(self):