diff --git a/bin/anthology/data.py b/bin/anthology/data.py
index 8bc6b8aa33..8dd41d065d 100644
--- a/bin/anthology/data.py
+++ b/bin/anthology/data.py
@@ -83,9 +83,6 @@
"booktitle",
)
-# New-style IDs that should be handled as journals
-JOURNAL_IDS = ("cl", "tacl", "tal", "lilt", "ijclclp")
-
# Constants associated with DOI assignation
DOI_URL_PREFIX = "https://dx.doi.org/"
DOI_PREFIX = "10.18653/v1/"
@@ -95,91 +92,3 @@
# The venue format must match this pattern
VENUE_FORMAT = r"^[a-z\d]+$"
-
-
-def match_volume_and_issue(booktitle) -> Tuple[str, str]:
- """Parses a volume name and issue name from a title.
-
- Examples:
- - Computational Linguistics, Volume 26, Number 1, March 2000
- - Traitement Automatique des Langues 2011 Volume 52 Numéro 1
- - Computational Linguistics, Volume 26, Number 1, March 2000
-
- :param booktitle: The booktitle
- :return: the volume and issue numbers
- """
- volume_no = re.search(r"Volume\s*(\d+)", booktitle, flags=re.IGNORECASE)
- if volume_no is not None:
- volume_no = volume_no.group(1)
-
- issue_no = re.search(
- r"(Number|Numéro|Issue)\s*(\d+-?\d*)", booktitle, flags=re.IGNORECASE
- )
- if issue_no is not None:
- issue_no = issue_no.group(2)
-
- return volume_no, issue_no
-
-
-def get_journal_info(top_level_id, volume_title) -> Tuple[str, str, str]:
- """Returns info about the journal: title, volume no., and issue no.
- Currently (Feb 2023), this information is parsed from the <booktitle> tag!
- We should move instead to an explicit representation. See
-
- https://github.com/acl-org/acl-anthology/issues/2379
-
- :param top_level_id: The collection ID
- :param volume_title: The text from the <booktitle> tag
- :return: The journal title, volume number, and issue number
- """
-
- # TODO: consider moving this from code to data (perhaps
- # under in the volume metadata
-
- top_level_id = top_level_id.split(".")[-1] # for new-style IDs; is a no-op otherwise
-
- journal_title = None
- volume_no = None
- issue_no = None
-
- if top_level_id == "cl":
- # Computational Linguistics, Volume 26, Number 1, March 2000
- journal_title = "Computational Linguistics"
- volume_no, issue_no = match_volume_and_issue(volume_title)
-
- elif top_level_id == "lilt":
- # Linguistic Issues in Language Technology, Volume 10, 2015
- journal_title = "Linguistic Issues in Language Technology"
- volume_no, _ = match_volume_and_issue(volume_title)
-
- elif top_level_id == "tal":
- # Traitement Automatique des Langues 2011 Volume 52 Numéro 1
- journal_title = "Traitement Automatique des Langues"
- volume_no, issue_no = match_volume_and_issue(volume_title)
-
- elif top_level_id == "ijclclp":
- journal_title = "International Journal of Computational Linguistics & Chinese Language Processing"
- volume_no, issue_no = match_volume_and_issue(volume_title)
-
- elif top_level_id == "nejlt":
- journal_title = "Northern European Journal of Language Technology"
- volume_no, _ = match_volume_and_issue(volume_title)
-
- elif top_level_id[0] == "J":
- # Computational Linguistics, Volume 26, Number 1, March 2000
- year = int(top_level_id[1:3])
- if year >= 65 and year <= 83:
- journal_title = "American Journal of Computational Linguistics"
- else:
- journal_title = "Computational Linguistics"
-
- volume_no, issue_no = match_volume_and_issue(volume_title)
-
- elif top_level_id[0] == "Q" or top_level_id == "tacl":
- journal_title = "Transactions of the Association for Computational Linguistics"
- volume_no, _ = match_volume_and_issue(volume_title)
-
- else:
- journal_title = volume_title
-
- return journal_title, volume_no, issue_no
diff --git a/bin/anthology/papers.py b/bin/anthology/papers.py
index 79ebc8415c..49690e8119 100644
--- a/bin/anthology/papers.py
+++ b/bin/anthology/papers.py
@@ -23,7 +23,6 @@
parse_element,
infer_url,
infer_attachment_url,
- is_journal,
)
from . import data
@@ -154,7 +153,7 @@ def from_xml(xml_element, *args):
paper.attrib["xml_title"].tag = "title"
# Remove booktitle for frontmatter and journals
- if paper.is_volume or is_journal(paper.full_id):
+ if paper.is_volume or paper.parent_volume.is_journal:
del paper.attrib["xml_booktitle"]
if "editor" in paper.attrib:
@@ -248,7 +247,7 @@ def bibtype(self):
"""Return the BibTeX entry type for this paper."""
if self.is_volume:
return "proceedings"
- elif is_journal(self.full_id):
+ elif self.parent_volume.is_journal:
return "article"
else:
return "inproceedings"
@@ -259,7 +258,7 @@ def csltype(self):
cf. https://docs.citationstyles.org/en/stable/specification.html#appendix-iii-types
"""
- if is_journal(self.full_id):
+ if self.parent_volume.is_journal:
return "article-journal"
elif self.is_volume:
return "book"
@@ -348,7 +347,7 @@ def as_bibtex(self, concise=False):
entries.append(
(people, " and ".join(p.as_bibtex() for p, _ in self.get(people)))
)
- if is_journal(self.full_id):
+ if self.parent_volume.is_journal:
entries.append(
("journal", bibtex_encode(self.parent_volume.get("meta_journal_title")))
)
@@ -402,7 +401,7 @@ def as_citeproc_json(self):
if "editor" in self.attrib:
# or should this be "container-author"/"collection-editor" here?
data["editor"] = [p.as_citeproc_json() for p, _ in self.get("editor")]
- if is_journal(self.full_id):
+ if self.parent_volume.is_journal:
data["container-title"] = self.parent_volume.get("meta_journal_title")
journal_volume = self.parent_volume.get(
"meta_volume", self.parent_volume.get("volume")
diff --git a/bin/anthology/utils.py b/bin/anthology/utils.py
index c01e073da6..572e871f93 100644
--- a/bin/anthology/utils.py
+++ b/bin/anthology/utils.py
@@ -40,18 +40,6 @@ def is_newstyle_id(anthology_id):
return anthology_id[0].isdigit() # New-style IDs are year-first
-def is_journal(anthology_id):
- if is_newstyle_id(anthology_id):
- # TODO: this function is sometimes called with "full_id", sometimes with
- # "collection_id", so we're not using `deconstruct_anthology_id` here at
- # the moment
- venue = anthology_id.split("-")[0].split(".")[-1]
- # TODO: this is currently hard-coded, but should be moved to the XML representation
- return venue in data.JOURNAL_IDS
- else:
- return anthology_id[0] in ("J", "Q")
-
-
def is_volume_id(anthology_id):
collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)
return paper_id == "0"
diff --git a/bin/anthology/volumes.py b/bin/anthology/volumes.py
index e53ebb011e..8972179491 100644
--- a/bin/anthology/volumes.py
+++ b/bin/anthology/volumes.py
@@ -23,7 +23,6 @@
from .utils import (
build_anthology_id,
parse_element,
- is_journal,
month_str2num,
infer_url,
)
@@ -34,6 +33,7 @@ def __init__(
self,
collection_id,
volume_id,
+ volume_type,
ingest_date,
meta_data,
venue_index: VenueIndex,
@@ -47,10 +47,11 @@ def __init__(
"""
self.collection_id = collection_id
self._id = volume_id
+ self._type = volume_type
self.ingest_date = ingest_date
self.formatter = formatter
- self._set_meta_info(meta_data)
self.venue_index = venue_index
+ self._set_meta_info(meta_data)
self.attrib["venues"] = meta_data.get("venue", [])
self.attrib["events"] = meta_data.get("event", [])
self.attrib["sigs"] = sig_index.get_associated_sigs(self.full_id)
@@ -71,6 +72,7 @@ def from_xml(
volume_xml, collection_id, venue_index: VenueIndex, sig_index: SIGIndex, formatter
):
volume_id = volume_xml.attrib["id"]
+ volume_type = volume_xml.attrib["type"]
# The date of publication, defaulting to earlier than anything we'll encounter
ingest_date = volume_xml.attrib.get("ingest-date", data.UNKNOWN_INGEST_DATE)
meta_data = parse_element(volume_xml.find("meta"))
@@ -81,6 +83,7 @@ def from_xml(
volume = Volume(
collection_id,
volume_id,
+ volume_type,
ingest_date,
meta_data,
venue_index,
@@ -108,6 +111,10 @@ def pdf(self):
return infer_url(url, template=data.PDF_LOCATION_TEMPLATE)
return None
+ @property
+ def is_journal(self):  # True when this volume's type (the XML "type" attribute read in from_xml) is "journal"
+ return self._type == "journal"
+
def _set_meta_info(self, meta_data):
"""Derive journal title, volume, and issue no. used in metadata.
@@ -120,18 +127,25 @@ def _set_meta_info(self, meta_data):
month = month_str2num(self.get("month"))
if month is not None:
self.attrib["meta_date"] = f"{self.get('year')}/{month}"
- if is_journal(self.collection_id):
- # TODO: This should be explicitly represented in the XML instead of
- # hardcoding and parsing it.
- journal_title, volume_no, issue_no = data.get_journal_info(
- self.collection_id, self.attrib["title"]
- )
+ if self.is_journal:
+ if "journal-title" in self.attrib:
+ journal_title = self.attrib["journal-title"]
+ else:
+ venues = meta_data.get("venue", [])
+ if len(venues) != 1:
+ raise ValueError(f"{self.full_id}: journal volume must have exactly one venue or an explicit <journal-title>")
+ journal_title = self.venue_index.get_venue(venues[0])["name"]
self.attrib["meta_journal_title"] = journal_title
- if volume_no is not None:
- self.attrib["meta_volume"] = volume_no
- if issue_no is not None:
- self.attrib["meta_issue"] = issue_no
+
+ # For compatibility reasons, we rename the attributes here; might
+ # change this later
+ if "journal-volume" in self.attrib:
+ self.attrib["meta_volume"] = self.attrib["journal-volume"]
+ del self.attrib["journal-volume"]
+ if "journal-issue" in self.attrib:
+ self.attrib["meta_issue"] = self.attrib["journal-issue"]
+ del self.attrib["journal-issue"]
@property
def volume_id(self):