diff --git a/bin/anthology/data.py b/bin/anthology/data.py index 8bc6b8aa33..8dd41d065d 100644 --- a/bin/anthology/data.py +++ b/bin/anthology/data.py @@ -83,9 +83,6 @@ "booktitle", ) -# New-style IDs that should be handled as journals -JOURNAL_IDS = ("cl", "tacl", "tal", "lilt", "ijclclp") - # Constants associated with DOI assignation DOI_URL_PREFIX = "https://dx.doi.org/" DOI_PREFIX = "10.18653/v1/" @@ -95,91 +92,3 @@ # The venue format must match this pattern VENUE_FORMAT = r"^[a-z\d]+$" - - -def match_volume_and_issue(booktitle) -> Tuple[str, str]: - """Parses a volume name and issue name from a title. - - Examples: - - Computational Linguistics, Volume 26, Number 1, March 2000 - - Traitement Automatique des Langues 2011 Volume 52 Numéro 1 - - Computational Linguistics, Volume 26, Number 1, March 2000 - - :param booktitle: The booktitle - :return: the volume and issue numbers - """ - volume_no = re.search(r"Volume\s*(\d+)", booktitle, flags=re.IGNORECASE) - if volume_no is not None: - volume_no = volume_no.group(1) - - issue_no = re.search( - r"(Number|Numéro|Issue)\s*(\d+-?\d*)", booktitle, flags=re.IGNORECASE - ) - if issue_no is not None: - issue_no = issue_no.group(2) - - return volume_no, issue_no - - -def get_journal_info(top_level_id, volume_title) -> Tuple[str, str, str]: - """Returns info about the journal: title, volume no., and issue no. - Currently (Feb 2023), this information is parsed from the tag! - We should move instead to an explicit representation. 
See - - https://github.com/acl-org/acl-anthology/issues/2379 - - :param top_level_id: The collection ID - :param volume_title: The text from the tag - :return: The journal title, volume number, and issue number - """ - - # TODO: consider moving this from code to data (perhaps - # under in the volume metadata - - top_level_id = top_level_id.split(".")[-1] # for new-style IDs; is a no-op otherwise - - journal_title = None - volume_no = None - issue_no = None - - if top_level_id == "cl": - # Computational Linguistics, Volume 26, Number 1, March 2000 - journal_title = "Computational Linguistics" - volume_no, issue_no = match_volume_and_issue(volume_title) - - elif top_level_id == "lilt": - # Linguistic Issues in Language Technology, Volume 10, 2015 - journal_title = "Linguistic Issues in Language Technology" - volume_no, _ = match_volume_and_issue(volume_title) - - elif top_level_id == "tal": - # Traitement Automatique des Langues 2011 Volume 52 Numéro 1 - journal_title = "Traitement Automatique des Langues" - volume_no, issue_no = match_volume_and_issue(volume_title) - - elif top_level_id == "ijclclp": - journal_title = "International Journal of Computational Linguistics & Chinese Language Processing" - volume_no, issue_no = match_volume_and_issue(volume_title) - - elif top_level_id == "nejlt": - journal_title = "Northern European Journal of Language Technology" - volume_no, _ = match_volume_and_issue(volume_title) - - elif top_level_id[0] == "J": - # Computational Linguistics, Volume 26, Number 1, March 2000 - year = int(top_level_id[1:3]) - if year >= 65 and year <= 83: - journal_title = "American Journal of Computational Linguistics" - else: - journal_title = "Computational Linguistics" - - volume_no, issue_no = match_volume_and_issue(volume_title) - - elif top_level_id[0] == "Q" or top_level_id == "tacl": - journal_title = "Transactions of the Association for Computational Linguistics" - volume_no, _ = match_volume_and_issue(volume_title) - - else: - journal_title 
= volume_title - - return journal_title, volume_no, issue_no diff --git a/bin/anthology/papers.py b/bin/anthology/papers.py index 79ebc8415c..49690e8119 100644 --- a/bin/anthology/papers.py +++ b/bin/anthology/papers.py @@ -23,7 +23,6 @@ parse_element, infer_url, infer_attachment_url, - is_journal, ) from . import data @@ -154,7 +153,7 @@ def from_xml(xml_element, *args): paper.attrib["xml_title"].tag = "title" # Remove booktitle for frontmatter and journals - if paper.is_volume or is_journal(paper.full_id): + if paper.is_volume or paper.parent_volume.is_journal: del paper.attrib["xml_booktitle"] if "editor" in paper.attrib: @@ -248,7 +247,7 @@ def bibtype(self): """Return the BibTeX entry type for this paper.""" if self.is_volume: return "proceedings" - elif is_journal(self.full_id): + elif self.parent_volume.is_journal: return "article" else: return "inproceedings" @@ -259,7 +258,7 @@ def csltype(self): cf. https://docs.citationstyles.org/en/stable/specification.html#appendix-iii-types """ - if is_journal(self.full_id): + if self.parent_volume.is_journal: return "article-journal" elif self.is_volume: return "book" @@ -348,7 +347,7 @@ def as_bibtex(self, concise=False): entries.append( (people, " and ".join(p.as_bibtex() for p, _ in self.get(people))) ) - if is_journal(self.full_id): + if self.parent_volume.is_journal: entries.append( ("journal", bibtex_encode(self.parent_volume.get("meta_journal_title"))) ) @@ -402,7 +401,7 @@ def as_citeproc_json(self): if "editor" in self.attrib: # or should this be "container-author"/"collection-editor" here? 
data["editor"] = [p.as_citeproc_json() for p, _ in self.get("editor")] - if is_journal(self.full_id): + if self.parent_volume.is_journal: data["container-title"] = self.parent_volume.get("meta_journal_title") journal_volume = self.parent_volume.get( "meta_volume", self.parent_volume.get("volume") diff --git a/bin/anthology/utils.py b/bin/anthology/utils.py index c01e073da6..572e871f93 100644 --- a/bin/anthology/utils.py +++ b/bin/anthology/utils.py @@ -40,18 +40,6 @@ def is_newstyle_id(anthology_id): return anthology_id[0].isdigit() # New-style IDs are year-first -def is_journal(anthology_id): - if is_newstyle_id(anthology_id): - # TODO: this function is sometimes called with "full_id", sometimes with - # "collection_id", so we're not using `deconstruct_anthology_id` here at - # the moment - venue = anthology_id.split("-")[0].split(".")[-1] - # TODO: this is currently hard-coded, but should be moved to the XML representation - return venue in data.JOURNAL_IDS - else: - return anthology_id[0] in ("J", "Q") - - def is_volume_id(anthology_id): collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id) return paper_id == "0" diff --git a/bin/anthology/volumes.py b/bin/anthology/volumes.py index e53ebb011e..8972179491 100644 --- a/bin/anthology/volumes.py +++ b/bin/anthology/volumes.py @@ -23,7 +23,6 @@ from .utils import ( build_anthology_id, parse_element, - is_journal, month_str2num, infer_url, ) @@ -34,6 +33,7 @@ def __init__( self, collection_id, volume_id, + volume_type, ingest_date, meta_data, venue_index: VenueIndex, @@ -47,10 +47,11 @@ def __init__( """ self.collection_id = collection_id self._id = volume_id + self._type = volume_type self.ingest_date = ingest_date self.formatter = formatter - self._set_meta_info(meta_data) self.venue_index = venue_index + self._set_meta_info(meta_data) self.attrib["venues"] = meta_data.get("venue", []) self.attrib["events"] = meta_data.get("event", []) self.attrib["sigs"] = 
sig_index.get_associated_sigs(self.full_id) @@ -71,6 +72,7 @@ def from_xml( volume_xml, collection_id, venue_index: VenueIndex, sig_index: SIGIndex, formatter ): volume_id = volume_xml.attrib["id"] + volume_type = volume_xml.attrib["type"] # The date of publication, defaulting to earlier than anything we'll encounter ingest_date = volume_xml.attrib.get("ingest-date", data.UNKNOWN_INGEST_DATE) meta_data = parse_element(volume_xml.find("meta")) @@ -81,6 +83,7 @@ def from_xml( volume = Volume( collection_id, volume_id, + volume_type, ingest_date, meta_data, venue_index, @@ -108,6 +111,10 @@ def pdf(self): return infer_url(url, template=data.PDF_LOCATION_TEMPLATE) return None + @property + def is_journal(self): + return self._type == "journal" + def _set_meta_info(self, meta_data): """Derive journal title, volume, and issue no. used in metadata. @@ -120,18 +127,25 @@ def _set_meta_info(self, meta_data): month = month_str2num(self.get("month")) if month is not None: self.attrib["meta_date"] = f"{self.get('year')}/{month}" - if is_journal(self.collection_id): - # TODO: This should be explicitly represented in the XML instead of - # hardcoding and parsing it. 
- journal_title, volume_no, issue_no = data.get_journal_info( - self.collection_id, self.attrib["title"] - ) + if self.is_journal: + if "journal-title" in self.attrib: + journal_title = self.attrib["journal-title"] + else: + venues = meta_data.get("venue", []) + if len(venues) != 1: + raise ValueError(f"{self.full_id}: journal volume must have exactly one venue or an explicit <journal-title>") + journal_title = self.venue_index.get_venue(venues[0])["name"] self.attrib["meta_journal_title"] = journal_title - if volume_no is not None: - self.attrib["meta_volume"] = volume_no - if issue_no is not None: - self.attrib["meta_issue"] = issue_no + + # For compatibility reasons, we rename the attributes here; might + # change this later + if "journal-volume" in self.attrib: + self.attrib["meta_volume"] = self.attrib["journal-volume"] + del self.attrib["journal-volume"] + if "journal-issue" in self.attrib: + self.attrib["meta_issue"] = self.attrib["journal-issue"] + del self.attrib["journal-issue"] @property def volume_id(self):