Skip to content

Commit

Permalink
Refactor Python lib to use new XML fields, remove hard-coded data
Browse files Browse the repository at this point in the history
  • Loading branch information
mbollmann committed Jul 12, 2023
1 parent 10c37ca commit fdbfb2c
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 121 deletions.
91 changes: 0 additions & 91 deletions bin/anthology/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,6 @@
"booktitle",
)

# New-style IDs that should be handled as journals
JOURNAL_IDS = ("cl", "tacl", "tal", "lilt", "ijclclp")

# Constants associated with DOI assignment
DOI_URL_PREFIX = "https://dx.doi.org/"
DOI_PREFIX = "10.18653/v1/"
Expand All @@ -95,91 +92,3 @@

# The venue format must match this pattern
VENUE_FORMAT = r"^[a-z\d]+$"


def match_volume_and_issue(booktitle) -> Tuple[str, str]:
    """Extract the volume and issue numbers embedded in a booktitle.

    Examples of titles this is meant for:

    - Computational Linguistics, Volume 26, Number 1, March 2000
    - Traitement Automatique des Langues 2011 Volume 52 Numéro 1

    Matching is case-insensitive.  The issue may be a single number or a
    hyphenated range such as ``3-4``.

    :param booktitle: The booktitle text
    :return: ``(volume, issue)`` as strings; an element is ``None`` when the
        corresponding marker is not found in the title
    """
    volume_match = re.search(r"Volume\s*(\d+)", booktitle, flags=re.IGNORECASE)
    issue_match = re.search(
        r"(Number|Numéro|Issue)\s*(\d+-?\d*)", booktitle, flags=re.IGNORECASE
    )

    volume_no = None if volume_match is None else volume_match.group(1)
    # Group 1 is the keyword ("Number"/"Numéro"/"Issue"); group 2 is the value.
    issue_no = None if issue_match is None else issue_match.group(2)
    return volume_no, issue_no


def get_journal_info(top_level_id, volume_title) -> Tuple[str, str, str]:
    """Return the journal title, volume number, and issue number for a volume.

    Currently (Feb 2023) the numbers are parsed out of the <booktitle> text
    and the journal titles are hard-coded here; an explicit representation in
    the data would be preferable.  See
    https://github.com/acl-org/acl-anthology/issues/2379

    :param top_level_id: The collection ID
    :param volume_title: The text from the <booktitle> tag
    :return: The journal title, volume number, and issue number; the numbers
        are ``None`` when not found or not applicable.  For an unrecognized
        collection, the title is ``volume_title`` and both numbers are ``None``.
    """
    # For new-style IDs keep only the part after the last dot; IDs without a
    # dot pass through unchanged.
    venue = top_level_id.split(".")[-1]

    # venue -> (journal title, whether an issue number is reported)
    known_journals = {
        "cl": ("Computational Linguistics", True),
        "lilt": ("Linguistic Issues in Language Technology", False),
        "tal": ("Traitement Automatique des Langues", True),
        "ijclclp": (
            "International Journal of Computational Linguistics & Chinese Language Processing",
            True,
        ),
        "nejlt": ("Northern European Journal of Language Technology", False),
        "tacl": (
            "Transactions of the Association for Computational Linguistics",
            False,
        ),
    }

    if venue in known_journals:
        journal_title, report_issue = known_journals[venue]
    else:
        prefix = venue[0]
        if prefix == "J":
            # The two characters after "J" are parsed as a two-digit year,
            # which selects between the two journal names.
            two_digit_year = int(venue[1:3])
            if 65 <= two_digit_year <= 83:
                journal_title = "American Journal of Computational Linguistics"
            else:
                journal_title = "Computational Linguistics"
            report_issue = True
        elif prefix == "Q":
            journal_title = (
                "Transactions of the Association for Computational Linguistics"
            )
            report_issue = False
        else:
            # Unknown collection: fall back to the raw title, no numbers.
            return volume_title, None, None

    volume_no, parsed_issue = match_volume_and_issue(volume_title)
    issue_no = parsed_issue if report_issue else None
    return journal_title, volume_no, issue_no
11 changes: 5 additions & 6 deletions bin/anthology/papers.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
parse_element,
infer_url,
infer_attachment_url,
is_journal,
)
from . import data

Expand Down Expand Up @@ -154,7 +153,7 @@ def from_xml(xml_element, *args):
paper.attrib["xml_title"].tag = "title"

# Remove booktitle for frontmatter and journals
if paper.is_volume or is_journal(paper.full_id):
if paper.is_volume or paper.parent_volume.is_journal:
del paper.attrib["xml_booktitle"]

if "editor" in paper.attrib:
Expand Down Expand Up @@ -248,7 +247,7 @@ def bibtype(self):
"""Return the BibTeX entry type for this paper."""
if self.is_volume:
return "proceedings"
elif is_journal(self.full_id):
elif self.parent_volume.is_journal:
return "article"
else:
return "inproceedings"
Expand All @@ -259,7 +258,7 @@ def csltype(self):
cf. https://docs.citationstyles.org/en/stable/specification.html#appendix-iii-types
"""
if is_journal(self.full_id):
if self.parent_volume.is_journal:
return "article-journal"
elif self.is_volume:
return "book"
Expand Down Expand Up @@ -348,7 +347,7 @@ def as_bibtex(self, concise=False):
entries.append(
(people, " and ".join(p.as_bibtex() for p, _ in self.get(people)))
)
if is_journal(self.full_id):
if self.parent_volume.is_journal:
entries.append(
("journal", bibtex_encode(self.parent_volume.get("meta_journal_title")))
)
Expand Down Expand Up @@ -402,7 +401,7 @@ def as_citeproc_json(self):
if "editor" in self.attrib:
# or should this be "container-author"/"collection-editor" here?
data["editor"] = [p.as_citeproc_json() for p, _ in self.get("editor")]
if is_journal(self.full_id):
if self.parent_volume.is_journal:
data["container-title"] = self.parent_volume.get("meta_journal_title")
journal_volume = self.parent_volume.get(
"meta_volume", self.parent_volume.get("volume")
Expand Down
12 changes: 0 additions & 12 deletions bin/anthology/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,6 @@ def is_newstyle_id(anthology_id):
return anthology_id[0].isdigit() # New-style IDs are year-first


def is_journal(anthology_id):
    """Return whether the given Anthology ID belongs to a journal.

    Old-style IDs count as journals when their first character is "J" or "Q".
    New-style IDs count as journals when their venue component is listed in
    ``data.JOURNAL_IDS``.

    :param anthology_id: A full ID or a collection ID (callers pass either)
    """
    if not is_newstyle_id(anthology_id):
        return anthology_id[0] in ("J", "Q")

    # This is sometimes called with a full ID and sometimes with a collection
    # ID, so the venue is cut out by hand instead of going through
    # `deconstruct_anthology_id`: take the text before the first "-", then
    # whatever follows its last ".".
    collection_part = anthology_id.partition("-")[0]
    venue = collection_part.rpartition(".")[2]
    # TODO: this is currently hard-coded, but should be moved to the XML representation
    return venue in data.JOURNAL_IDS


def is_volume_id(anthology_id):
    """Return True when the paper component of the ID is the string "0".

    :param anthology_id: An ID that `deconstruct_anthology_id` can split into
        collection, volume, and paper components
    """
    _collection, _volume, paper_component = deconstruct_anthology_id(anthology_id)
    return paper_component == "0"
Expand Down
38 changes: 26 additions & 12 deletions bin/anthology/volumes.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
from .utils import (
build_anthology_id,
parse_element,
is_journal,
month_str2num,
infer_url,
)
Expand All @@ -34,6 +33,7 @@ def __init__(
self,
collection_id,
volume_id,
volume_type,
ingest_date,
meta_data,
venue_index: VenueIndex,
Expand All @@ -47,10 +47,11 @@ def __init__(
"""
self.collection_id = collection_id
self._id = volume_id
self._type = volume_type
self.ingest_date = ingest_date
self.formatter = formatter
self._set_meta_info(meta_data)
self.venue_index = venue_index
self._set_meta_info(meta_data)
self.attrib["venues"] = meta_data.get("venue", [])
self.attrib["events"] = meta_data.get("event", [])
self.attrib["sigs"] = sig_index.get_associated_sigs(self.full_id)
Expand All @@ -71,6 +72,7 @@ def from_xml(
volume_xml, collection_id, venue_index: VenueIndex, sig_index: SIGIndex, formatter
):
volume_id = volume_xml.attrib["id"]
volume_type = volume_xml.attrib["type"]
# The date of publication, defaulting to earlier than anything we'll encounter
ingest_date = volume_xml.attrib.get("ingest-date", data.UNKNOWN_INGEST_DATE)
meta_data = parse_element(volume_xml.find("meta"))
Expand All @@ -81,6 +83,7 @@ def from_xml(
volume = Volume(
collection_id,
volume_id,
volume_type,
ingest_date,
meta_data,
venue_index,
Expand Down Expand Up @@ -108,6 +111,10 @@ def pdf(self):
return infer_url(url, template=data.PDF_LOCATION_TEMPLATE)
return None

@property
def is_journal(self):
    """Whether this volume's type, as stored in ``_type``, is "journal"."""
    volume_type = self._type
    return volume_type == "journal"

def _set_meta_info(self, meta_data):
"""Derive journal title, volume, and issue no. used in metadata.
Expand All @@ -120,18 +127,25 @@ def _set_meta_info(self, meta_data):
month = month_str2num(self.get("month"))
if month is not None:
self.attrib["meta_date"] = f"{self.get('year')}/{month}"
if is_journal(self.collection_id):
# TODO: This should be explicitly represented in the XML instead of
# hardcoding and parsing it.
journal_title, volume_no, issue_no = data.get_journal_info(
self.collection_id, self.attrib["title"]
)
if self.is_journal:
if "journal-title" in self.attrib:
journal_title = self.attrib["journal-title"]
else:
venues = meta_data.get("venue", [])
if len(venues) != 1:
raise ValueError(f"{self.full_id}: journal volume must have exactly one venue or an explicit <journal-title>")
journal_title = self.venue_index.get_venue(venues[0])["name"]

self.attrib["meta_journal_title"] = journal_title
if volume_no is not None:
self.attrib["meta_volume"] = volume_no
if issue_no is not None:
self.attrib["meta_issue"] = issue_no

# For compatibility reasons, we rename the attributes here; might
# change this later
if "journal-volume" in self.attrib:
self.attrib["meta_volume"] = self.attrib["journal-volume"]
del self.attrib["journal-volume"]
if "journal-issue" in self.attrib:
self.attrib["meta_issue"] = self.attrib["journal-issue"]
del self.attrib["journal-issue"]

@property
def volume_id(self):
Expand Down

0 comments on commit fdbfb2c

Please sign in to comment.