Skip to content

Commit

Permalink
Add utils.citation + Paper.to_citation()
Browse files Browse the repository at this point in the history
  • Loading branch information
mbollmann committed Apr 14, 2024
1 parent 27e65a9 commit 9a3ffbb
Show file tree
Hide file tree
Showing 7 changed files with 173 additions and 1 deletion.
1 change: 1 addition & 0 deletions python/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Added

- Papers can now generate citation reference strings in any CSL-supported style via `to_citation()`.
- SIGIndex now reverse-indexes co-located volumes, so it is now possible to get SIGs associated with volumes, e.g. via `Volume.get_sigs()`.
- Papers now have attribute `thumbnail`.
- Volumes now have attributes `has_abstracts` and `web_url`.
Expand Down
52 changes: 52 additions & 0 deletions python/acl_anthology/collections/paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import datetime
from attrs import define, field, Factory
from enum import Enum
from functools import cached_property
from lxml import etree
from lxml.builder import E
from typing import cast, Any, Optional, TYPE_CHECKING
Expand All @@ -32,6 +33,7 @@
)
from ..people import NameSpecification
from ..text import MarkupText
from ..utils.citation import citeproc_render_html
from ..utils.ids import build_id, AnthologyIDTuple
from ..utils.latex import make_bibtex_entry
from ..utils.logging import get_logger
Expand Down Expand Up @@ -154,6 +156,45 @@ def bibtype(self) -> str:
case _: # pragma: no cover
raise ValueError(f"Unknown volume type: {self.parent.type}")

@property
def csltype(self) -> str:
    """The [CSL type](https://docs.citationstyles.org/en/stable/specification.html#appendix-iii-types) for this paper.

    Frontmatter is reported as `"book"`, a paper whose parent volume is a
    journal as `"article-journal"`, and anything else as `"paper-conference"`.
    """
    if self.is_frontmatter:
        return "book"
    in_journal = self.parent.type == VolumeType.JOURNAL
    return "article-journal" if in_journal else "paper-conference"

@cached_property
def citeproc_dict(self) -> dict[str, Any]:
    """The citation object corresponding to this paper for use with CiteProcJSON.

    Entries whose value is None are omitted from the result. For
    frontmatter, the editors are also used as the authors. The container
    title (plus volume and issue for journals) comes from the parent volume.
    """
    record: dict[str, Any] = {
        "id": self.bibkey,
        "title": self.title.as_text(),
        "type": self.csltype,
    }
    record["author"] = [spec.citeproc_dict for spec in self.authors]
    record["editor"] = [spec.citeproc_dict for spec in self.get_editors()]
    record.update(
        {
            "publisher": self.publisher,
            "publisher-place": self.address,
            # TODO: month currently not included
            "issued": {"date-parts": [[self.year]]},
            "URL": self.web_url,
            "DOI": self.doi,
            "ISBN": self.parent.isbn,
            "page": self.pages,
        }
    )
    if self.is_frontmatter:
        record["author"] = record["editor"]

    # Only journals and proceedings contribute container information.
    volume_type = self.parent.type
    if volume_type == VolumeType.JOURNAL:
        record["container-title"] = self.parent.get_journal_title()
        record["volume"] = self.parent.journal_volume
        record["issue"] = self.parent.journal_issue
    elif volume_type == VolumeType.PROCEEDINGS:
        record["container-title"] = self.parent.title.as_text()

    cleaned: dict[str, Any] = {}
    for key, value in record.items():
        if value is not None:
            cleaned[key] = value
    return cleaned

@property
def address(self) -> Optional[str]:
"""The publisher's address for this paper. Inherited from the parent Volume."""
Expand Down Expand Up @@ -260,6 +301,17 @@ def to_bibtex(self, with_abstract: bool = False) -> str:
bibtex_fields.append(("abstract", self.abstract))
return make_bibtex_entry(self.bibtype, self.bibkey, bibtex_fields)

def to_citation(self, style: Optional[str] = None) -> str:
    """Generate a citation (reference) for this paper.

    Arguments:
        style: Any citation style supported by [`citeproc-py-styles`](https://github.com/inveniosoftware/citeproc-py-styles) or a path to a CSL file.  If None (default), uses the built-in ACL citation style.

    Returns:
        The generated citation reference as a single string with HTML markup.  See [`citeproc_render_html()`][acl_anthology.utils.citation.citeproc_render_html] for the rationale behind returning a single string here.
    """
    rendered = citeproc_render_html(self.citeproc_dict, style)
    return rendered

@classmethod
def from_frontmatter_xml(cls, parent: Volume, paper: etree._Element) -> Paper:
"""Instantiates a new paper from a `<frontmatter>` block in the XML."""
Expand Down
5 changes: 4 additions & 1 deletion python/acl_anthology/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .citation import citeproc_render_html
from .git import clone_or_pull_from_repo
from .ids import build_id, parse_id, AnthologyID
from .latex import latex_encode, latex_convert_quotes
from .latex import latex_encode, latex_convert_quotes, make_bibtex_entry
from .logging import setup_rich_logging, get_logger
from .text import remove_extra_whitespace
from .xml import stringify_children
Expand All @@ -23,10 +24,12 @@
__all__ = [
"AnthologyID",
"build_id",
"citeproc_render_html",
"clone_or_pull_from_repo",
"get_logger",
"latex_encode",
"latex_convert_quotes",
"make_bibtex_entry",
"parse_id",
"remove_extra_whitespace",
"setup_rich_logging",
Expand Down
77 changes: 77 additions & 0 deletions python/acl_anthology/utils/citation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Copyright 2024 Marcel Bollmann <marcel@bollmann.me>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functions for generating citation references."""

import citeproc
from citeproc import Citation, CitationItem, CitationStylesBibliography, CitationStylesStyle
from citeproc.source.json import CiteProcJSON
from citeproc_styles import get_style_filepath
from pathlib import Path
import sys
from typing import Any, Optional


class CitationStyleDict(dict[str | Path, Any]):
    """Dictionary mapping names of citation styles to their `citeproc.CitationStylesStyle` objects, auto-loading styles on first access."""

    def __getitem__(self, key: str | Path) -> CitationStylesStyle:
        """Return the style stored under `key`, loading and caching it if necessary.

        Raises:
            KeyError: If `key` is not an existing file and the path returned by `get_style_filepath()` for it is not an existing file either.
        """
        cached = super().get(key)
        if cached is not None:
            return cached

        # An existing file is used directly; anything else is treated as
        # the name of a style in citeproc-py-styles.
        key_is_file = Path(key).is_file()
        csl_file = key if key_is_file else get_style_filepath(key)
        if not key_is_file and not Path(csl_file).is_file():
            raise KeyError(f"Could not resolve '{key}' to a filename of a citation style")

        loaded = CitationStylesStyle(csl_file)
        self[key] = loaded
        return loaded


# Module-level instance used by `citeproc_render_html()`; because styles are
# stored on first access, each style key is loaded at most once.
citation_styles = CitationStyleDict()
"""Global object for accessing `citeproc.CitationStylesStyle` objects."""


def citeproc_render_html(
    citeproc_dict: dict[str, Any],
    style: Optional[str | Path] = None,
    link_title: bool = True,
) -> str:
    """Render a bibliography entry with a given CSL style.

    Arguments:
        citeproc_dict: A dictionary with publication metadata as expected by CiteProcJSON.  Must contain an `"id"` entry.
        style: Any citation style supported by [`citeproc-py-styles`](https://github.com/inveniosoftware/citeproc-py-styles) or a path to a CSL file.  If None (default), uses the built-in ACL citation style.
        link_title: If True, wraps the title in a link to the entry's URL.  Has no effect if the dictionary has no `"URL"` or no `"title"` entry.

    Returns:
        The bibliography entry as a single string with HTML markup.

    Raises:
        KeyError: If `citeproc_dict` has no `"id"` entry, or if `style` cannot be resolved to a citation style file.

    Note:
        The reason for returning a string is that this is what we get from citeproc-py's `render_bibliography()` function.  If the result was parsed with LXML, we could turn it into a proper [MarkupText][acl_anthology.text.MarkupText] object.  However, since the most common use case of this function requires the HTML-ified string, we do not do this here as it would introduce unnecessary overhead in this case.
    """
    # Imported locally under an alias so that it cannot be confused with
    # `citeproc.formatter.html` used below.
    from html import escape as escape_html

    if style is None:
        style = Path(sys.modules["acl_anthology"].__path__[0]) / "data" / "acl.csl"

    source = CiteProcJSON([citeproc_dict])
    item = CitationItem(citeproc_dict["id"])
    bib = CitationStylesBibliography(citation_styles[style], source, citeproc.formatter.html)
    bib.register(Citation([item]))
    rendered_list = bib.style.render_bibliography([item])[0]

    # Entries may lack a URL or title (e.g. dictionaries that drop None
    # values); in that case there is nothing to link, so render as-is
    # instead of failing with a KeyError.
    title = citeproc_dict.get("title")
    url = citeproc_dict.get("URL")
    if link_title and title is not None and url is not None:
        # The URL ends up inside a double-quoted attribute, so it must be escaped.
        link_text = f'<a href="{escape_html(str(url), quote=True)}">{title}</a>'
        # Only a fragment that is exactly the title gets replaced by the link.
        rendered_list = [str(x) if x != title else link_text for x in rendered_list]
    return "".join(rendered_list)
4 changes: 4 additions & 0 deletions python/docs/api/utils.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# utils

## utils.citation

::: acl_anthology.utils.citation

## utils.ids

::: acl_anthology.utils.ids
Expand Down
8 changes: 8 additions & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ ignore_missing_imports = true
module = 'latexcodec.*'
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = 'citeproc.*'
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = 'citeproc_styles.*'
ignore_missing_imports = true

[tool.pytest.ini_options]
markers = [
"integration: marks tests on the full acl-org/acl-anthology repo"
Expand Down
27 changes: 27 additions & 0 deletions python/tests/collections/paper_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,33 @@ def test_paper_to_bibtex_journal(anthology):
assert paper.to_bibtex() == expected


# Pairs of (Anthology ID, expected citation) consumed by `test_paper_to_citation`.
# The expected value is the exact HTML string that `Paper.to_citation()` should
# return when called without a style argument.  Note that the second case
# expects an italicized title without a link, unlike the other three.
test_cases_papercitation = (
(
"J89-4001",
'Andrew Haas. 1989. <a href="https://aclanthology.org/J89-4001/">A Parsing Algorithm for Unification Grammar</a>. <i>Computational Linguistics</i>, 15(4):219–232.',
),
(
"2022.acl-short.0",
"Smaranda Muresan, Preslav Nakov, and Aline Villavicencio. 2022. <i>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</i>. Association for Computational Linguistics, Dublin, Ireland.",
),
(
"2022.acl-long.268",
'Elena Álvarez-Mellado and Constantine Lignos. 2022. <a href="https://aclanthology.org/2022.acl-long.268/">Detecting Unassimilated Borrowings in Spanish: An Annotated Corpus and Approaches to Modeling</a>. In <i>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</i>, pages 3868–3888, Dublin, Ireland. Association for Computational Linguistics.',
),
(
"L06-1060",
'Brian Roark, Mary Harper, Eugene Charniak, Bonnie Dorr, Mark Johnson, Jeremy Kahn, Yang Liu, Mari Ostendorf, John Hale, Anna Krasnyanskaya, Matthew Lease, Izhak Shafran, Matthew Snover, Robin Stewart, and Lisa Yung. 2006. <a href="https://aclanthology.org/L06-1060/">SParseval: Evaluation Metrics for Parsing Speech</a>. In <i>Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)</i>, Genoa, Italy. European Language Resources Association (ELRA).',
),
)


@pytest.mark.parametrize("full_id, expected", test_cases_papercitation)
def test_paper_to_citation(anthology, full_id, expected):
    """The default-style citation of the paper `full_id` equals `expected`."""
    assert anthology.get(full_id).to_citation() == expected


test_cases_paperdeletionnotice = (
(
'<retracted date="2022-05-06">Paper was intended for the non-archival track.</retracted>',
Expand Down

0 comments on commit 9a3ffbb

Please sign in to comment.