Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#125 Create exception for empty figures and parse figure sub-points. #139

Merged
merged 3 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions pubmed_parser/pubmed_oa_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,19 +428,35 @@ def parse_pubmed_caption(path):
if figs is not None:
for fig in figs:
fig_id = fig.attrib["id"]
fig_label = stringify_children(fig.find("label"))
fig_captions = fig.find("caption").getchildren()
caption = " ".join([stringify_children(c) for c in fig_captions])

fig_label = fig.find("label")
if fig_label is not None:
fig_label = stringify_children(fig_label)

fig_captions = fig.find("caption")
if fig_captions is not None:
fig_captions = fig_captions.getchildren()
caption = " ".join([stringify_children(c) for c in fig_captions])

graphic = fig.find("graphic")
graphic_ref = None
if graphic is not None:
graphic_ref = graphic.attrib.values()[0]

list_items = fig.findall(".//list-item")
fig_list_items = []
for list_item in list_items:
item_label = stringify_children(list_item.find("label"))
item_text = stringify_children(list_item.find("p"))
fig_list_items.append((item_label, item_text))

dict_caption = {
"pmid": pmid,
"pmc": pmc,
"fig_caption": caption,
"fig_id": fig_id,
"fig_label": fig_label,
"fig_list-items": fig_list_items,
"graphic_ref": graphic_ref,
}
dict_captions.append(dict_caption)
Expand Down
23 changes: 22 additions & 1 deletion tests/test_pubmed_oa_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,15 @@ def fetch_pubmed_xml(db_dir):
return content

# Get up-to-date pubmed online article
pubmed_dir = {"3460867": "00/00/PMC3460867", "28298962": "8e/71/PMC5334499"}
pubmed_dir = {"3460867": "00/00/PMC3460867",
"28298962": "8e/71/PMC5334499",
"9539395": "51/b3/PMC9539395"
}
pubmed_xml_3460867 = fetch_pubmed_xml(pubmed_dir['3460867'])

pubmed_xml_9539395 = fetch_pubmed_xml(pubmed_dir['9539395'])
captions_9539395 = pp.parse_pubmed_caption(pubmed_xml_9539395)


def test_parse_pubmed_xml():
"""
Expand Down Expand Up @@ -68,3 +74,18 @@ def test_parse_pubmed_caption():
assert (
len(captions) == 4
), "Expected number of figures/captions to have a length of 4"


def test_parse_pubmed_caption_content():
    """Check the parsed caption records of article PMC9539395 field by field.

    The first record is compared against its expected caption text, figure
    id, label, list items, graphic reference, and article identifiers. The
    record at index 8 is expected to carry neither a label nor a graphic
    reference.
    """
    first = captions_9539395[0]

    expected_caption = 'Aerosol delivery of sACE22.v2.4‐IgG1 alleviates lung injury and improves survival of SARS‐CoV‐2 gamma variant infected K18‐hACE2 transgenic mice \n\n'
    # One (label, text) pair per sub-point of the first figure.
    expected_items = [
        ('A', 'K18‐hACE2 transgenic mice were inoculated with SARS‐CoV‐2 isolate /Japan/TY7‐503/2021 (gamma variant) at 1\u2009×\u2009104 PFU. sACE22.v2.4‐IgG1 (7.5\u2009ml at 8.3\u2009mg/ml in PBS) was delivered to the mice by a nebulizer in 25\u2009min at 12\u2009h, 48\u2009h, and 84\u2009h postinoculation. PBS was aerosol delivered as control.'),
        ('B, C', 'Survival (B) and weight loss (C). N\u2009=\u200910 mice for each group. The P‐value of the survival curve by the Gehan–Breslow–Wilcoxon test is shown. Error bars for mouse weight are centered on the mean and show SEM.'),
        ('D', "Viral load in the lung was measured by RT–qPCR on Day 7. The mRNA expression levels of SARS‐CoV‐2 Spike, Nsp, and Rdrp are normalized to the housekeeping gene peptidylprolyl isomerase A (Ppia). Data are presented as mean\u2009±\u2009SEM, N\u2009=\u20094 mice per group. *P\u2009<\u20090.05 by the unpaired Student's t‐test with two‐sided."),
        ('E', "Cytokine expression levels of Tnfa, Ifng, Il1a, and Il1b were measured by RT–qPCR normalized by Ppia. Data are presented as mean\u2009±\u2009SEM, N\u2009=\u20094 mice per group. *P\u2009<\u20090.05 by the unpaired Student's t‐test with two‐sided."),
        ('F, G', 'Representative H&E staining of lung sections on Day 7 postinoculation for control PBS group (F) and inhalation of the sACE22.v2.4‐IgG1 group (G). Images at left are low magnifications. Boxed regions (black) are shown at higher magnification on the right. Lungs from 4 independent mice were sectioned, stained, and imaged.'),
    ]

    assert first['fig_caption'] == expected_caption
    assert first['fig_id'] == 'emmm202216109-fig-0001'
    assert first['fig_label'] == 'Figure 1'
    # The entry at index 8 is expected to have no label.
    assert captions_9539395[8]['fig_label'] is None
    assert first['fig_list-items'] == expected_items
    assert first['graphic_ref'] == 'EMMM-14-e16109-g008'
    # ...and no graphic reference either.
    assert captions_9539395[8]['graphic_ref'] is None
    assert first['pmc'] == '9539395'
    assert first['pmid'] == '36094679'
Loading