From 245dd09cce0862082f1060ef0a04d15b2df7f871 Mon Sep 17 00:00:00 2001 From: Nils Herrmann <88451442+nils-herrmann@users.noreply.github.com> Date: Mon, 2 Sep 2024 21:10:25 +0200 Subject: [PATCH] Handle missing ppub and collection (close #154) --- pubmed_parser/pubmed_oa_parser.py | 2 +- tests/test_pubmed_oa_parser.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pubmed_parser/pubmed_oa_parser.py b/pubmed_parser/pubmed_oa_parser.py index 9e3cfe3..8b079f6 100644 --- a/pubmed_parser/pubmed_oa_parser.py +++ b/pubmed_parser/pubmed_oa_parser.py @@ -195,7 +195,7 @@ def parse_pubmed_xml(path, include_path=False, nxml=False): pub_date = format_date(pub_date_dict) try: - pub_year = int(pub_date_dict["year"]) + pub_year = int(pub_date_dict.get("year")) except TypeError: pub_year = None diff --git a/tests/test_pubmed_oa_parser.py b/tests/test_pubmed_oa_parser.py index b9c834e..a279b82 100644 --- a/tests/test_pubmed_oa_parser.py +++ b/tests/test_pubmed_oa_parser.py @@ -20,10 +20,12 @@ def fetch_pubmed_xml(db_dir): pubmed_dir = {"3460867": "00/00/PMC3460867", "28298962": "8e/71/PMC5334499", "9539395": "51/b3/PMC9539395", - "1280406": "5f/92/PMC1280406" + "1280406": "5f/92/PMC1280406", + "30443433": "6f/c7/PMC6218202" } pubmed_xml_3460867 = fetch_pubmed_xml(pubmed_dir['3460867']) pubmed_xml_1280406 = fetch_pubmed_xml(pubmed_dir['1280406']) +pubmed_xml_30443433 = fetch_pubmed_xml(pubmed_dir['30443433']) pubmed_xml_9539395 = fetch_pubmed_xml(pubmed_dir['9539395']) @@ -52,6 +54,10 @@ def test_parse_pubmed_xml(): assert parsed_1280406.get('publication_date') == "01-9-2005" assert parsed_1280406.get('epublication_date') == "31-5-2005" + parsed_30443433 = pp.parse_pubmed_xml(pubmed_xml_30443433) + assert parsed_30443433.get('publication_year') is None + assert parsed_30443433.get('publication_date') == "01-01" + def test_parse_pubmed_paragraph(): """