Skip to content

Commit

Permalink
TST: Fix page count and consistency checking script
Browse files (browse the repository at this point in the history)
  • Loading branch information
MartinThoma committed Jan 3, 2024
1 parent f661fe2 commit 8c405ec
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 5 deletions.
18 changes: 14 additions & 4 deletions .github/workflows/json_consistency.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -63,7 +63,7 @@ def main() -> None:
seen_failure = True
else:
print(f"✅ Found {entry.path}")
check_meta(entry)
seen_failure = seen_failure or check_meta(entry)

# Are all files registered?
pdf_paths = Path().glob("**/*.pdf")
Expand Down Expand Up @@ -109,28 +109,36 @@ def get_annotation_counts(reader: PdfReader) -> dict[str, int]:
return pdf_annotations


def check_meta(entry: PdfEntry) -> None:
def check_meta(entry: PdfEntry) -> bool:
"""Check if the given entry metadata matches the extracted metadata."""
seen_failure = False
try:
reader = PdfReader(entry.path)
if reader.is_encrypted:
return
return seen_failure
info = reader.metadata
except Exception:
return
return seen_failure
if info is None:
info = {}
if info.get("/Producer") != entry.producer:
seen_failure = True
print(
f"❌ ERROR: Producer mismatch: {entry.producer} vs {info.get('/Producer')}",
)
if len(reader.pages) != entry.pages:
seen_failure = True
print(
f"❌ ERROR: Page mismatch: {len(reader.pages)} vs {entry.pages}",
)

pdf_date = pdf_to_datetime(info.get("/CreationDate"))
pdf_date = None if pdf_date is None else pdf_date.isoformat()
entry_date = (
None if entry.creation_date is None else entry.creation_date.isoformat()[:19]
)
if pdf_date != entry_date:
seen_failure = True
print(f"❌ ERROR: Creation date mismatch: {entry_date} vs {pdf_date}")

# Check annotations
Expand All @@ -140,6 +148,7 @@ def check_meta(entry: PdfEntry) -> None:
if entry.annotations:
entry_annotation_sum = entry.annotations.sum()
if pdf_annotations_sum != entry_annotation_sum:
seen_failure = True
print(
f"❌ ERROR: Annotation count mismatch: {entry_annotation_sum} vs {pdf_annotations_sum}"
)
Expand All @@ -162,6 +171,7 @@ def check_meta(entry: PdfEntry) -> None:
if subtype not in seen_subtypes:
todo_subtypes.append(subtype)
print(f" - {subtype}: {count}")
return seen_failure


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion files.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -333,7 +333,7 @@
"producer": "pdfTeX-1.40.21",
"creation_date": "2024-01-03T09:38:26",
"encrypted": false,
"pages": 2,
"pages": 3,
"images": 0,
"forms": 0,
"annotations": {}
Expand Down

0 comments on commit 8c405ec

Please sign in to comment.