-
Notifications
You must be signed in to change notification settings - Fork 2
/
extract_using_pypdf.py
156 lines (135 loc) · 5.67 KB
/
extract_using_pypdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# DTE Aerial Photo Collection curation project
# Workflow for extracting JPGs and metadata from PDFs
# PyPDF2 Solution
# Garrett Morton, Sam Sciolla
# SI 699
# Written and tested using Python 3.7.0
# PyPDF2 documentation: https://pythonhosted.org/PyPDF2/index.html
# os documentation: https://docs.python.org/3/library/os.html#module-os
# This script uses algorithmic features of a solution posted by sylvain to a Stack Overflow question:
# https://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
# standard modules
import os
import json
import time
# third-party modules
import PyPDF2
# local modules
import misc_functions
# global variables
PATH_DELIMITER = misc_functions.PATH_DELIMITER
## Functions
# Identify embedded links in Index PDF and collect metadata
def pull_links_from_index(relative_path):
    """Collect link annotations and page metadata from an Index PDF.

    Only the first page of the PDF is inspected; a warning is printed when
    the document has more pages. Each annotation carrying an '/A' action
    entry becomes one link record. Annotations without '/A' are reported
    on stdout and skipped.

    Args:
        relative_path: Path to the Index PDF. The file name is taken as the
            last component after splitting on PATH_DELIMITER.

    Returns:
        A dict with the keys 'Index File Name', 'Source Relative Path',
        'Links' (list of link records sorted by 'Linked Image File Name')
        and 'Media Box' (the page's '/MediaBox' entry as read by PyPDF2).

    Raises:
        KeyError: If the page lacks '/MediaBox', an annotation lacks
            '/Rect', an action has neither '/F' nor '/URI', or an
            annotation without '/A' lacks the '/AP' -> '/N' -> '/Subtype'
            chain used for the diagnostic print.
    """
    index_pdf_file_name = relative_path.split(PATH_DELIMITER)[-1]
    print('// Index: {} //'.format(index_pdf_file_name))

    index_pdf_file_object = PyPDF2.PdfFileReader(relative_path)
    page_count = index_pdf_file_object.getNumPages()
    if page_count > 1:
        print('?? More than one page: {} ??'.format(str(page_count)))
    pdf_page = index_pdf_file_object.getPage(0)
    media_box = pdf_page['/MediaBox']

    # A page without any annotations has no '/Annots' entry at all; treat
    # that as "no links" instead of failing with a KeyError. Membership is
    # tested explicitly so the value is still fetched through item access.
    if '/Annots' in pdf_page:
        annotations = pdf_page['/Annots']
    else:
        print("?? Page does not have a '/Annots' key ??")
        annotations = []

    links = []
    for annotation in annotations:
        object_number = annotation.idnum
        annot_object = annotation.getObject()
        # '/Rect' holds the link rectangle; convert every entry to a plain
        # float so the coordinates are ordinary Python numbers.
        link_coords = [float(coord) for coord in annot_object['/Rect']]

        if '/A' not in annot_object:
            print("?? Annotation does not have a '/A' key ??")
            print(annot_object['/AP']['/N'].getObject()['/Subtype'])
            continue

        indirect_object = annot_object['/A'].getObject()
        if '/F' in indirect_object:
            link_dictionary = {
                'PDF Object ID Number': object_number,
                'Linked Image File Name': indirect_object['/F']['/F'],
                'Link Coordinates': link_coords,
                'File or URI?': 'File'
            }
        else:
            # Link objects without an '/F' key may signal broken links.
            print("?? indirectObject does not have a '/F' key ??")
            print(indirect_object)
            link_dictionary = {
                'PDF Object ID Number': object_number,
                'Linked Image File Name': indirect_object['/URI'],
                'Link Coordinates': link_coords,
                # Same key as the file branch so every link record has an
                # identical schema (this branch previously used
                # 'File or URI' without the question mark).
                'File or URI?': 'URI'
            }
        links.append(link_dictionary)

    links = sorted(links, key=lambda x: x['Linked Image File Name'])
    print('** Number of links identified: {} **'.format(str(len(links))))

    return {
        'Index File Name': index_pdf_file_name,
        'Source Relative Path': relative_path,
        'Links': links,
        'Media Box': media_box
    }
# Identify JPEG bytestream in Image PDF, write it to a new file, and collect image-level metadata
def extract_jpg_from_pdf(relative_path, output_location=''):
    """Write the image embedded in an Image PDF to a .jpg file.

    Only the first page is inspected, and only the first image XObject on
    that page is extracted; warnings are printed when the PDF has more than
    one page or the page has more than one image.

    Args:
        relative_path: Path to the Image PDF. The file name is taken as the
            last component after splitting on PATH_DELIMITER.
        output_location: Prefix that is concatenated directly with the new
            file name, so a directory must end with its path separator.

    Returns:
        A dict with the keys 'Image File Name', 'Source Relative Path',
        'Width', 'Height', 'ColorSpace', 'BitsPerComponent', 'Filter' and
        'Created Image File Name'.

    Raises:
        IndexError: If the first page contains no image XObject.
        KeyError: If the page or the image object lacks an expected entry.
    """
    image_pdf_file_name = relative_path.split(PATH_DELIMITER)[-1]
    print('// Image: {} //'.format(image_pdf_file_name))

    image_pdf_file_object = PyPDF2.PdfFileReader(relative_path)
    page_count = image_pdf_file_object.getNumPages()
    if page_count > 1:
        # The original referenced an undefined name here, so any multi-page
        # PDF raised NameError instead of printing this warning.
        print('?? More than one page: {} ??'.format(str(page_count)))
    pdf_page = image_pdf_file_object.getPage(0)

    objects = pdf_page['/Resources']['/XObject']
    image_objects = []
    for object_name in objects:
        candidate = objects[object_name]
        if candidate['/Subtype'] == '/Image':
            image_objects.append(candidate)

    if not image_objects:
        # Same exception type as indexing the empty list, with a message
        # that names the offending file.
        raise IndexError(
            'No image object found on the first page of {}'.format(image_pdf_file_name)
        )
    if len(image_objects) > 1:
        print("?? More than one image: {} ??".format(str(len(image_objects))))
    image_object = image_objects[0]

    image_metadata = {
        'Image File Name': image_pdf_file_name,
        'Source Relative Path': relative_path,
        'Width': image_object['/Width'],
        'Height': image_object['/Height'],
        'ColorSpace': image_object['/ColorSpace'].replace('/', ''),
        'BitsPerComponent': image_object['/BitsPerComponent'],
        'Filter': image_object['/Filter'].replace('/', '')
    }

    identifier = image_pdf_file_name.replace('.pdf', '')
    new_image_file_name = 'dte_aerial_' + identifier + '.jpg'
    # The stream's stored bytes are written unchanged.
    # NOTE(review): presumably these are already JPEG data (a DCTDecode
    # filter) -- confirm against the recorded 'Filter' value.
    with open(output_location + new_image_file_name, 'wb') as jpg_file:
        jpg_file.write(image_object._data)

    image_metadata['Created Image File Name'] = new_image_file_name
    return image_metadata
# Manage function invocations and write resulting metadata to a JSON file
def run_pypdf2_workflow(pdf_file_paths, output_location, output_name):
    """Process a batch of PDFs and write the collected metadata as JSON.

    Paths containing the substring 'Index' are handled by
    pull_links_from_index; every other path is handled by
    extract_jpg_from_pdf. The elapsed wall-clock time is printed at the end.

    Args:
        pdf_file_paths: Iterable of PDF path strings to process.
        output_location: Prefix concatenated directly with output file
            names (both the extracted images and the metadata file), so a
            directory must end with its path separator.
        output_name: File name of the JSON metadata file.

    Returns:
        A dict with the keys 'Index Records' and 'Image Records', each a
        list of the per-file metadata dicts in processing order.
    """
    print('** Image Extraction: PyPDF2 Solution **')
    pypdf_start = time.time()

    image_metadata_dicts = []
    index_metadata_dicts = []
    for pdf_file_path in pdf_file_paths:
        if 'Index' in pdf_file_path:
            index_metadata_dicts.append(pull_links_from_index(pdf_file_path))
        else:
            image_metadata_dicts.append(
                extract_jpg_from_pdf(pdf_file_path, output_location)
            )

    pypdf2_batch_metadata = {
        'Index Records': index_metadata_dicts,
        'Image Records': image_metadata_dicts
    }

    # Serialize before opening the file so a serialization error cannot
    # leave an empty or truncated metadata file behind, and let the context
    # manager close the handle even if the write fails.
    serialized_metadata = json.dumps(pypdf2_batch_metadata, indent=4)
    with open(output_location + output_name, 'w', encoding='utf-8') as metadata_file:
        metadata_file.write(serialized_metadata)

    pypdf_end = time.time()
    print('** Time to Run: {} **'.format(str(pypdf_end - pypdf_start)))
    return pypdf2_batch_metadata
## Main Program
if __name__ == "__main__":
    # Script entry point: run the PyPDF2 workflow over one sample directory.
    print('\n** DTE Aerial Batch Processing Script **')

    # Extracted images and the metadata JSON are both written under this prefix.
    output_location = 'output/pypdf2/'

    # Gather the PDF paths to process from the sample input directory.
    pdf_file_paths = misc_functions.collect_relative_paths_for_files(
        'input/pdf_files/part1/macomb/1961'
    )

    batch_metadata = run_pypdf2_workflow(
        pdf_file_paths,
        output_location,
        'sample_pypdf2_batch_metadata.json'
    )