-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_config.py
74 lines (70 loc) · 8.4 KB
/
run_config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python3
# Run configuration, exposed as the module-level dict `configs`.
#
# The dict is filled in two steps: the cleaned-output path embeds the prompt
# suffix, so 'suffix' has to exist before 'output_file' can be computed.
# NOTE(review): the prompt log further down records a different 'pre_prompt'
# under this same '_prompt_12_3' suffix -- confirm the suffix still
# identifies the active prompt uniquely.
configs = dict(
    # Tag appended to the cleaned output file name.
    suffix='_prompt_12_3',
    # Instruction text; adjacent literals are concatenated into one string.
    pre_prompt=(
        "Produce an exact replica of the text provided. "
        "The text starts at START. "
        "Remove references. "
        "Remove isolated digits and numbers. "
        "Only do edits, no additions. "
        "Do not autocomplete any sentences. "
        "The end of output may seem abrupt, that is correct. "
        "Do not complete the last sentence."
    ),
    # pre_prompt_2="Produce an exact replica of the text provided. The text starts at START. Remove the header.",
)

# Input/output locations and processing switches for the current document.
configs.update(
    input_file='data/poverty_famine_sen.pdf',
    output_file=f'data/poverty_famine_sen_cleaned{configs["suffix"]}.txt',
    output_file_raw='data/poverty_famine_sen_raw.txt',
    tmpdir='data/poverty_famine_sen_tmp',
    # Page range to process; whether the bounds are inclusive is decided by
    # the consumer of this config (not visible here).
    start_page=30,
    end_page=90,
    output_to_pdf=True,
    print_page_breaks=True,
    remove_newlines=True,
)
# configs.update({
# 'input_file': 'data/moral-fn.pdf',
# 'output_file': f'data/moral-fn_cleaned{configs["suffix"]}.txt',
# 'output_file_raw': 'data/moral-fn_raw.txt',
# 'tmpdir': 'data/poverty_famine_sen_tmp',
# 'start_page': 1,
# 'end_page' : 3,
# 'output_to_pdf' : True,
# 'print_page_breaks' : False,
# })
# Other prompts tried previously, kept for reference as 'suffix' / 'pre_prompt' pairs:
# 'suffix' : '_prompt_1',
# 'pre_prompt' : "Please cleanup this text for me by removing any page numbers, references, footers, or headers, if any. Don't change the text if it is not a page number, reference, footer, or header. Please also remove any blank lines, and fix spelling. Please add punctuation wherever necessary. Please add anything that will help an automated text to speech read it out. Thank you!",
# 'suffix' : '_prompt_2',
# 'pre_prompt' : "Please copy the text, and cleanup this text by removing any page numbers, references, footers, or headers. Don't change the text if it is not a page number, reference, footer, or header. Please also remove any blank lines, and fix spelling. Please add punctuation wherever necessary. Please add anything that will help an automated text to speech read it out.",
# 'suffix' : '_prompt_3',
# 'pre_prompt' : "After START is text extracted from a pdf. Please copy the text, and cleanup this text by removing any page numbers, references, footers, or headers. At END, the text ends. Do not summarize the text or complete its end even if its incomplete.",
# 'suffix' : '_prompt_4',
# 'pre_prompt' : "After START is text extracted from a page of a paper or a book in pdf format. Please copy the text, and cleanup this text by removing any page numbers, footnotes and references. Note that the start and end of the page (outside of footnotes and references in footnotes) will be incomplete snippets, please do not alter them. At END, the text ends.",
# 'suffix' : '_prompt_5',
# 'pre_prompt' : "After START is text extracted from a page of a paper or a book in pdf format. Please copy the text, and cleanup this text by removing any page numbers, footnotes and references. Note that the start of the page and the part right before footnotes (outside of footnotes and references in footnotes) will be incomplete snippets, please do not alter them. At END, the text ends. Please capitalize headings, and add new lines before and after headings. Please add newlines between paragraphs.",
# 'suffix' : '_prompt_6',
# 'pre_prompt' : "After START is text extracted from a page of a paper or a book in pdf format. Please copy the text, and cleanup this text by removing any page numbers, footnotes and references. Note that the start of the page and the part right before footnotes (outside of footnotes and references in footnotes) will be incomplete snippets, please do not alter them. At END, the text ends. Please capitalize headings, and add new lines before and after headings. Please add newlines between paragraphs. Please do not autocomplete incomplete sentences.",
# 'suffix' : '_prompt_7',
# 'pre_prompt' : "After START is text extracted from a page of a paper or a book in pdf format. Please copy the text, and cleanup this text by removing any page numbers, footnotes and references. Note that the start of the page and the part right before footnotes (outside of footnotes and references in footnotes) will be incomplete snippets, please do not alter them. Right before the footnotes, if there is an incomplete sentence, do not autocomplete it. At END, the text ends. Please capitalize headings, and add new lines before and after headings. Please add newlines between paragraphs. Please do not autocomplete incomplete sentences.",
# 'suffix' : '_prompt_8',
# 'pre_prompt' : "Produce an exact replica of the text provided, minus the footnotes and references. Do not autocomplete any sentences.",
# 'suffix' : '_prompt_9',
# 'pre_prompt' : "Produce an exact replica of the text provided, minus the footnotes and references. Do not autocomplete any sentences at the end of the text produced.",
# 'suffix' : '_prompt_10',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START and ends at END.",
# 'suffix' : '_prompt_11',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START and ends at END. Remove references. Do not auto complete any sentences. Do not finish incomplete sentences. Leave the end of the text as is, do not finish or autocomplete it. Only do edits, no additions.",
# 'suffix' : '_prompt_12',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START and ends at END. Remove references. Only do edits, no additions.",
# 'suffix' : '_prompt_12_1',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START and ends at END. Remove references. Only do edits, no additions. Do not autocomplete any sentences. The end of output may seem abrupt, that is correct.",
# 'suffix' : '_prompt_12_2',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START and ends at END. Remove START and END. Remove page numbers and isolated numbers. Remove references. Only do edits, no additions. Do not autocomplete any sentences. The end of output may seem abrupt, that is correct.",
# 'suffix' : '_prompt_12_3',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START. Remove references. Only do edits, no additions. Do not autocomplete any sentences. The end of output may seem abrupt, that is correct. Do not complete the last sentence.",
# 'suffix' : '_prompt_12_4',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START. Remove references. Remove isolated digits and numbers, but keep headings. Only do edits, no additions. Do not autocomplete any sentences. The end of output may seem abrupt, that is correct. Do not complete the last sentence.",
# 'suffix' : '_prompt_12_5',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START. Remove references. Remove isolated digits and numbers, but keep headings. Delete footers by detecting the start of it and deleting everything after that. Only do deletions, no additions. Do not autocomplete any sentences. The end of output may seem abrupt, that is correct. Do not complete the last sentence.",
# 'suffix' : '_prompt_13',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START and ends at END. Remove references and footnotes. Only do edits, no additions.",
# 'suffix' : '_prompt_14',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START and ends at END. Remove references. Please capitalize headings, and add new lines before and after headings. Only do edits, no additions.",
# 'suffix' : '_prompt_14',  # NOTE(review): same suffix as the previous entry although the prompt differs -- confirm
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START and ends at END. Remove references. Remove references both twoards the end of the page and inline. References are often in parantheses and contain the word see. Please capitalize headings, and add new lines before and after headings. Only do edits, no additions.",
# 'suffix' : '_prompt_12_3_1',
# 'pre_prompt' : "Produce an exact replica of the text provided. The text starts at START. Remove references. Remove isolated digits and numbers. Remove newlines and line breaks except after headings. Only do edits, no additions. Do not autocomplete any sentences. The end of output may seem abrupt, that is correct. Do not complete the last sentence.",