Commit a0ac7a1
fix test index name
DonHaul committed Oct 3, 2024
1 parent ef676a2 commit a0ac7a1
Showing 27 changed files with 727 additions and 517 deletions.
.pre-commit-config.yaml (17 additions, 0 deletions)
@@ -0,0 +1,17 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+      - id: fix-byte-order-marker
+      - id: mixed-line-ending
+      - id: name-tests-test
+        args: [ --pytest-test-first ]
+        exclude: '^(?!factories/)'
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.5.6
+    hooks:
+      - id: ruff
+        args: [ --fix , --unsafe-fixes]
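(For reference: once the pre-commit package is installed, the hooks above can be exercised across the whole tree before committing. A minimal sketch, driven from Python to stay in the repository's language; running `pre-commit run --all-files` from a shell is equivalent.)

    import subprocess

    # Apply every hook in .pre-commit-config.yaml to all tracked files,
    # exactly as `pre-commit run --all-files` would from the repo root.
    subprocess.run(["pre-commit", "run", "--all-files"], check=True)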
Dockerfile (1 addition, 1 deletion)
@@ -2,6 +2,6 @@ FROM python:3.8
 RUN apt update && apt install poppler-utils -y
 COPY setup.py setup.cfg README.rst ./
 COPY refextract refextract/
-RUN python setup.py install
+RUN python setup.py install
 ENV PROMETHEUS_MULTIPROC_DIR='/tmp'
 ENTRYPOINT exec gunicorn -b :5000 --access-logfile - --error-logfile - refextract.app:app --timeout 650
refextract/__init__.py (1 addition, 1 deletion)
@@ -23,7 +23,7 @@
 
 """Refextract."""
 
-from .references.api import (
+from refextract.references.api import (
     extract_journal_reference,
     extract_references_from_file,
     extract_references_from_string,
refextract/app.py (8 additions, 6 deletions)
@@ -1,14 +1,15 @@
 import logging
 
 from flask import Flask, jsonify, make_response
-from prometheus_flask_exporter.multiprocess import \
-    GunicornInternalPrometheusMetrics
+from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics
 from webargs import fields
 from webargs.flaskparser import FlaskParser
 
-from refextract.references.api import (extract_journal_reference,
-                                       extract_references_from_string,
-                                       extract_references_from_url)
+from refextract.references.api import (
+    extract_journal_reference,
+    extract_references_from_string,
+    extract_references_from_url,
+)
 
 parser = FlaskParser()
 
@@ -46,7 +47,8 @@ def extract_journal_info(args):
         return make_response(
             jsonify(
                 {
-                    "message": f"Can not extract publication info data. Reason: {str(e)}"
+                    "message":
+                        f"Can not extract publication info data. Reason: {str(e)}"
                 }
             ),
             500,
refextract/authors/regexs.py (124 additions, 98 deletions)

Large diffs are not rendered by default.

refextract/documents/pdf.py (2 additions, 5 deletions)
@@ -39,7 +39,7 @@
 import re
 import subprocess
 
-from ..references.config import CFG_PATH_PDFTOTEXT
+from refextract.references.config import CFG_PATH_PDFTOTEXT
 
 LOGGER = logging.getLogger(__name__)
 
@@ -56,10 +56,7 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
     if not os.path.isfile(CFG_PATH_PDFTOTEXT):
         raise IOError('Missing pdftotext executable')
 
-    if keep_layout:
-        layout_option = "-layout"
-    else:
-        layout_option = "-raw"
+    layout_option = '-layout' if keep_layout else '-raw'
     doclines = []
     # Pattern to check for lines with a leading page-break character.
     # If this pattern is matched, we want to split the page-break into
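(For context on the change above: the `-layout`/`-raw` switch selects how pdftotext linearises a page. A minimal standalone sketch of an equivalent call follows; the helper name and argument defaults are illustrative, not the module's actual interface.)

    import subprocess

    def pdf_to_text(fpath, keep_layout=False, pdftotext="pdftotext"):
        # "-layout" preserves the physical page layout, "-raw" emits text in
        # content-stream order; "-" sends the extracted text to stdout.
        layout_option = "-layout" if keep_layout else "-raw"
        result = subprocess.run(
            [pdftotext, layout_option, "-q", fpath, "-"],
            capture_output=True, text=True, check=True,
        )
        return result.stdout.splitlines()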
refextract/documents/text.py (7 additions, 11 deletions)
@@ -25,7 +25,6 @@
 
 import re
 
-
 re_space_comma = re.compile(r'\s,', re.UNICODE)
 re_space_semicolon = re.compile(r'\s;', re.UNICODE)
 re_space_period = re.compile(r'\s\.', re.UNICODE)
@@ -34,7 +33,8 @@
 re_space_closing_square_bracket = re.compile(r'\s\]', re.UNICODE)
 re_opening_square_bracket_space = re.compile(r'\[\s', re.UNICODE)
 re_hyphens = re.compile(
-    br'(\\255|\u02D7|\u0335|\u0336|\u2212|\u002D|\uFE63|\uFF0D)'.decode('raw_unicode_escape'), re.UNICODE)
+    br'(\\255|\u02D7|\u0335|\u0336|\u2212|\u002D|\uFE63|\uFF0D)'
+    .decode('raw_unicode_escape'), re.UNICODE)
 re_multiple_space = re.compile(r'\s{2,}', re.UNICODE)
 
 re_group_captured_multiple_space = re.compile(r'(\s{2,})', re.UNICODE)
@@ -264,12 +264,8 @@ def get_number_header_lines(docbody, page_break_posns):
     # pattern to search for a word in a line:
     p_wordSearch = re.compile(r'([A-Za-z0-9-]+)', re.UNICODE)
     if remaining_breaks > 2:
-        if remaining_breaks > 3:
-            # Only check odd page headers
-            next_head = 2
-        else:
-            # Check headers on each page
-            next_head = 1
+        # Only check odd page headers else check headers on each page
+        next_head = 2 if remaining_breaks > 3 else 1
         keep_checking = 1
         while keep_checking:
             cur_break = 1
@@ -406,7 +402,7 @@ def strip_headers_footers_pagebreaks(docbody,
     for i in range(0, len(page_break_posns)):
         # Unless this is the last page break, chop headers
         if not first:
-            for dummy in range(1, num_head_lines + 1):
+            for _dummy in range(1, num_head_lines + 1):
                 docbody[page_break_posns[i] +
                         1:page_break_posns[i] + 2] = []
         else:
@@ -415,7 +411,7 @@
             docbody[page_break_posns[i]:page_break_posns[i] + 1] = []
         # Chop footers (unless this is the first page break)
         if i != len(page_break_posns) - 1:
-            for dummy in range(1, num_foot_lines + 1):
+            for _dummy in range(1, num_foot_lines + 1):
                 docbody[page_break_posns[i] -
                         num_foot_lines:page_break_posns[i] -
                         num_foot_lines + 1] = []
@@ -429,7 +425,7 @@ def check_boundary_lines_similar(l_1, l_2):
     @return: (int) 1/0.
     """
     num_matches = 0
-    if (type(l_1) != list) or (type(l_2) != list) or (len(l_1) != len(l_2)):
+    if not isinstance(l_1, list) or not isinstance(l_2, list) or len(l_1) != len(l_2):
         # these 'boundaries' are not similar
         return 0
 
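(A quick illustration of what the isinstance() rewrite in check_boundary_lines_similar buys over an exact type comparison; TrackedList is a hypothetical subclass, not from this codebase.)

    class TrackedList(list):
        """Hypothetical list subclass, e.g. one that logs mutations."""

    boundary = TrackedList(["Journal of X, Vol. 1", "Page 1"])
    print(type(boundary) == list)      # False: exact-type test rejects subclasses
    print(isinstance(boundary, list))  # True: a subclass still counts as a list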
refextract/references/api.py (25 additions, 15 deletions)
@@ -29,25 +29,29 @@
 """
 
 import os
-import requests
-import magic
 
 from tempfile import mkstemp
 
+import magic
+import requests
 from inspire_utils.dedupers import dedupe_list
 
-from .engine import (
+from refextract.references.engine import (
     get_kbs,
     get_plaintext_document_body,
     parse_reference_line,
     parse_references,
 )
-from .errors import FullTextNotAvailableError
-from .find import (find_numeration_in_body,
-                   get_reference_section_beginning)
-from .pdf import extract_texkeys_and_urls_from_pdf
-from .text import extract_references_from_fulltext, rebuild_reference_lines
-from .record import update_reference_with_urls
+from refextract.references.errors import FullTextNotAvailableError
+from refextract.references.find import (
+    find_numeration_in_body,
+    get_reference_section_beginning,
+)
+from refextract.references.pdf import extract_texkeys_and_urls_from_pdf
+from refextract.references.record import update_reference_with_urls
+from refextract.references.text import (
+    extract_references_from_fulltext,
+    rebuild_reference_lines,
+)
 
 
 def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
@@ -71,7 +75,8 @@ def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
     To override KBs for journal names etc., use ``override_kbs_files``:
 
-    >>> extract_references_from_url(path, override_kbs_files={'journals': 'my/path/to.kb'})
+    >>> extract_references_from_url(path,
+                                    override_kbs_files={'journals': 'my/path/to.kb'})
     """
     # Get temporary filepath to download to
@@ -122,7 +127,8 @@ def extract_references_from_file(path,
     To override KBs for journal names etc., use ``override_kbs_files``:
 
-    >>> extract_references_from_file(path, override_kbs_files={'journals': 'my/path/to.kb'})
+    >>> extract_references_from_file(path,
+                                     override_kbs_files={'journals': 'my/path/to.kb'})
     """
     if not os.path.isfile(path):
@@ -146,11 +152,14 @@ extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path)
         extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path)
         if len(extracted_texkeys_urls) == len(parsed_refs):
             parsed_refs_updated = []
-            for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls):
+            for ref, ref_texkey_urls in zip(parsed_refs,
+                                            extracted_texkeys_urls,
+                                            strict=False):
                 update_reference_with_urls(ref, ref_texkey_urls.get('urls', []))
                 if ref.get('url'):
                     ref['url'] = dedupe_list(ref['url'])
-                parsed_refs_updated.append(dict(ref, texkey=[ref_texkey_urls['texkey']]))
+                parsed_refs_updated.append(dict(ref,
+                                                texkey=[ref_texkey_urls['texkey']]))
 
         return parsed_refs_updated
     return parsed_refs
@@ -182,7 +191,8 @@ def extract_references_from_string(source,
     To override KBs for journal names etc., use ``override_kbs_files``:
 
-    >>> extract_references_from_string(path, override_kbs_files={'journals': 'my/path/to.kb'})
+    >>> extract_references_from_string(path,
+                                       override_kbs_files={'journals': 'my/path/to.kb'})
     """
     docbody = source.split('\n')
     if not is_only_references:
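(For orientation, a minimal usage sketch of the public API these modules feed; the sample reference string is illustrative.)

    from refextract import extract_references_from_string

    # Returns a list of dicts, one per parsed reference line.
    references = extract_references_from_string(
        "[1] J. Maldacena, Adv. Theor. Math. Phys. 2 (1998) 231."
    )
    print(references)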
