Skip to content

Commit

Permalink
fix test index name
Browse files Browse the repository at this point in the history
  • Loading branch information
DonHaul committed Oct 2, 2024
1 parent ef676a2 commit ce53808
Show file tree
Hide file tree
Showing 26 changed files with 376 additions and 268 deletions.
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Pre-commit hook configuration (https://pre-commit.com).
# NOTE(review): the indentation of this file was lost in extraction; the
# two-space nesting below restores the only structure that parses as a valid
# pre-commit config while keeping every key/value byte-identical.
repos:
  # General-purpose file-hygiene hooks maintained by the pre-commit project.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: check-yaml              # validate YAML files parse
      - id: end-of-file-fixer       # ensure files end with exactly one newline
      - id: trailing-whitespace     # strip trailing whitespace
      - id: fix-byte-order-marker   # remove UTF-8 byte-order marks
      - id: mixed-line-ending       # normalise mixed CRLF/LF endings
      - id: name-tests-test         # enforce test file naming convention
        args: [--pytest-test-first] # require test_*.py (not *_test.py)
        # The negative lookahead matches every path NOT under factories/,
        # and `exclude` skips matched paths — so this hook effectively runs
        # ONLY on files under factories/. Presumably intentional for this
        # repo's layout — TODO confirm with the commit author.
        exclude: '^(?!factories/)'
  # Ruff linter (auto-fixing enabled, including unsafe fixes).
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.5.6
    hooks:
      - id: ruff
        args: [--fix, --unsafe-fixes]
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ FROM python:3.8
RUN apt update && apt install poppler-utils -y
COPY setup.py setup.cfg README.rst ./
COPY refextract refextract/
RUN python setup.py install
RUN python setup.py install
ENV PROMETHEUS_MULTIPROC_DIR='/tmp'
ENTRYPOINT exec gunicorn -b :5000 --access-logfile - --error-logfile - refextract.app:app --timeout 650
106 changes: 106 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "163771b1-17d9-4648-875c-63f1a54c9201",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"6\n"
]
}
],
"source": [
"real_index = 0\n",
"s = \"sdasdas\"\n",
"\n",
"for real_index, char in enumerate(s):\n",
" print(real_index)\n",
"\n",
"print(real_index)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6132dad4-7fce-4719-beea-693eb32eed16",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'asdsad'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"asdsad\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "d72da078-f2c3-4879-a1a1-7557688ee727",
"metadata": {},
"outputs": [],
"source": [
"path = \"adsad\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4497ac16-b4fd-407a-b567-2b5a67ec5d55",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wow\n"
]
}
],
"source": [
"if path.startswith:\n",
" print(\"wow\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 1 addition & 1 deletion refextract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

"""Refextract."""

from .references.api import (
from refextract.references.api import (
extract_journal_reference,
extract_references_from_file,
extract_references_from_string,
Expand Down
11 changes: 6 additions & 5 deletions refextract/app.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import logging

from flask import Flask, jsonify, make_response
from prometheus_flask_exporter.multiprocess import \
GunicornInternalPrometheusMetrics
from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics
from webargs import fields
from webargs.flaskparser import FlaskParser

from refextract.references.api import (extract_journal_reference,
extract_references_from_string,
extract_references_from_url)
from refextract.references.api import (
extract_journal_reference,
extract_references_from_string,
extract_references_from_url,
)

parser = FlaskParser()

Expand Down
49 changes: 20 additions & 29 deletions refextract/authors/regexs.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import logging
import re

from ..references.config import CFG_REFEXTRACT_KBS
from refextract.references.config import CFG_REFEXTRACT_KBS

LOGGER = logging.getLogger(__name__)

Expand All @@ -42,10 +42,7 @@ def get_author_affiliation_numeration_str(punct=None):
re_number = r'(?:\d\d?)'
re_chained_numbers = r"(?:(?:[,;]\s*%s\.?\s*))*" % re_number
# Punctuation surrounding the number, either general or specific again
if punct is None:
re_punct = r"(?:[\{\(\[]?)"
else:
re_punct = re.escape(punct)
re_punct = '(?:[\\{\\(\\[]?)' if punct is None else re.escape(punct)

# Generic number finder (MUST NOT INCLUDE NAMED GROUPS!!!)
numeration_str = r"""
Expand Down Expand Up @@ -86,10 +83,7 @@ def get_initial_surname_author_pattern(incl_numeration=False):
@return (string): The 'Initials Surname' author pattern."""
# Possible inclusion of superscript numeration at the end of author names
# Will match the empty string
if incl_numeration:
append_num_re = get_author_affiliation_numeration_str() + '?'
else:
append_num_re = ""
append_num_re = get_author_affiliation_numeration_str() + '?' if incl_numeration else ''

return r"""
(?:
Expand Down Expand Up @@ -137,10 +131,7 @@ def get_surname_initial_author_pattern(incl_numeration=False):
@return (string): The 'Surname Initials' author pattern."""
# Possible inclusion of superscript numeration at the end of author names
# Will match the empty string
if incl_numeration:
append_num_re = get_author_affiliation_numeration_str() + '?'
else:
append_num_re = ""
append_num_re = get_author_affiliation_numeration_str() + '?' if incl_numeration else ''

return r"""
(?:
Expand Down Expand Up @@ -410,27 +401,27 @@ def add_to_auth_list(s):
fpath = CFG_REFEXTRACT_KBS['collaborations']

try:
fh = open(fpath, "r")
with open(fpath, 'r') as fh:
for line_num, rawline in enumerate(fh):
try:
rawline = rawline.decode("utf-8")
except UnicodeError:
LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
raise UnicodeError(
"Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
if rawline.strip() and rawline[0].strip() != '#':
add_to_auth_list(rawline)
# Shorten collaboration to 'coll'
if rawline.lower().endswith('collaboration\n'):
coll_version = rawline[:rawline.lower().find(
u'collaboration\n')] + r"coll[\.\,]"
add_to_auth_list(
coll_version.strip().replace(' ', r'\s') + u"s?")
except IOError:
# problem opening KB for reading, or problem while reading from it:
LOGGER.debug(u"Error: Could not build knowledge base containing author patterns - failed to read from KB %s s.\n", fpath)
raise IOError("Error: Unable to open collaborations kb '%s'" % fpath)

for line_num, rawline in enumerate(fh):
try:
rawline = rawline.decode("utf-8")
except UnicodeError:
LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
raise UnicodeError(
"Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
if rawline.strip() and rawline[0].strip() != '#':
add_to_auth_list(rawline)
# Shorten collaboration to 'coll'
if rawline.lower().endswith('collaboration\n'):
coll_version = rawline[:rawline.lower().find(
u'collaboration\n')] + r"coll[\.\,]"
add_to_auth_list(
coll_version.strip().replace(' ', r'\s') + u"s?")

author_match_re = ""
if len(auths) > 0:
Expand Down
7 changes: 2 additions & 5 deletions refextract/documents/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
import re
import subprocess

from ..references.config import CFG_PATH_PDFTOTEXT
from refextract.references.config import CFG_PATH_PDFTOTEXT

LOGGER = logging.getLogger(__name__)

Expand All @@ -56,10 +56,7 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
if not os.path.isfile(CFG_PATH_PDFTOTEXT):
raise IOError('Missing pdftotext executable')

if keep_layout:
layout_option = "-layout"
else:
layout_option = "-raw"
layout_option = '-layout' if keep_layout else '-raw'
doclines = []
# Pattern to check for lines with a leading page-break character.
# If this pattern is matched, we want to split the page-break into
Expand Down
15 changes: 5 additions & 10 deletions refextract/documents/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

import re


re_space_comma = re.compile(r'\s,', re.UNICODE)
re_space_semicolon = re.compile(r'\s;', re.UNICODE)
re_space_period = re.compile(r'\s\.', re.UNICODE)
Expand Down Expand Up @@ -264,12 +263,8 @@ def get_number_header_lines(docbody, page_break_posns):
# pattern to search for a word in a line:
p_wordSearch = re.compile(r'([A-Za-z0-9-]+)', re.UNICODE)
if remaining_breaks > 2:
if remaining_breaks > 3:
# Only check odd page headers
next_head = 2
else:
# Check headers on each page
next_head = 1
# Only check odd page headers else check headers on each page
next_head = 2 if remaining_breaks > 3 else 1
keep_checking = 1
while keep_checking:
cur_break = 1
Expand Down Expand Up @@ -406,7 +401,7 @@ def strip_headers_footers_pagebreaks(docbody,
for i in range(0, len(page_break_posns)):
# Unless this is the last page break, chop headers
if not first:
for dummy in range(1, num_head_lines + 1):
for _dummy in range(1, num_head_lines + 1):
docbody[page_break_posns[i] +
1:page_break_posns[i] + 2] = []
else:
Expand All @@ -415,7 +410,7 @@ def strip_headers_footers_pagebreaks(docbody,
docbody[page_break_posns[i]:page_break_posns[i] + 1] = []
# Chop footers (unless this is the first page break)
if i != len(page_break_posns) - 1:
for dummy in range(1, num_foot_lines + 1):
for _dummy in range(1, num_foot_lines + 1):
docbody[page_break_posns[i] -
num_foot_lines:page_break_posns[i] -
num_foot_lines + 1] = []
Expand All @@ -429,7 +424,7 @@ def check_boundary_lines_similar(l_1, l_2):
@return: (int) 1/0.
"""
num_matches = 0
if (type(l_1) != list) or (type(l_2) != list) or (len(l_1) != len(l_2)):
if not isinstance(l_1, list) or not isinstance(l_2, list) or len(l_1) != len(l_2):
# these 'boundaries' are not similar
return 0

Expand Down
26 changes: 15 additions & 11 deletions refextract/references/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,25 +29,29 @@
"""

import os
import requests
import magic

from tempfile import mkstemp

import magic
import requests
from inspire_utils.dedupers import dedupe_list

from .engine import (
from refextract.references.engine import (
get_kbs,
get_plaintext_document_body,
parse_reference_line,
parse_references,
)
from .errors import FullTextNotAvailableError
from .find import (find_numeration_in_body,
get_reference_section_beginning)
from .pdf import extract_texkeys_and_urls_from_pdf
from .text import extract_references_from_fulltext, rebuild_reference_lines
from .record import update_reference_with_urls
from refextract.references.errors import FullTextNotAvailableError
from refextract.references.find import (
find_numeration_in_body,
get_reference_section_beginning,
)
from refextract.references.pdf import extract_texkeys_and_urls_from_pdf
from refextract.references.record import update_reference_with_urls
from refextract.references.text import (
extract_references_from_fulltext,
rebuild_reference_lines,
)


def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
Expand Down Expand Up @@ -146,7 +150,7 @@ def extract_references_from_file(path,
extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path)
if len(extracted_texkeys_urls) == len(parsed_refs):
parsed_refs_updated = []
for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls):
for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls, strict=False):
update_reference_with_urls(ref, ref_texkey_urls.get('urls', []))
if ref.get('url'):
ref['url'] = dedupe_list(ref['url'])
Expand Down
Loading

0 comments on commit ce53808

Please sign in to comment.