Skip to content

Commit

Permalink
fix test index name
Browse files Browse the repository at this point in the history
  • Loading branch information
DonHaul committed Oct 2, 2024
1 parent ef676a2 commit ce53808
Show file tree
Hide file tree
Showing 26 changed files with 376 additions and 268 deletions.
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Pre-commit hook configuration (https://pre-commit.com).
# NOTE(review): the indentation of this file was lost in extraction; the
# two-space nesting below restores the only structure that parses as a valid
# pre-commit config while keeping every key/value byte-identical.
repos:
  # General-purpose file-hygiene hooks maintained by the pre-commit project.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: check-yaml              # validate YAML files parse
      - id: end-of-file-fixer       # ensure files end with exactly one newline
      - id: trailing-whitespace     # strip trailing whitespace
      - id: fix-byte-order-marker   # remove UTF-8 byte-order marks
      - id: mixed-line-ending       # normalise mixed CRLF/LF endings
      - id: name-tests-test         # enforce test file naming convention
        args: [--pytest-test-first] # require test_*.py (not *_test.py)
        # The negative lookahead matches every path NOT under factories/,
        # and `exclude` skips matched paths — so this hook effectively runs
        # ONLY on files under factories/. Presumably intentional for this
        # repo's layout — TODO confirm with the commit author.
        exclude: '^(?!factories/)'
  # Ruff linter (auto-fixing enabled, including unsafe fixes).
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.5.6
    hooks:
      - id: ruff
        args: [--fix, --unsafe-fixes]
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ FROM python:3.8
RUN apt update && apt install poppler-utils -y
COPY setup.py setup.cfg README.rst ./
COPY refextract refextract/
RUN python setup.py install
RUN python setup.py install
ENV PROMETHEUS_MULTIPROC_DIR='/tmp'
ENTRYPOINT exec gunicorn -b :5000 --access-logfile - --error-logfile - refextract.app:app --timeout 650
106 changes: 106 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "163771b1-17d9-4648-875c-63f1a54c9201",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"6\n"
]
}
],
"source": [
"real_index = 0\n",
"s = \"sdasdas\"\n",
"\n",
"for real_index, char in enumerate(s):\n",
" print(real_index)\n",
"\n",
"print(real_index)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6132dad4-7fce-4719-beea-693eb32eed16",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'asdsad'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"asdsad\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "d72da078-f2c3-4879-a1a1-7557688ee727",
"metadata": {},
"outputs": [],
"source": [
"path = \"adsad\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4497ac16-b4fd-407a-b567-2b5a67ec5d55",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wow\n"
]
}
],
"source": [
"if path.startswith:\n",
" print(\"wow\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 1 addition & 1 deletion refextract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

"""Refextract."""

from .references.api import (
from refextract.references.api import (
extract_journal_reference,
extract_references_from_file,
extract_references_from_string,
Expand Down
11 changes: 6 additions & 5 deletions refextract/app.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import logging

from flask import Flask, jsonify, make_response
from prometheus_flask_exporter.multiprocess import \
GunicornInternalPrometheusMetrics
from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics
from webargs import fields
from webargs.flaskparser import FlaskParser

from refextract.references.api import (extract_journal_reference,
extract_references_from_string,
extract_references_from_url)
from refextract.references.api import (
extract_journal_reference,
extract_references_from_string,
extract_references_from_url,
)

parser = FlaskParser()

Expand Down
49 changes: 20 additions & 29 deletions refextract/authors/regexs.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import logging
import re

from ..references.config import CFG_REFEXTRACT_KBS
from refextract.references.config import CFG_REFEXTRACT_KBS

LOGGER = logging.getLogger(__name__)

Expand All @@ -42,10 +42,7 @@ def get_author_affiliation_numeration_str(punct=None):
re_number = r'(?:\d\d?)'
re_chained_numbers = r"(?:(?:[,;]\s*%s\.?\s*))*" % re_number
# Punctuation surrounding the number, either general or specific again
if punct is None:
re_punct = r"(?:[\{\(\[]?)"
else:
re_punct = re.escape(punct)
re_punct = '(?:[\\{\\(\\[]?)' if punct is None else re.escape(punct)

# Generic number finder (MUST NOT INCLUDE NAMED GROUPS!!!)
numeration_str = r"""
Expand Down Expand Up @@ -86,10 +83,7 @@ def get_initial_surname_author_pattern(incl_numeration=False):
@return (string): The 'Initials Surname' author pattern."""
# Possible inclusion of superscript numeration at the end of author names
# Will match the empty string
if incl_numeration:
append_num_re = get_author_affiliation_numeration_str() + '?'
else:
append_num_re = ""
append_num_re = get_author_affiliation_numeration_str() + '?' if incl_numeration else ''

return r"""
(?:
Expand Down Expand Up @@ -137,10 +131,7 @@ def get_surname_initial_author_pattern(incl_numeration=False):
@return (string): The 'Surname Initials' author pattern."""
# Possible inclusion of superscript numeration at the end of author names
# Will match the empty string
if incl_numeration:
append_num_re = get_author_affiliation_numeration_str() + '?'
else:
append_num_re = ""
append_num_re = get_author_affiliation_numeration_str() + '?' if incl_numeration else ''

return r"""
(?:
Expand Down Expand Up @@ -410,27 +401,27 @@ def add_to_auth_list(s):
fpath = CFG_REFEXTRACT_KBS['collaborations']

try:
fh = open(fpath, "r")
with open(fpath, 'r') as fh:
for line_num, rawline in enumerate(fh):
try:
rawline = rawline.decode("utf-8")
except UnicodeError:
LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
raise UnicodeError(
"Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
if rawline.strip() and rawline[0].strip() != '#':
add_to_auth_list(rawline)
# Shorten collaboration to 'coll'
if rawline.lower().endswith('collaboration\n'):
coll_version = rawline[:rawline.lower().find(
u'collaboration\n')] + r"coll[\.\,]"
add_to_auth_list(
coll_version.strip().replace(' ', r'\s') + u"s?")
except IOError:
# problem opening KB for reading, or problem while reading from it:
LOGGER.debug(u"Error: Could not build knowledge base containing author patterns - failed to read from KB %s s.\n", fpath)
raise IOError("Error: Unable to open collaborations kb '%s'" % fpath)

for line_num, rawline in enumerate(fh):
try:
rawline = rawline.decode("utf-8")
except UnicodeError:
LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
raise UnicodeError(
"Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
if rawline.strip() and rawline[0].strip() != '#':
add_to_auth_list(rawline)
# Shorten collaboration to 'coll'
if rawline.lower().endswith('collaboration\n'):
coll_version = rawline[:rawline.lower().find(
u'collaboration\n')] + r"coll[\.\,]"
add_to_auth_list(
coll_version.strip().replace(' ', r'\s') + u"s?")

author_match_re = ""
if len(auths) > 0:
Expand Down
7 changes: 2 additions & 5 deletions refextract/documents/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
import re
import subprocess

from ..references.config import CFG_PATH_PDFTOTEXT
from refextract.references.config import CFG_PATH_PDFTOTEXT

LOGGER = logging.getLogger(__name__)

Expand All @@ -56,10 +56,7 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
if not os.path.isfile(CFG_PATH_PDFTOTEXT):
raise IOError('Missing pdftotext executable')

if keep_layout:
layout_option = "-layout"
else:
layout_option = "-raw"
layout_option = '-layout' if keep_layout else '-raw'
doclines = []
# Pattern to check for lines with a leading page-break character.
# If this pattern is matched, we want to split the page-break into
Expand Down
15 changes: 5 additions & 10 deletions refextract/documents/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

import re


re_space_comma = re.compile(r'\s,', re.UNICODE)
re_space_semicolon = re.compile(r'\s;', re.UNICODE)
re_space_period = re.compile(r'\s\.', re.UNICODE)
Expand Down Expand Up @@ -264,12 +263,8 @@ def get_number_header_lines(docbody, page_break_posns):
# pattern to search for a word in a line:
p_wordSearch = re.compile(r'([A-Za-z0-9-]+)', re.UNICODE)
if remaining_breaks > 2:
if remaining_breaks > 3:
# Only check odd page headers
next_head = 2
else:
# Check headers on each page
next_head = 1
# Only check odd page headers else check headers on each page
next_head = 2 if remaining_breaks > 3 else 1
keep_checking = 1
while keep_checking:
cur_break = 1
Expand Down Expand Up @@ -406,7 +401,7 @@ def strip_headers_footers_pagebreaks(docbody,
for i in range(0, len(page_break_posns)):
# Unless this is the last page break, chop headers
if not first:
for dummy in range(1, num_head_lines + 1):
for _dummy in range(1, num_head_lines + 1):
docbody[page_break_posns[i] +
1:page_break_posns[i] + 2] = []
else:
Expand All @@ -415,7 +410,7 @@ def strip_headers_footers_pagebreaks(docbody,
docbody[page_break_posns[i]:page_break_posns[i] + 1] = []
# Chop footers (unless this is the first page break)
if i != len(page_break_posns) - 1:
for dummy in range(1, num_foot_lines + 1):
for _dummy in range(1, num_foot_lines + 1):
docbody[page_break_posns[i] -
num_foot_lines:page_break_posns[i] -
num_foot_lines + 1] = []
Expand All @@ -429,7 +424,7 @@ def check_boundary_lines_similar(l_1, l_2):
@return: (int) 1/0.
"""
num_matches = 0
if (type(l_1) != list) or (type(l_2) != list) or (len(l_1) != len(l_2)):
if not isinstance(l_1, list) or not isinstance(l_2, list) or len(l_1) != len(l_2):
# these 'boundaries' are not similar
return 0

Expand Down
26 changes: 15 additions & 11 deletions refextract/references/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,25 +29,29 @@
"""

import os
import requests
import magic

from tempfile import mkstemp

import magic
import requests
from inspire_utils.dedupers import dedupe_list

from .engine import (
from refextract.references.engine import (
get_kbs,
get_plaintext_document_body,
parse_reference_line,
parse_references,
)
from .errors import FullTextNotAvailableError
from .find import (find_numeration_in_body,
get_reference_section_beginning)
from .pdf import extract_texkeys_and_urls_from_pdf
from .text import extract_references_from_fulltext, rebuild_reference_lines
from .record import update_reference_with_urls
from refextract.references.errors import FullTextNotAvailableError
from refextract.references.find import (
find_numeration_in_body,
get_reference_section_beginning,
)
from refextract.references.pdf import extract_texkeys_and_urls_from_pdf
from refextract.references.record import update_reference_with_urls
from refextract.references.text import (
extract_references_from_fulltext,
rebuild_reference_lines,
)


def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
Expand Down Expand Up @@ -146,7 +150,7 @@ def extract_references_from_file(path,
extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path)
if len(extracted_texkeys_urls) == len(parsed_refs):
parsed_refs_updated = []
for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls):
for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls, strict=False):
update_reference_with_urls(ref, ref_texkey_urls.get('urls', []))
if ref.get('url'):
ref['url'] = dedupe_list(ref['url'])
Expand Down
Loading

0 comments on commit ce53808

Please sign in to comment.