Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lib test #5

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@ config.env
logs/
output/
cache/

.idea
load_env.bash
__pycache__/
9 changes: 4 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
FROM python:3.7

WORKDIR /usr/src/app
RUN mkdir output
RUN mkdir cache
VOLUME ["/usr/src/app/output"]
VOLUME ["/usr/src/app/cache"]
RUN mkdir output cache

VOLUME ["/usr/src/app/output", "/usr/src/app/cache"]

COPY ./config/requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt && \
rm requirements.txt

COPY ./src/* ./

RUN flake8 --ignore=E221,E241 ./*.py ./run ./annotate_download_logs
RUN flake8 ./*

COPY ./config/spiders ./

Expand Down
3 changes: 3 additions & 0 deletions config/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
flake8==3.6.0
httplib2==0.12.1

# New requirements
hirmeos-clients>=0.1.6
89 changes: 59 additions & 30 deletions src/annotate_download_logs
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,36 @@ Additionally, we convert the URLs to ISBNs and collate request data by date,
outputting a CSV for ingest via the stats system.
"""

import os
import csv
import sys
import datetime
import httplib2
import json
import os
import sys
import time
import httplib2
import datetime
from typing import Iterator, Tuple

# Deprecated since version 3.2: https://docs.python.org/3/library/optparse.html
from optparse import OptionParser

from hirmeos_clients import TokenClient

from geolookup import GeoLookup


# Runtime configuration, read once from the environment at import time.
# Names looked up with os.environ[...] are mandatory (KeyError when missing);
# those read with .get()/getenv() are optional.
SESSION_TIMEOUT = int(os.environ['SESSION_TIMEOUT'])
# Environment values are always strings, so the truthy spellings must be
# strings too: the previous int `1` could never match.
ROLLOVER = os.environ['ROLLOVER'] in ('True', 'true', 't', '1')
URI_API_ENDP = os.environ['URI_API_ENDP']
URI_API_USER = os.environ['URI_API_USER']
URI_API_PASS = os.environ['URI_API_PASS']
AUTH_API_ENDP = os.environ['AUTH_API_ENDP']
URI_SCHEME = os.environ['URI_SCHEME']
URI_STRICT = os.environ['URI_STRICT']
GEO_LOOKUP_PATH = os.environ.get('GEO_LOOKUP_PATH', None)
# JSON-encoded collection of URLs; default to an empty list so an unset
# variable no longer crashes json.loads(None) with a TypeError.
EXCLUDED_URLS = json.loads(os.getenv('EXCLUDED_URLS', '[]'))
# In-process memo of URL lookups (see url_to_id).
CACHE = {}
# When set, tokens are built locally through TokenClient (see get_token).
TOKENS_KEY = os.getenv('TOKENS_KEY')

ARGS = [
{
'val': '--measure',
Expand All @@ -53,52 +61,69 @@ ARGS = [
]


def get_token(url: str, email: str, passwd: str) -> str:
    """Return an authorization token.

    When TOKENS_KEY is set the token comes from ``tokens_client()`` and no
    HTTP request is made. Otherwise the credentials are POSTed as JSON to
    ``url`` and the token is read from ``data[0]['token']`` of the response.

    Args:
        url: authentication endpoint to POST the credentials to.
        email: account e-mail sent as the ``email`` field.
        passwd: account password sent as the ``password`` field.

    Returns:
        The token value (annotated as ``str``; presumably a string in both
        code paths - confirm against TokenClient).

    Raises:
        ValueError: if the endpoint does not answer with status 200; the
            message is the decoded response body.
    """
    if TOKENS_KEY:
        return tokens_client()

    http = httplib2.Http()
    credentials = {'email': email, 'password': passwd}
    headers = {'content-type': 'application/json'}
    res, content = http.request(url, 'POST', json.dumps(credentials), headers)
    # Explicit check rather than `assert`: assertions are removed under
    # `python -O`, which would let a failed login fall through to the parse.
    if res.status != 200:
        raise ValueError(content.decode('utf-8'))
    return json.loads(content.decode('utf-8'))['data'][0]['token']


def get_options(args):
def tokens_client():
    """Return a token produced by ``TokenClient`` from TOKENS_KEY.

    Stop-gap alternative to requesting a token from the tokens API.
    """
    return TokenClient(tokens_key=TOKENS_KEY).token


def get_options(args: list) -> object:
    """Parse the command line according to the option specs in ``args``.

    Args:
        args: list of dicts, each with the keys ``val`` (the option string),
            ``dest``, ``default``, ``action`` and ``help``, passed straight
            to ``OptionParser.add_option``.

    Returns:
        The options object produced by ``OptionParser.parse_args()``, which
        reads ``sys.argv``.

    Exits with status 2 and a usage message (via ``parser.error``) when
    positional arguments are present or ``--measure`` is missing/empty.
    """
    parser = OptionParser()
    for arg in args:
        parser.add_option(
            arg['val'],
            dest=arg['dest'],
            default=arg['default'],
            action=arg['action'],
            help=arg['help']
        )
    options, rest = parser.parse_args()

    # Validate explicitly: `assert` is stripped under `python -O` and gives
    # the user a bare traceback instead of a usage message.
    if rest:
        parser.error(f"unexpected positional arguments: {' '.join(rest)}")
    if not options.measure:
        parser.error("a value for the measure option is required")
    return options


def url_to_id(url: str, timestamp: str) -> list:
    """Resolve ``url`` through the URI API, memoising results in CACHE.

    Args:
        url: the URL to look up; sent as the ``uri`` query parameter.
        timestamp: only used in the error message when the lookup fails
            (presumably the log line's timestamp - confirm against caller).

    Returns:
        The ``data`` member of the API response, or ``[]`` when the API
        rejects a URL listed in EXCLUDED_URLS (that result is not cached).

    On any other non-200 response the API's message is written to stderr
    and the process exits with status 1.
    """
    if url in CACHE:
        return CACHE[url]
    req = (
        f"{URI_API_ENDP}?uri={url}&filter=uri_scheme:"
        f"{URI_SCHEME}&strict={URI_STRICT}"
    )
    http = httplib2.Http()
    res, content = http.request(req, 'GET', headers={'Authorization': AUTH})
    # Explicit check rather than `assert`, which is stripped under `-O`.
    if res.status != 200:
        if url in EXCLUDED_URLS:
            return []
        r = json.loads(content.decode('utf-8'))
        message = r['message']
        uri = r['parameters']['uri']
        # Diagnostics must go to stderr: stdout carries the CSV output.
        print(f"{message}: {uri} ({timestamp})", file=sys.stderr)
        sys.exit(1)
    entry = json.loads(content.decode('utf-8'))['data']
    CACHE[url] = entry
    return entry


def resolve(get_id):
def resolve(get_id: str) -> Iterator[Tuple]:
"""
Read in CSV data from stdin; lazily return a stream of tuples of
type: (timestamp * ip_address * uri * str), where the final <str>
Expand All @@ -107,9 +132,9 @@ def resolve(get_id):

`get_id` is a callback of type (url -> timestamp -> node | None)
"""
r = csv.reader(sys.stdin)
stream = csv.reader(sys.stdin)

for timestamp, ip_address, url, agent in r:
for timestamp, ip_address, url, agent in stream:
identifiers = get_id(url, timestamp)

excluded = identifiers == []
Expand All @@ -122,7 +147,11 @@ def resolve(get_id):
yield (ds, ip_address, identifiers, agent)


def strip_sessions(get_id, session_timeout, rollover):
def strip_sessions(
get_id: str,
session_timeout: int,
rollover: int
) -> Iterator:
"""
Take a lazy stream whose items are of type:
(timestamp * ip_address * uri * str)
Expand Down Expand Up @@ -181,7 +210,7 @@ def project_hits():
return hits


def run(measure, add_headers):
def run(measure: object, add_headers: object) -> None:
hits = project_hits()

w = csv.writer(sys.stdout)
Expand Down
9 changes: 4 additions & 5 deletions src/geolookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


class GeoLookup(object):
def __init__(self, path):
def __init__(self, path: str) -> None:
if path is None:
self.db = None
return
Expand All @@ -26,7 +26,7 @@ def __init__(self, path):
'''
self.prefix = 'urn:iso:std:3166:-2:'

def lookup_country(self, ip_address, date):
def lookup_country(self, ip_address: str, date: datetime) -> str:
if self.db is None:
return ''
time_now = date.timestamp()
Expand All @@ -37,11 +37,10 @@ def lookup_country(self, ip_address, date):
row = self.cursor.fetchone()
if row is None:
return ''
else:
return self.prefix + row[0]
return self.prefix + row[0]


def run():
def run() -> None:
_, ip_address, timestamp = sys.argv
ts = time.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
ds = datetime.datetime(*ts[:6])
Expand Down
Loading