Generate Bloom Filters #1102
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate Bloom filters with submitted Hacker News stories periodically and | |
# on-command for use in the accompanying browser extension. | |
# | |
# Created by Jacob Strieb | |
# January 2021 | |
name: Generate Bloom Filters | |
# Run once every 24 hours because the BigQuery database is updated once every | |
# 24 hours. Also run when manually triggered. | |
on: | |
schedule: | |
# This time is based loosely around when the BigQuery data updates | |
- cron: "5 16 * * *" | |
workflow_dispatch: | |
jobs: | |
create-bloomfilter: | |
name: Create Bloom Filters | |
runs-on: ubuntu-latest | |
steps: | |
- name: Install dependencies | |
run: | | |
sudo apt-get install jo | |
- uses: actions/checkout@v2 | |
- name: Set up BigQuery | |
run: | | |
# This BQ_JSON key is obtained by downloading the JSON key for a | |
# service account on BigQuery. It is then copied directly into a | |
# GitHub Actions secret (under Settings) | |
echo '${{ secrets.BQ_JSON }}' > bq.json | |
gcloud auth \ | |
activate-service-account \ | |
--key-file bq.json | |
# Unfortunately, bq outputs some dumb shit the first time we run it | |
# with the new authentication, so we do a dummy call here to prevent | |
# it from outputting the annoying welcome message directly into the | |
# CSV we are attempting to process | |
bq query \ | |
--project_id=hacker-news-analysis-300719 \ | |
--use_legacy_sql=false \ | |
'SELECT COUNT(*) FROM `bigquery-public-data.hacker_news.full` | |
WHERE type = "story"' | |
- name: Compile Bloom filter creation program | |
run: | | |
make create | |
- name: Pull BigQuery data | |
run: | | |
# Note that max_rows must be an unsigned, 32-bit int | |
bq query \ | |
--project_id=hacker-news-analysis-300719 \ | |
--format csv \ | |
--max_rows 99999999 \ | |
--use_legacy_sql=false \ | |
'SELECT | |
url, | |
score, | |
descendants, | |
time | |
FROM | |
`bigquery-public-data.hacker_news.full` | |
WHERE | |
type = "story" | |
AND dead IS NOT TRUE | |
AND deleted IS NOT TRUE | |
AND url != ""' \ | |
> data.csv | |
head -n 50 data.csv | |
# Put the data in a sqlite3 database | |
sqlite3 \ | |
data.db \ | |
".mode csv" \ | |
".import data.csv hn" | |
- name: Generate Bloom Filters | |
run: | | |
# Get the time of the last submission | |
LATEST=$(sqlite3 \ | |
-csv \ | |
data.db \ | |
"SELECT CAST(time AS INT) AS inttime | |
FROM hn ORDER BY inttime DESC LIMIT 1") | |
# TODO: Make this accurate -- not sure why but it's inaccurate now | |
printf \ | |
"Last story submitted %d hours ago\n" \ | |
$((($(date +%s) - LATEST) / 3600)) | |
# Generate one Bloom filter for each date range/threshold combination | |
# Used as sqlite3 date modifiers: | |
# https://sqlite.org/lang_datefunc.html | |
DATE_RANGES=( | |
# "-1 month" | |
"-7 days" | |
"-24 hours" | |
) | |
THRESHOLDS=( | |
0 | |
10 | |
75 | |
250 | |
500 | |
) | |
# Apply the threshold to points (score) or comments (descendants) | |
THRESHOLD_KEY="score" | |
mkdir generated | |
for THRESHOLD in "${THRESHOLDS[@]}"; do | |
FILENAME="hn-$THRESHOLD.bloom" | |
# Make a Bloom filter for each threshold with no time restriction | |
echo "Creating bloom filter for stories from all time and" \ | |
"$THRESHOLD+ $THRESHOLD_KEY. Writing to $FILENAME..." | |
sqlite3 \ | |
-header \ | |
-csv \ | |
data.db \ | |
"SELECT * FROM hn | |
WHERE CAST($THRESHOLD_KEY AS INT) >= $THRESHOLD" \ | |
| python3 canonicalize.py \ | |
| bin/bloom-create "generated/$FILENAME" | |
# For the current threshold, make a bloom filter for each date range | |
for DATE_RANGE in "${DATE_RANGES[@]}"; do | |
DATE_RANGE_STR="$(echo $DATE_RANGE | sed 's/[- ]//g')" | |
FILENAME="hn-$DATE_RANGE_STR-$THRESHOLD.bloom" | |
echo "Creating bloom filter for $DATE_RANGE and" \ | |
"$THRESHOLD+ $THRESHOLD_KEY. Writing to $FILENAME..." | |
sqlite3 \ | |
-header \ | |
-csv \ | |
data.db \ | |
"SELECT * FROM hn | |
WHERE CAST($THRESHOLD_KEY AS INT) >= $THRESHOLD | |
AND CAST(time AS INT) > strftime('%s', ( | |
SELECT CAST(time AS INT) AS inttime FROM hn | |
ORDER BY inttime DESC LIMIT 1 | |
), 'unixepoch', '$DATE_RANGE')" \ | |
| python3 canonicalize.py \ | |
| bin/bloom-create "generated/$FILENAME" | |
done | |
done | |
# Output a JSON file with information about the thresholds and dates | |
# | |
# TODO: Fix to use $LATEST instead of now | |
DATES=() | |
for DATE_RANGE in "${DATE_RANGES[@]}"; do | |
DATE_RANGE_STR="$(echo $DATE_RANGE | sed 's/[- ]//g')" | |
DATE="$(date -d "now ""$DATE_RANGE" +%s)" | |
DATES+=("$DATE=$DATE_RANGE_STR") | |
done | |
# 24 hours used in next_generated since this runs every 24 hours and | |
# takes 10 to 20 minutes to complete | |
jo \ | |
-p \ | |
thresholds=$(jo -a "${THRESHOLDS[@]}") \ | |
dates=$(jo "${DATES[@]}") \ | |
version="0.6" \ | |
compressed="true" \ | |
date_generated="$(date +%s)" \ | |
next_generated="$(date -d "now +24 hours +20 minutes" +%s)" \ | |
last_submitted="$(sqlite3 \ | |
-csv \ | |
data.db \ | |
"SELECT CAST(time AS INT) AS inttime | |
FROM hn ORDER BY inttime DESC LIMIT 1")" \ | |
> generated/info.json | |
- name: Create Release | |
env: | |
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
run: | | |
cd generated | |
# Keep the latest extension in the next release | |
curl \ | |
--location \ | |
--output "hackernews-button.xpi" \ | |
"https://github.com/jstrieb/hackernews-button/releases/latest/download/hackernews-button.xpi" | |
# TODO: Fix fatal name collision failure if run more than once a day? | |
gh release create \ | |
"$(date +%Y%m%d)" \ | |
--title "$(date '+%b %d, %Y') Bloom filters" \ | |
--notes "Updated Bloom filters for Hacker News submitted stories for $(date '+%B %d, %^Y')" \ | |
* |