Skip to content

Generate Bloom Filters #1102

Generate Bloom Filters

Generate Bloom Filters #1102

# Generate Bloom filters with submitted Hacker News stories periodically and
# on-command for use in the accompanying browser extension.
#
# Created by Jacob Strieb
# January 2021
name: Generate Bloom Filters
# Run once every 24 hours because the BigQuery database is updated once every
# 24 hours. Also run when manually triggered.
on:
schedule:
# This time is based loosely around when the BigQuery data updates
- cron: "5 16 * * *"
workflow_dispatch:
jobs:
create-bloomfilter:
name: Create Bloom Filters
runs-on: ubuntu-latest
steps:
- name: Install dependencies
run: |
sudo apt-get install jo
- uses: actions/checkout@v2
- name: Set up BigQuery
run: |
# This BQ_JSON key is obtained by downloading the JSON key for a
# service account on BigQuery. It is then copied directly into a
# GitHub Actions secret (under Settings)
echo '${{ secrets.BQ_JSON }}' > bq.json
gcloud auth \
activate-service-account \
--key-file bq.json
# Unfortunately, bq outputs some dumb shit the first time we run it
# with the new authentication, so we do a dummy call here to prevent
# it from outputting the annoying welcome message directly into the
# CSV we are attempting to process
bq query \
--project_id=hacker-news-analysis-300719 \
--use_legacy_sql=false \
'SELECT COUNT(*) FROM `bigquery-public-data.hacker_news.full`
WHERE type = "story"'
- name: Compile Bloom filter creation program
run: |
make create
- name: Pull BigQuery data
run: |
# Note that max_rows must be an unsigned, 32-bit int
bq query \
--project_id=hacker-news-analysis-300719 \
--format csv \
--max_rows 99999999 \
--use_legacy_sql=false \
'SELECT
url,
score,
descendants,
time
FROM
`bigquery-public-data.hacker_news.full`
WHERE
type = "story"
AND dead IS NOT TRUE
AND deleted IS NOT TRUE
AND url != ""' \
> data.csv
head -n 50 data.csv
# Put the data in a sqlite3 database
sqlite3 \
data.db \
".mode csv" \
".import data.csv hn"
- name: Generate Bloom Filters
run: |
# Get the time of the last submission
LATEST=$(sqlite3 \
-csv \
data.db \
"SELECT CAST(time AS INT) AS inttime
FROM hn ORDER BY inttime DESC LIMIT 1")
# TODO: Make this accurate -- not sure why but it's inaccurate now
printf \
"Last story submitted %d hours ago\n" \
$((($(date +%s) - LATEST) / 3600))
# Generate one Bloom filter for each date range/threshold combination
# Used as sqlite3 date modifiers:
# https://sqlite.org/lang_datefunc.html
DATE_RANGES=(
# "-1 month"
"-7 days"
"-24 hours"
)
THRESHOLDS=(
0
10
75
250
500
)
# Apply the threshold to points (score) or comments (descendants)
THRESHOLD_KEY="score"
mkdir generated
for THRESHOLD in "${THRESHOLDS[@]}"; do
FILENAME="hn-$THRESHOLD.bloom"
# Make a Bloom filter for each threshold with no time restriction
echo "Creating bloom filter for stories from all time and" \
"$THRESHOLD+ $THRESHOLD_KEY. Writing to $FILENAME..."
sqlite3 \
-header \
-csv \
data.db \
"SELECT * FROM hn
WHERE CAST($THRESHOLD_KEY AS INT) >= $THRESHOLD" \
| python3 canonicalize.py \
| bin/bloom-create "generated/$FILENAME"
# For the current threshold, make a bloom filter for each date range
for DATE_RANGE in "${DATE_RANGES[@]}"; do
DATE_RANGE_STR="$(echo $DATE_RANGE | sed 's/[- ]//g')"
FILENAME="hn-$DATE_RANGE_STR-$THRESHOLD.bloom"
echo "Creating bloom filter for $DATE_RANGE and" \
"$THRESHOLD+ $THRESHOLD_KEY. Writing to $FILENAME..."
sqlite3 \
-header \
-csv \
data.db \
"SELECT * FROM hn
WHERE CAST($THRESHOLD_KEY AS INT) >= $THRESHOLD
AND CAST(time AS INT) > strftime('%s', (
SELECT CAST(time AS INT) AS inttime FROM hn
ORDER BY inttime DESC LIMIT 1
), 'unixepoch', '$DATE_RANGE')" \
| python3 canonicalize.py \
| bin/bloom-create "generated/$FILENAME"
done
done
# Output a JSON file with information about the thresholds and dates
#
# TODO: Fix to use $LATEST instead of now
DATES=()
for DATE_RANGE in "${DATE_RANGES[@]}"; do
DATE_RANGE_STR="$(echo $DATE_RANGE | sed 's/[- ]//g')"
DATE="$(date -d "now ""$DATE_RANGE" +%s)"
DATES+=("$DATE=$DATE_RANGE_STR")
done
# 24 hours used in next_generated since this runs every 24 hours and
# takes 10 to 20 minutes to complete
jo \
-p \
thresholds=$(jo -a "${THRESHOLDS[@]}") \
dates=$(jo "${DATES[@]}") \
version="0.6" \
compressed="true" \
date_generated="$(date +%s)" \
next_generated="$(date -d "now +24 hours +20 minutes" +%s)" \
last_submitted="$(sqlite3 \
-csv \
data.db \
"SELECT CAST(time AS INT) AS inttime
FROM hn ORDER BY inttime DESC LIMIT 1")" \
> generated/info.json
- name: Create Release
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
cd generated
# Keep the latest extension in the next release
curl \
--location \
--output "hackernews-button.xpi" \
"https://github.com/jstrieb/hackernews-button/releases/latest/download/hackernews-button.xpi"
# TODO: Fix fatal name collision failure if run more than once a day?
gh release create \
"$(date +%Y%m%d)" \
--title "$(date '+%b %d, %Y') Bloom filters" \
--notes "Updated Bloom filters for Hacker News submitted stories for $(date '+%B %d, %^Y')" \
*