-
Notifications
You must be signed in to change notification settings - Fork 12
/
race.py
136 lines (104 loc) · 4.79 KB
/
race.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#! /usr/bin/env python3
import datetime
import logging
import matplotlib
import numpy as np
import pandas as pd
import bar_chart_race as bcr
from ws.db.database import Database
logger = logging.getLogger(__name__)
def fetch_revisions(db):
    """Load all wiki revisions from the database as a chronologically sorted DataFrame.

    The result has columns "timestamp", "user" and "revisions" — the last one
    holds revision IDs renamed, because later steps count them and the "id"
    semantics would no longer apply.
    """
    rows = db.query(list="allrevisions", arvlimit="max", arvprop={"ids", "timestamp", "user"})
    # TODO: this should be reconsidered, the "MediaWiki default" user is included here and some deleted revisions were pruned from the server...
    #rows += list(db.query(list="alldeletedrevisions", adrlimit="max", adrprop={"ids", "timestamp", "user"}))
    frame = pd.DataFrame(list(rows))
    # keep only the columns needed downstream (timestamp for rolling, user for
    # grouping, revid for counting) and order the rows chronologically
    frame = frame[["timestamp", "user", "revid"]].sort_values("timestamp")
    return frame.rename(columns={"revid": "revisions"})
def get_rolling_revisions(revs, *, period_days=30):
# group by user and resaple to daily periods (used later for rolling counts and visualization)
daily_revs = revs.groupby("user").resample("1d", on="timestamp", include_groups=False).count()
# change back to previous format
daily_revs = daily_revs.reset_index()
# group by user and compute a rolling sum on the daily revision counts
rolling_revs = daily_revs.groupby("user").rolling(f"{period_days}d", on="timestamp").sum()
# change back to previous format
rolling_revs = rolling_revs.reset_index().drop("level_1", axis="columns")
return rolling_revs
def prune_rolling_data(df, *, nlargest, start_date=None):
    """Keep only the top ``nlargest`` entries per timestamp.

    Optionally also drops every row before ``start_date``.  Returns a row
    subset of ``df`` with unchanged columns.
    """
    # pick the n highest revision counts within each timestamp group
    top = df.sort_values("timestamp").groupby("timestamp").revisions.nlargest(nlargest)
    # the second index level carries the surviving rows' original index;
    # upstream frames use a default RangeIndex, so it doubles as a position
    keep = top.reset_index()["level_1"]
    result = df.iloc[keep]
    if start_date is not None:
        result = result[result["timestamp"] >= start_date]
    return result
def fill_timestamps(df, *, period_days=30):
# fill all missing timestamps for all users
# https://stackoverflow.com/a/44979696/4180822
# NOTE: it is important to do this only after pruning the dataframe,
# otherwise this takes too much memory with ArchWiki data
df = df.set_index(
["timestamp", "user"]
).unstack(
fill_value=np.nan
).asfreq(
freq="1D", fill_value=np.nan
).stack(future_stack=True).sort_index(level=1).reset_index()
# forward fill monthly revision counts after the user's last active day
df["revisions"] = df.groupby("user")["revisions"].ffill(limit=period_days).fillna(0).reset_index(drop=True).astype(int)
return df
def race(db, output_filename):
    """Render the Arch Wiki bar chart race animation into ``output_filename``.

    Fetches all revisions from ``db``, computes 30-day rolling per-user edit
    counts, keeps the top racers and hands the pivoted data to
    ``bar_chart_race`` for rendering.
    """
    logger.info("Fetching data from the SQL database")
    revisions = fetch_revisions(db)

    logger.info("Computing Arch Wiki Race data")
    rolling = get_rolling_revisions(revisions)

    # at most this many users appear in the visualization at once;
    # everything below that cut is pruned away
    max_bars = 10
    # start only after 30 days or more
    first_full_window = rolling["timestamp"].min() + datetime.timedelta(days=30)
    pruned = prune_rolling_data(rolling, nlargest=max_bars, start_date=first_full_window)
    # make every remaining user's series cover every day
    pruned = fill_timestamps(pruned)

    # bcr wants one column per racer, indexed by time
    racers = pruned.pivot(index="timestamp", columns="user", values="revisions")

    # prepare total edit counts and callback function for bcr
    # (We take the maximum revid instead of counting because some revisions
    # are lost forever.)
    total_edits = revisions.resample("1d", on="timestamp").max()["revisions"].ffill().astype(int)

    def summary(values, ranks):
        # the dict is passed into matplotlib.pyplot.text
        return {
            "x": .95,
            "y": .05,
            "s": f"Total edits: {total_edits[values.name]}",
            "horizontalalignment": "right",
            "fontsize": "small",
        }

    # render the animation
    logger.info(f"The race is now {racers.shape[0]} days long and there are {racers.shape[1]} racers remaining")
    logger.info("Visualizing the Arch Wiki Race 🏁")
    bcr.bar_chart_race(
        racers,
        output_filename,
        title="Arch Wiki edits in the past 30 days",
        period_summary_func=summary,
        n_bars=max_bars,
        figsize=(5, 3),
        dpi=192,
        steps_per_period=5,  # frames per period
        period_length=100,   # ms per period
    )
if __name__ == "__main__":
    import ws.config

    # build the command line parser, let the database register its own
    # options, then connect using the parsed arguments
    argparser = ws.config.getArgParser()
    Database.set_argparser(argparser)
    args = ws.config.parse_args(argparser)
    db = Database.from_argparser(args)

    # render to webm with the VP9 codec
    # (the original comment said AV1, but "libvpx-vp9" is VP9)
    output_filename = "arch-wiki-race.webm"
    matplotlib.rcParams["animation.codec"] = "libvpx-vp9"
    race(db, output_filename)