Also download and process new linktarget table
mtmail committed Jul 31, 2024
1 parent d758ddf commit a1df41c
Showing 7 changed files with 76 additions and 28 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -21,9 +21,9 @@ in the results match the search terms).

Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month.

-To run one build you need 150GB of disc space (of which 90GB is Postgresql database). The scripts process
-39 languages and output one file. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with SSD
-discs.
+To run one build you need 150GB of disc space (of which 90GB Postgresql database). The scripts process
+39 languages and output 4 files. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with NVMe
+drives.

```
334M wikimedia_importance.tsv.gz
1 change: 0 additions & 1 deletion bin/filter_page.py
@@ -6,7 +6,6 @@
# `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
# `page_namespace` int(11) NOT NULL DEFAULT 0,
# `page_title` varbinary(255) NOT NULL DEFAULT '',
-# `page_restrictions` tinyblob DEFAULT NULL,
# `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_random` double unsigned NOT NULL DEFAULT 0,
24 changes: 18 additions & 6 deletions bin/filter_pagelinks.py
@@ -3,16 +3,28 @@
'''
Input from STDIN
# CREATE TABLE `pagelinks` (
-# `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
-# `pl_namespace` int(11) NOT NULL DEFAULT 0,
-# `pl_title` varbinary(255) NOT NULL DEFAULT '',
-# `pl_from_namespace` int(11) NOT NULL DEFAULT 0,
+# `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
+# `pl_namespace` int(11) NOT NULL DEFAULT 0,
+# `pl_target_id` bigint(20) unsigned NOT NULL,
Output to STDOUT: pl_title, count
'''

import sys
import csv
+import gzip

+if len(sys.argv) < 2:
+    print("Usage: filter_pagelinks.py linktarget.csv.gz")
+    exit(1)

+linktarget_filename = sys.argv[1]
+linktarget_id_to_title = dict()

+with gzip.open(linktarget_filename, 'rt') as gzfile:
+    reader = csv.reader(gzfile)
+    for row in reader:
+        linktarget_id_to_title[row[0]] = row[1]

reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)
@@ -23,8 +35,8 @@
    if (row[1] != '0'):
        continue

-    title = row[2].replace('\r', '')
-    if len(title) == 0:
+    title = linktarget_id_to_title.get(row[2])
+    if title is None:
        continue

    if title not in counts:
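To make the rewrite concrete, here is a toy run of the new two-step lookup with invented sample rows (values are illustrative only, not taken from any dump): the linktarget mapping resolves pl_target_id to a title, and links whose target is missing from the map (i.e. not a namespace-0 article) are dropped.

```
# Toy illustration of the new two-step lookup (invented sample data).

# Stage 1: mapping loaded from linktarget.csv.gz (lt_id -> lt_title).
linktarget_id_to_title = {
    '17': 'Berlin',
    '42': 'Hamburg',
}

# Stage 2: raw pagelinks rows: pl_from, pl_namespace, pl_target_id.
pagelinks_rows = [
    ('100', '0', '17'),  # article 100 links to Berlin
    ('101', '0', '17'),  # article 101 links to Berlin
    ('102', '0', '99'),  # target 99 not in the map -> dropped
    ('103', '1', '42'),  # not namespace 0 -> dropped
]

counts = {}
for pl_from, pl_namespace, pl_target_id in pagelinks_rows:
    if pl_namespace != '0':
        continue
    title = linktarget_id_to_title.get(pl_target_id)
    if title is None:
        continue
    counts[title] = counts.get(title, 0) + 1

print(counts)  # {'Berlin': 2} -> output row "Berlin,2"
```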
6 changes: 6 additions & 0 deletions bin/filter_redirect.py
@@ -10,6 +10,12 @@
# `rd_fragment` varbinary(255) DEFAULT NULL,
Output to STDOUT: rd_from_page_id, rd_title
+Same for linktarget table
+# CREATE TABLE `linktarget` (
+# `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
+# `lt_namespace` int(11) NOT NULL,
+# `lt_title` varbinary(255) NOT NULL,
'''

import sys
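The linktarget dump can be run through filter_redirect.py unchanged because, after mysqldump_to_csv.py, both tables present the same column shape: an id in column 0, a namespace in column 1, and a title in column 2. A minimal sketch of such a shared filter (a simplification for illustration; the real filter_redirect.py may differ in details):

```
import csv
import sys

# The same positional filter covers both inputs:
#   redirect:   rd_from, rd_namespace, rd_title, ...
#   linktarget: lt_id,   lt_namespace, lt_title
reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    if row[1] != '0':  # keep namespace 0 (articles) only
        continue
    writer.writerow([row[0], row[2]])
```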
2 changes: 1 addition & 1 deletion steps/latest_available_data.sh
@@ -61,7 +61,7 @@ check_all_files_ready() {
## usually the last to be dumped
##
# from wikipedia_download.sh
-WIKIPEDIA_REQUIRED_FILES="page pagelinks langlinks redirect"
+WIKIPEDIA_REQUIRED_FILES="page pagelinks langlinks linktarget redirect"
DUMP_RUN_INFO_URL="https://mirror.clarkson.edu/wikimedia/zhwiki/$CHECK_DATE/dumpruninfo.json"
debug $DUMP_RUN_INFO_URL
DUMP_RUN_INFO=$(curl -s --fail "$DUMP_RUN_INFO_URL")
20 changes: 11 additions & 9 deletions steps/wikipedia_download.sh
@@ -42,19 +42,21 @@ do
mkdir -p "$DOWNLOADED_PATH/$LANG"

# English is the largest
-# 1.7G downloaded/en/page.sql.gz
-# 6.2G downloaded/en/pagelinks.sql.gz
-# 355M downloaded/en/langlinks.sql.gz
-# 128M downloaded/en/redirect.sql.gz
+# 2.1G downloaded/en/page.sql.gz
+# 6.4G downloaded/en/pagelinks.sql.gz
+# 492M downloaded/en/langlinks.sql.gz
+# 992M downloaded/en/linktarget.sql.gz
+# 160M downloaded/en/redirect.sql.gz

# Smaller language Turkish
-# 53M downloaded/tr/page.sql.gz
-# 176M downloaded/tr/pagelinks.sql.gz
-# 106M downloaded/tr/langlinks.sql.gz
-# 3.2M downloaded/tr/redirect.sql.gz
+# 90M downloaded/tr/page.sql.gz
+# 255M downloaded/tr/pagelinks.sql.gz
+# 166M downloaded/tr/langlinks.sql.gz
+# 62M downloaded/tr/linktarget.sql.gz
+# 4.2M downloaded/tr/redirect.sql.gz


-for FN in page.sql.gz pagelinks.sql.gz langlinks.sql.gz redirect.sql.gz; do
+for FN in page.sql.gz pagelinks.sql.gz langlinks.sql.gz linktarget.sql.gz redirect.sql.gz; do

download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/${LANG}wiki-$WIKIPEDIA_DATE-$FN "$DOWNLOADED_PATH/$LANG/$FN"
download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/md5sums-${LANG}wiki-$WIKIPEDIA_DATE-$FN.txt "$DOWNLOADED_PATH/$LANG/$FN.md5"
45 changes: 37 additions & 8 deletions steps/wikipedia_sql2csv.sh
@@ -17,11 +17,12 @@ do
mkdir -p "$CONVERTED_PATH/$LANG/"

echo "[language $LANG] Page table SQL => CSV"
+# https://www.mediawiki.org/wiki/Manual:Page_table
+#
# CREATE TABLE `page` (
# `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
# `page_namespace` int(11) NOT NULL DEFAULT 0,
# `page_title` varbinary(255) NOT NULL DEFAULT '',
-# `page_restrictions` tinyblob DEFAULT NULL,
# `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_random` double unsigned NOT NULL DEFAULT 0,
@@ -44,26 +45,50 @@
pigz -9 > $CONVERTED_PATH/$LANG/pages.csv.gz


echo "[language $LANG] linktarget table SQL => CSV"
# https://www.mediawiki.org/wiki/Manual:Linktarget_table
#
# CREATE TABLE `linktarget` (
# `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
# `lt_namespace` int(11) NOT NULL,
# `lt_title` varbinary(255) NOT NULL,
#
# Only interested in lt_namespace == 0 (articles)
# English wikipedia:
# input 964MB compressed (100m rows)
# output 322MB compressed (30m rows)
# Output columns: lt_id, lt_title

unpigz -c $DOWNLOADED_PATH/${LANG}/linktarget.sql.gz | \
bin/mysqldump_to_csv.py | \
bin/filter_redirect.py | \
pigz -9 > $CONVERTED_PATH/$LANG/linktarget.csv.gz



echo "[language $LANG] Pagelinks table SQL => CSV"
+# https://www.mediawiki.org/wiki/Manual:Pagelinks_table
+#
# CREATE TABLE `pagelinks` (
-# `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
-# `pl_namespace` int(11) NOT NULL DEFAULT 0,
-# `pl_title` varbinary(255) NOT NULL DEFAULT '',
-# `pl_from_namespace` int(11) NOT NULL DEFAULT 0,
+# `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
+# `pl_namespace` int(11) NOT NULL DEFAULT 0,
+# `pl_target_id` bigint(20) unsigned NOT NULL,
#
-# Only interested in pl_namespace == 0 (articles)
+# Only interested in target_ids that point to == 0 (articles)
# English wikipedia:
# input 6.8GB compressed
# output 200MB compressed
-# Output columns: pl_title, count
+# Output columns: lt_title (from linktarget file), count (unique pl_from)

unpigz -c $DOWNLOADED_PATH/$LANG/pagelinks.sql.gz | \
bin/mysqldump_to_csv.py | \
-bin/filter_pagelinks.py | \
+bin/filter_pagelinks.py $CONVERTED_PATH/$LANG/linktarget.csv.gz | \
pigz -9 > $CONVERTED_PATH/$LANG/pagelinks.csv.gz


echo "[language $LANG] langlinks table SQL => CSV"
+# https://www.mediawiki.org/wiki/Manual:Langlinks_table
+#
# CREATE TABLE `langlinks` (
# `ll_from` int(8) unsigned NOT NULL DEFAULT 0,
# `ll_lang` varbinary(35) NOT NULL DEFAULT '',
Expand All @@ -81,7 +106,11 @@ do
pigz -9 > $CONVERTED_PATH/$LANG/langlinks.csv.gz




echo "[language $LANG] redirect table SQL => CSV"
+# https://www.mediawiki.org/wiki/Manual:Redirect_table
+#
# CREATE TABLE `redirect` (
# `rd_from` int(8) unsigned NOT NULL DEFAULT 0,
# `rd_namespace` int(11) NOT NULL DEFAULT 0,
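Note the ordering this introduces: within each language, the linktarget conversion must complete before the pagelinks step, because filter_pagelinks.py receives $CONVERTED_PATH/$LANG/linktarget.csv.gz as its first argument and loads it fully before reading pagelinks rows from stdin. A rough sketch of that dependency (the language code and paths are illustrative, assuming the same pipeline commands as above):

```
import subprocess

lang = 'tr'  # illustrative language code
downloaded = f'downloaded/{lang}'
converted = f'converted/{lang}'

# Stage 1: linktarget first -- its CSV is an input to stage 2.
subprocess.run(
    f'unpigz -c {downloaded}/linktarget.sql.gz'
    ' | bin/mysqldump_to_csv.py'
    ' | bin/filter_redirect.py'
    f' | pigz -9 > {converted}/linktarget.csv.gz',
    shell=True, check=True)

# Stage 2: pagelinks, resolving pl_target_id via the stage 1 CSV.
subprocess.run(
    f'unpigz -c {downloaded}/pagelinks.sql.gz'
    ' | bin/mysqldump_to_csv.py'
    f' | bin/filter_pagelinks.py {converted}/linktarget.csv.gz'
    f' | pigz -9 > {converted}/pagelinks.csv.gz',
    shell=True, check=True)
```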
