diff --git a/README.md b/README.md
index 2ab0dd8..2d80ff4 100644
--- a/README.md
+++ b/README.md
@@ -21,9 +21,9 @@ in the results match the search terms).
 
 Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month.
 
-To run one build you need 150GB of disc space (of which 90GB is Postgresql database). The scripts process
-39 languages and output one file. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with SSD
-discs.
+To run one build you need 150GB of disc space (of which 90GB is the PostgreSQL database). The scripts process
+39 languages and output 4 files. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with NVMe
+drives.
 
 ```
 334M wikimedia_importance.tsv.gz
diff --git a/bin/filter_page.py b/bin/filter_page.py
index ff6a58e..435cc80 100755
--- a/bin/filter_page.py
+++ b/bin/filter_page.py
@@ -6,7 +6,6 @@
 #   `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
 #   `page_namespace` int(11) NOT NULL DEFAULT 0,
 #   `page_title` varbinary(255) NOT NULL DEFAULT '',
-#   `page_restrictions` tinyblob DEFAULT NULL,
 #   `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
 #   `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
 #   `page_random` double unsigned NOT NULL DEFAULT 0,
diff --git a/bin/filter_pagelinks.py b/bin/filter_pagelinks.py
index 21123f8..3188d4f 100755
--- a/bin/filter_pagelinks.py
+++ b/bin/filter_pagelinks.py
@@ -3,16 +3,28 @@
 '''
 Input from STDIN
 # CREATE TABLE `pagelinks` (
-#   `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
-#   `pl_namespace` int(11) NOT NULL DEFAULT 0,
-#   `pl_title` varbinary(255) NOT NULL DEFAULT '',
-#   `pl_from_namespace` int(11) NOT NULL DEFAULT 0,
+#   `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
+#   `pl_namespace` int(11) NOT NULL DEFAULT 0,
+#   `pl_target_id` bigint(20) unsigned NOT NULL,
 
 Output to STDOUT: pl_title, count
 '''
 
 import sys
 import csv
+import gzip
+
+if len(sys.argv) < 2:
+    print("Usage: filter_pagelinks.py linktarget.csv.gz")
+    exit(1)
+
+linktarget_filename = sys.argv[1]
+linktarget_id_to_title = dict()
+
+with gzip.open(linktarget_filename, 'rt') as gzfile:
+    reader = csv.reader(gzfile)
+    for row in reader:
+        linktarget_id_to_title[row[0]] = row[1]
 
 reader = csv.reader(sys.stdin)
 writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)
@@ -23,8 +35,8 @@
     if (row[1] != '0'):
         continue
 
-    title = row[2].replace('\r', '')
-    if len(title) == 0:
+    title = linktarget_id_to_title.get(row[2])
+    if title is None:
         continue
 
     if title not in counts:
diff --git a/bin/filter_redirect.py b/bin/filter_redirect.py
index 83ae381..bbcdb48 100755
--- a/bin/filter_redirect.py
+++ b/bin/filter_redirect.py
@@ -10,6 +10,12 @@
 #   `rd_fragment` varbinary(255) DEFAULT NULL,
 
 Output to STDOUT: rd_from_page_id, rd_title
+
+Same for linktarget table
+# CREATE TABLE `linktarget` (
+#   `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
+#   `lt_namespace` int(11) NOT NULL,
+#   `lt_title` varbinary(255) NOT NULL,
 '''
 
 import sys
diff --git a/steps/latest_available_data.sh b/steps/latest_available_data.sh
index 698e3e0..75ad1e7 100755
--- a/steps/latest_available_data.sh
+++ b/steps/latest_available_data.sh
@@ -61,7 +61,7 @@ check_all_files_ready() {
     ## usually the last to be dumped
     ##
     # from wikipedia_download.sh
-    WIKIPEDIA_REQUIRED_FILES="page pagelinks langlinks redirect"
+    WIKIPEDIA_REQUIRED_FILES="page pagelinks langlinks linktarget redirect"
     DUMP_RUN_INFO_URL="https://mirror.clarkson.edu/wikimedia/zhwiki/$CHECK_DATE/dumpruninfo.json"
     debug $DUMP_RUN_INFO_URL
     DUMP_RUN_INFO=$(curl -s --fail "$DUMP_RUN_INFO_URL")
diff --git a/steps/wikipedia_download.sh b/steps/wikipedia_download.sh
index 3b5ce7e..32f7311 100755
--- a/steps/wikipedia_download.sh
+++ b/steps/wikipedia_download.sh
@@ -42,19 +42,21 @@ do
     mkdir -p "$DOWNLOADED_PATH/$LANG"
 
     # English is the largest
-    # 1.7G downloaded/en/page.sql.gz
-    # 6.2G downloaded/en/pagelinks.sql.gz
-    # 355M downloaded/en/langlinks.sql.gz
-    # 128M downloaded/en/redirect.sql.gz
+    # 2.1G downloaded/en/page.sql.gz
+    # 6.4G downloaded/en/pagelinks.sql.gz
+    # 492M downloaded/en/langlinks.sql.gz
+    # 992M downloaded/en/linktarget.sql.gz
+    # 160M downloaded/en/redirect.sql.gz
 
     # Smaller language Turkish
-    # 53M  downloaded/tr/page.sql.gz
-    # 176M downloaded/tr/pagelinks.sql.gz
-    # 106M downloaded/tr/langlinks.sql.gz
-    # 3.2M downloaded/tr/redirect.sql.gz
+    # 90M  downloaded/tr/page.sql.gz
+    # 255M downloaded/tr/pagelinks.sql.gz
+    # 166M downloaded/tr/langlinks.sql.gz
+    # 62M  downloaded/tr/linktarget.sql.gz
+    # 4.2M downloaded/tr/redirect.sql.gz
 
-    for FN in page.sql.gz pagelinks.sql.gz langlinks.sql.gz redirect.sql.gz; do
+    for FN in page.sql.gz pagelinks.sql.gz langlinks.sql.gz linktarget.sql.gz redirect.sql.gz; do
         download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/${LANG}wiki-$WIKIPEDIA_DATE-$FN "$DOWNLOADED_PATH/$LANG/$FN"
 
         download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/md5sums-${LANG}wiki-$WIKIPEDIA_DATE-$FN.txt "$DOWNLOADED_PATH/$LANG/$FN.md5"
diff --git a/steps/wikipedia_sql2csv.sh b/steps/wikipedia_sql2csv.sh
index 1cb7e04..76a6b98 100755
--- a/steps/wikipedia_sql2csv.sh
+++ b/steps/wikipedia_sql2csv.sh
@@ -17,11 +17,12 @@ do
     mkdir -p "$CONVERTED_PATH/$LANG/"
 
     echo "[language $LANG] Page table SQL => CSV"
+    # https://www.mediawiki.org/wiki/Manual:Page_table
+    #
     # CREATE TABLE `page` (
     #   `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
     #   `page_namespace` int(11) NOT NULL DEFAULT 0,
     #   `page_title` varbinary(255) NOT NULL DEFAULT '',
-    #   `page_restrictions` tinyblob DEFAULT NULL,
     #   `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
     #   `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
     #   `page_random` double unsigned NOT NULL DEFAULT 0,
@@ -44,26 +45,50 @@ do
         pigz -9 > $CONVERTED_PATH/$LANG/pages.csv.gz
 
 
+    echo "[language $LANG] linktarget table SQL => CSV"
+    # https://www.mediawiki.org/wiki/Manual:Linktarget_table
+    #
+    # CREATE TABLE `linktarget` (
+    #   `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
+    #   `lt_namespace` int(11) NOT NULL,
+    #   `lt_title` varbinary(255) NOT NULL,
+    #
+    # Only interested in lt_namespace == 0 (articles)
+    # English wikipedia:
+    #   input 964MB compressed (100m rows)
+    #   output 322MB compressed (30m rows)
+    # Output columns: lt_id, lt_title
+
+    unpigz -c $DOWNLOADED_PATH/${LANG}/linktarget.sql.gz | \
+        bin/mysqldump_to_csv.py | \
+        bin/filter_redirect.py | \
+        pigz -9 > $CONVERTED_PATH/$LANG/linktarget.csv.gz
+
+
+    echo "[language $LANG] Pagelinks table SQL => CSV"
+    # https://www.mediawiki.org/wiki/Manual:Pagelinks_table
+    #
     # CREATE TABLE `pagelinks` (
-    #   `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
-    #   `pl_namespace` int(11) NOT NULL DEFAULT 0,
-    #   `pl_title` varbinary(255) NOT NULL DEFAULT '',
-    #   `pl_from_namespace` int(11) NOT NULL DEFAULT 0,
+    #   `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
+    #   `pl_namespace` int(11) NOT NULL DEFAULT 0,
+    #   `pl_target_id` bigint(20) unsigned NOT NULL,
     #
-    # Only interested in pl_namespace == 0 (articles)
+    # Only interested in target_ids that point to namespace == 0 (articles)
     # English wikipedia:
     #   input 6.8GB compressed
     #   output 200MB compressed
-    # Output columns: pl_title, count
+    # Output columns: lt_title (from linktarget file), count (unique pl_from)
 
     unpigz -c $DOWNLOADED_PATH/$LANG/pagelinks.sql.gz | \
         bin/mysqldump_to_csv.py | \
-        bin/filter_pagelinks.py | \
+        bin/filter_pagelinks.py $CONVERTED_PATH/$LANG/linktarget.csv.gz | \
         pigz -9 > $CONVERTED_PATH/$LANG/pagelinks.csv.gz
 
 
     echo "[language $LANG] langlinks table SQL => CSV"
+    # https://www.mediawiki.org/wiki/Manual:Langlinks_table
+    #
     # CREATE TABLE `langlinks` (
     #   `ll_from` int(8) unsigned NOT NULL DEFAULT 0,
     #   `ll_lang` varbinary(35) NOT NULL DEFAULT '',
@@ -81,7 +106,11 @@ do
         pigz -9 > $CONVERTED_PATH/$LANG/langlinks.csv.gz
 
+
+    echo "[language $LANG] redirect table SQL => CSV"
+    # https://www.mediawiki.org/wiki/Manual:Redirect_table
+    #
     # CREATE TABLE `redirect` (
     #   `rd_from` int(8) unsigned NOT NULL DEFAULT 0,
     #   `rd_namespace` int(11) NOT NULL DEFAULT 0,
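Note for reviewers: the pagelinks dump no longer carries titles directly, so `pl_target_id` has to be resolved through the new `linktarget` table. The sketch below mirrors the join that `bin/filter_pagelinks.py` now performs, but against tiny made-up rows instead of the real `linktarget.csv.gz` / `pagelinks.sql.gz` inputs (row layouts follow the CREATE TABLE comments above; all values are hypothetical):

```python
import csv
import io

# Hypothetical linktarget rows (lt_id, lt_title), i.e. the shape that
# bin/filter_redirect.py writes to linktarget.csv.gz for namespace 0.
linktarget_csv = "11,Berlin\n12,Hamburg\n"

# Hypothetical pagelinks rows (pl_from, pl_namespace, pl_target_id).
pagelinks_csv = "1,0,11\n2,0,11\n3,0,12\n4,1,11\n"

# Build the lt_id -> lt_title lookup, as filter_pagelinks.py does at startup.
id_to_title = {row[0]: row[1] for row in csv.reader(io.StringIO(linktarget_csv))}

counts = {}
for row in csv.reader(io.StringIO(pagelinks_csv)):
    if row[1] != '0':                 # same namespace filter as the script
        continue
    title = id_to_title.get(row[2])   # resolve pl_target_id -> lt_title
    if title is None:                 # target is not a known article
        continue
    counts[title] = counts.get(title, 0) + 1

print(counts)   # {'Berlin': 2, 'Hamburg': 1}
```

The trade-off is memory: for English the lookup holds roughly 30 million id/title pairs (per the size comments in wikipedia_sql2csv.sh), so filter_pagelinks.py keeps noticeably more state in RAM than the old title-based filter, which is worth keeping in mind against the 4GB RAM figure in the README.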