new filter_ scripts to speed up sql2csv
mtmail committed Jul 17, 2023
1 parent 7c63279 commit 0710044
Showing 6 changed files with 143 additions and 19 deletions.
6 changes: 2 additions & 4 deletions README.md
@@ -22,7 +22,7 @@ in the results match the search terms).
Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month.

To run one build you need 420GB of disc space (of which 360GB is the PostgreSQL database). The scripts process
-39 languages and output 4 files. Runtime is approximately 24 hours on a 4 core, 4GB RAM machine with SSD
+39 languages and output 4 files. Runtime is approximately 14 hours on a 4 core, 4GB RAM machine with SSD
discs.

```
@@ -250,10 +250,8 @@ uncommon for an export starting Jan/1st to only be fully ready Jan/20th.
Runs 300 SPARQL queries against wikidata servers. Output is 5GB.
-5. wikipedia_sql2csv (15h)
+5. wikipedia_sql2csv (5h)
By far the longest step, 70% of the build is spent here.
The MySQL SQL files get parsed sequentially and we try to exclude as much data (rows,
columns) as possible. Output is 75% smaller than input. Any work done here cuts
down the time (and space) needed in the database (database used to be 1TB before
20 changes: 20 additions & 0 deletions bin/filter_langlinks.py
@@ -0,0 +1,20 @@
#!/usr/bin/env python3

'''
Input from STDIN
# CREATE TABLE `langlinks` (
# `ll_from` int(8) unsigned NOT NULL DEFAULT 0,
# `ll_lang` varbinary(35) NOT NULL DEFAULT '',
# `ll_title` varbinary(255) NOT NULL DEFAULT '',
Output to STDOUT: ll_title, ll_from_page_id, ll_lang
'''

import sys

for line in sys.stdin:
    line = line.rstrip().replace('\r', '')

    columns = line.split(',', 2)

    print(','.join([columns[2], columns[0], columns[1]]))
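As a quick illustration of the column reorder (the row below is made up, not taken from a real dump): the line is split at most twice so the title, the last column, stays in one piece, and the columns are then written back in the new order.

```
# Hypothetical langlinks row: ll_from, ll_lang, ll_title (values are made up)
line = "12345,de,Berlin"

columns = line.split(',', 2)                      # ['12345', 'de', 'Berlin']
print(','.join([columns[2], columns[0], columns[1]]))
# prints: Berlin,12345,de  (ll_title, ll_from_page_id, ll_lang)
```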
52 changes: 52 additions & 0 deletions bin/filter_page.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python3

'''
Input from STDIN
# CREATE TABLE `page` (
# `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
# `page_namespace` int(11) NOT NULL DEFAULT 0,
# `page_title` varbinary(255) NOT NULL DEFAULT '',
# `page_restrictions` tinyblob DEFAULT NULL,
# `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_random` double unsigned NOT NULL DEFAULT 0,
# `page_touched` varbinary(14) NOT NULL DEFAULT '',
# `page_links_updated` varbinary(14) DEFAULT NULL,
# `page_latest` int(8) unsigned NOT NULL DEFAULT 0,
# `page_len` int(8) unsigned NOT NULL DEFAULT 0,
# `page_content_model` varbinary(32) DEFAULT NULL,
# `page_lang` varbinary(35) DEFAULT NULL,
Output to STDOUT: page_id, page_title
'''

import sys
import csv

reader = csv.DictReader(sys.stdin, fieldnames=[
    'page_id',
    'page_namespace',
    'page_title',
    'page_restrictions',
    'page_is_redirect',
    'page_is_new',
    'page_random',
    'page_touched',
    'page_links_updated',
    'page_latest',
    'page_len',
    'page_content_model',
    'page_lang'
])
writer = csv.DictWriter(sys.stdout, fieldnames=['id', 'title'], dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # namespace 0 are articles
    if (row['page_namespace'] != '0'):
        continue

    title = row['page_title'].replace('\r', '')
    if len(title) == 0:
        continue

    writer.writerow({'id': row['page_id'], 'title': title})
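A minimal, self-contained sketch of the same namespace filter (the two rows are made up, not real dump data): only rows with page_namespace 0 and a non-empty title survive.

```
import csv
import io

# Two made-up page rows: an article (namespace 0) and a talk page (namespace 1)
sample = ("7,0,Berlin,,0,0,0.5,20230101000000,,900,100,wikitext,\n"
          "8,1,Some_discussion,,0,0,0.5,20230101000000,,901,10,wikitext,\n")

fields = ['page_id', 'page_namespace', 'page_title', 'page_restrictions',
          'page_is_redirect', 'page_is_new', 'page_random', 'page_touched',
          'page_links_updated', 'page_latest', 'page_len', 'page_content_model',
          'page_lang']

for row in csv.DictReader(io.StringIO(sample), fieldnames=fields):
    if row['page_namespace'] == '0' and len(row['page_title']) > 0:
        print(row['page_id'], row['page_title'])   # prints: 7 Berlin
```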
29 changes: 29 additions & 0 deletions bin/filter_pagelinks.py
@@ -0,0 +1,29 @@
#!/usr/bin/env python3

'''
Input from STDIN
# CREATE TABLE `pagelinks` (
# `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
# `pl_namespace` int(11) NOT NULL DEFAULT 0,
# `pl_title` varbinary(255) NOT NULL DEFAULT '',
# `pl_from_namespace` int(11) NOT NULL DEFAULT 0,
Output to STDOUT: only pl_title
'''

import sys
import csv

reader = csv.DictReader(sys.stdin, fieldnames=['pl_from', 'pl_namespace', 'pl_title', 'pl_from_namespace'])
writer = csv.DictWriter(sys.stdout, fieldnames=['title'], dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # namespace 0 are articles
    if (row['pl_namespace'] != '0'):
        continue

    title = row['pl_title'].replace('\r', '')
    if len(title) == 0:
        continue

    writer.writerow({'title': title})
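All of these scripts write with dialect='unix' but override its default QUOTE_ALL with QUOTE_MINIMAL, so only titles that contain a comma, quote or newline get quoted. A short sketch with made-up titles:

```
import csv
import sys

# Made-up titles, only to show the quoting behaviour
writer = csv.DictWriter(sys.stdout, fieldnames=['title'],
                        dialect='unix', quoting=csv.QUOTE_MINIMAL)
writer.writerow({'title': 'Berlin'})            # written as: Berlin
writer.writerow({'title': 'Washington, D.C.'})  # written as: "Washington, D.C."
```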
36 changes: 36 additions & 0 deletions bin/filter_redirect.py
@@ -0,0 +1,36 @@
#!/usr/bin/env python3

'''
Input from STDIN
# CREATE TABLE `redirect` (
# `rd_from` int(8) unsigned NOT NULL DEFAULT 0,
# `rd_namespace` int(11) NOT NULL DEFAULT 0,
# `rd_title` varbinary(255) NOT NULL DEFAULT '',
# `rd_interwiki` varbinary(32) DEFAULT NULL,
# `rd_fragment` varbinary(255) DEFAULT NULL,
Output to STDOUT: rd_from_page_id, rd_title
'''

import sys
import csv

reader = csv.DictReader(sys.stdin, fieldnames=[
    'rd_from',
    'rd_namespace',
    'rd_title',
    'rd_interwiki',
    'rd_fragment'
])
writer = csv.DictWriter(sys.stdout, fieldnames=['id', 'title'], dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # namespace 0 are articles
    if (row['rd_namespace'] != '0'):
        continue

    title = row['rd_title'].replace('\r', '')
    if len(title) == 0:
        continue

    writer.writerow({'id': row['rd_from'], 'title': title})
19 changes: 4 additions & 15 deletions steps/wikipedia_sql2csv.sh
@@ -41,10 +41,7 @@ do
unpigz -c $DOWNLOADED_PATH/${LANG}/page.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-csvcut -c 1,3,2 | \
-grep -e ',0$' | \
-sed 's/,0$//' | \
+bin/filter_page.py | \
pigz -9 > $CONVERTED_PATH/$LANG/pages.csv.gz


@@ -64,11 +61,7 @@ do
unpigz -c $DOWNLOADED_PATH/${LANG}/pagelinks.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-csvcut -c 3,2 | \
-grep -e ',0$' | \
-sed 's/,0$//' | \
-grep -v '^$' | \
+bin/filter_pagelinks.py | \
pigz -9 > $CONVERTED_PATH/$LANG/pagelinks.csv.gz


@@ -87,8 +80,7 @@ do
unpigz -c $DOWNLOADED_PATH/${LANG}/langlinks.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-csvcut -c 3,1,2 | \
+bin/filter_langlinks.py | \
pigz -9 > $CONVERTED_PATH/$LANG/langlinks.csv.gz


@@ -109,10 +101,7 @@ do
unpigz -c $DOWNLOADED_PATH/${LANG}/redirect.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-csvcut -c 1,3,2 | \
-grep -e ',0$' | \
-sed 's/,0$//' | \
+bin/filter_redirect.py | \
pigz -9 > $CONVERTED_PATH/$LANG/redirect.csv.gz

du -h $CONVERTED_PATH/$LANG/*
