new filter_ scripts to speed up sql2csv
mtmail committed Jul 17, 2023
1 parent 7c63279 commit 0710044
Showing 6 changed files with 143 additions and 19 deletions.
6 changes: 2 additions & 4 deletions README.md
@@ -22,7 +22,7 @@ in the results match the search terms).
Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month.

To run one build you need 420GB of disc space (of which 360GB is the PostgreSQL database). The scripts process
-39 languages and output 4 files. Runtime is approximately 24 hours on a 4 core, 4GB RAM machine with SSD
+39 languages and output 4 files. Runtime is approximately 14 hours on a 4 core, 4GB RAM machine with SSD
discs.

```
@@ -250,10 +250,8 @@ uncommon for an export starting Jan/1st to only be fully ready Jan/20th.
Runs 300 SPARQL queries against wikidata servers. Output is 5GB.
-5. wikipedia_sql2csv (15h)
+5. wikipedia_sql2csv (5h)
By far the longest step, 70% of the build is spent here.
The MySQL SQL files get parsed sequentially and we try to exclude as much data (rows,
columns) as possible. Output is 75% smaller than input. Any work done here cuts
down the time (and space) needed in the database (database used to be 1TB before
20 changes: 20 additions & 0 deletions bin/filter_langlinks.py
@@ -0,0 +1,20 @@
#!/usr/bin/env python3

'''
Input from STDIN
# CREATE TABLE `langlinks` (
# `ll_from` int(8) unsigned NOT NULL DEFAULT 0,
# `ll_lang` varbinary(35) NOT NULL DEFAULT '',
# `ll_title` varbinary(255) NOT NULL DEFAULT '',
Output to STDOUT: ll_title, ll_from_page_id, ll_lang
'''

import sys

for line in sys.stdin:
    line = line.rstrip().replace('\r', '')

    columns = line.split(',', 2)

    print(','.join([columns[2], columns[0], columns[1]]))
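As a quick illustration of the column reorder (the row below is made up, not taken from a real dump): the line is split at most twice so the title, the last column, stays in one piece, and the columns are then written back in the new order.

```
# Hypothetical langlinks row: ll_from, ll_lang, ll_title (values are made up)
line = "12345,de,Berlin"

columns = line.split(',', 2)                      # ['12345', 'de', 'Berlin']
print(','.join([columns[2], columns[0], columns[1]]))
# prints: Berlin,12345,de  (ll_title, ll_from_page_id, ll_lang)
```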
52 changes: 52 additions & 0 deletions bin/filter_page.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python3

'''
Input from STDIN
# CREATE TABLE `page` (
# `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT,
# `page_namespace` int(11) NOT NULL DEFAULT 0,
# `page_title` varbinary(255) NOT NULL DEFAULT '',
# `page_restrictions` tinyblob DEFAULT NULL,
# `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0,
# `page_random` double unsigned NOT NULL DEFAULT 0,
# `page_touched` varbinary(14) NOT NULL DEFAULT '',
# `page_links_updated` varbinary(14) DEFAULT NULL,
# `page_latest` int(8) unsigned NOT NULL DEFAULT 0,
# `page_len` int(8) unsigned NOT NULL DEFAULT 0,
# `page_content_model` varbinary(32) DEFAULT NULL,
# `page_lang` varbinary(35) DEFAULT NULL,
Output to STDOUT: page_id, page_title
'''

import sys
import csv

reader = csv.DictReader(sys.stdin, fieldnames=[
    'page_id',
    'page_namespace',
    'page_title',
    'page_restrictions',
    'page_is_redirect',
    'page_is_new',
    'page_random',
    'page_touched',
    'page_links_updated',
    'page_latest',
    'page_len',
    'page_content_model',
    'page_lang'
])
writer = csv.DictWriter(sys.stdout, fieldnames=['id', 'title'], dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # namespace 0 are articles
    if (row['page_namespace'] != '0'):
        continue

    title = row['page_title'].replace('\r', '')
    if len(title) == 0:
        continue

    writer.writerow({'id': row['page_id'], 'title': title})
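A minimal, self-contained sketch of the same namespace filter (the two rows are made up, not real dump data): only rows with page_namespace 0 and a non-empty title survive.

```
import csv
import io

# Two made-up page rows: an article (namespace 0) and a talk page (namespace 1)
sample = ("7,0,Berlin,,0,0,0.5,20230101000000,,900,100,wikitext,\n"
          "8,1,Some_discussion,,0,0,0.5,20230101000000,,901,10,wikitext,\n")

fields = ['page_id', 'page_namespace', 'page_title', 'page_restrictions',
          'page_is_redirect', 'page_is_new', 'page_random', 'page_touched',
          'page_links_updated', 'page_latest', 'page_len', 'page_content_model',
          'page_lang']

for row in csv.DictReader(io.StringIO(sample), fieldnames=fields):
    if row['page_namespace'] == '0' and len(row['page_title']) > 0:
        print(row['page_id'], row['page_title'])   # prints: 7 Berlin
```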
29 changes: 29 additions & 0 deletions bin/filter_pagelinks.py
@@ -0,0 +1,29 @@
#!/usr/bin/env python3

'''
Input from STDIN
# CREATE TABLE `pagelinks` (
# `pl_from` int(8) unsigned NOT NULL DEFAULT 0,
# `pl_namespace` int(11) NOT NULL DEFAULT 0,
# `pl_title` varbinary(255) NOT NULL DEFAULT '',
# `pl_from_namespace` int(11) NOT NULL DEFAULT 0,
Output to STDOUT: only pl_title
'''

import sys
import csv

reader = csv.DictReader(sys.stdin, fieldnames=['pl_from', 'pl_namespace', 'pl_title', 'pl_from_namespace'])
writer = csv.DictWriter(sys.stdout, fieldnames=['title'], dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # namespace 0 are articles
    if (row['pl_namespace'] != '0'):
        continue

    title = row['pl_title'].replace('\r', '')
    if len(title) == 0:
        continue

    writer.writerow({'title': title})
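All of these scripts write with dialect='unix' but override its default QUOTE_ALL with QUOTE_MINIMAL, so only titles that contain a comma, quote or newline get quoted. A short sketch with made-up titles:

```
import csv
import sys

# Made-up titles, only to show the quoting behaviour
writer = csv.DictWriter(sys.stdout, fieldnames=['title'],
                        dialect='unix', quoting=csv.QUOTE_MINIMAL)
writer.writerow({'title': 'Berlin'})            # written as: Berlin
writer.writerow({'title': 'Washington, D.C.'})  # written as: "Washington, D.C."
```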
36 changes: 36 additions & 0 deletions bin/filter_redirect.py
@@ -0,0 +1,36 @@
#!/usr/bin/env python3

'''
Input from STDIN
# CREATE TABLE `redirect` (
# `rd_from` int(8) unsigned NOT NULL DEFAULT 0,
# `rd_namespace` int(11) NOT NULL DEFAULT 0,
# `rd_title` varbinary(255) NOT NULL DEFAULT '',
# `rd_interwiki` varbinary(32) DEFAULT NULL,
# `rd_fragment` varbinary(255) DEFAULT NULL,
Output to STDOUT: rd_from_page_id, rd_title
'''

import sys
import csv

reader = csv.DictReader(sys.stdin, fieldnames=[
    'rd_from',
    'rd_namespace',
    'rd_title',
    'rd_interwiki',
    'rd_fragment'
])
writer = csv.DictWriter(sys.stdout, fieldnames=['id', 'title'], dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # namespace 0 are articles
    if (row['rd_namespace'] != '0'):
        continue

    title = row['rd_title'].replace('\r', '')
    if len(title) == 0:
        continue

    writer.writerow({'id': row['rd_from'], 'title': title})
19 changes: 4 additions & 15 deletions steps/wikipedia_sql2csv.sh
@@ -41,10 +41,7 @@ do
unpigz -c $DOWNLOADED_PATH/${LANG}/page.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-csvcut -c 1,3,2 | \
-grep -e ',0$' | \
-sed 's/,0$//' | \
+bin/filter_page.py | \
pigz -9 > $CONVERTED_PATH/$LANG/pages.csv.gz


@@ -64,11 +61,7 @@ do
unpigz -c $DOWNLOADED_PATH/${LANG}/pagelinks.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-csvcut -c 3,2 | \
-grep -e ',0$' | \
-sed 's/,0$//' | \
-grep -v '^$' | \
+bin/filter_pagelinks.py | \
pigz -9 > $CONVERTED_PATH/$LANG/pagelinks.csv.gz


@@ -87,8 +80,7 @@ do
unpigz -c $DOWNLOADED_PATH/${LANG}/langlinks.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-csvcut -c 3,1,2 | \
+bin/filter_langlinks.py | \
pigz -9 > $CONVERTED_PATH/$LANG/langlinks.csv.gz


@@ -109,10 +101,7 @@ do
unpigz -c $DOWNLOADED_PATH/${LANG}/redirect.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-csvcut -c 1,3,2 | \
-grep -e ',0$' | \
-sed 's/,0$//' | \
+bin/filter_redirect.py | \
pigz -9 > $CONVERTED_PATH/$LANG/redirect.csv.gz

du -h $CONVERTED_PATH/$LANG/*
