Commit 1334b54

Remove csvcut (#66)
* replace csvcut with python scripts
mtmail authored Jul 18, 2023
1 parent a581bf6 commit 1334b54
Showing 12 changed files with 200 additions and 70 deletions.
21 changes: 9 additions & 12 deletions README.md
@@ -22,7 +22,7 @@ in the results match the search terms).
Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month.

To run one build you need 420GB of disc space (of which 360GB is the Postgresql database). The scripts process
39 languages and output 4 files. Runtime is approximately 14 hours on a 4 core, 4GB RAM machine with SSD
39 languages and output 4 files. Runtime is approximately 13 hours on a 4 core, 4GB RAM machine with SSD
discs.

```
@@ -264,24 +264,21 @@ uncommon for an export starting Jan/1st to only be fully ready Jan/20th.
down the time (and space) needed in the database (database used to be 1TB before
this step).
Command-line tools are great for processing sequential data, but piping data through 4
tools could be replaced by a single custom script later.
Most time is spent on the Pagelinks table:
```
[language en] Page table (0:22h)
[language en] Pagelinks table (3:00h)
[language en] langlinks table (0:05h)
[language en] redirect table (0:02h)
[language en] Page table (0:06h)
[language en] Pagelinks table (1:10h)
[language en] langlinks table (0:01h)
[language en] redirect table (0:01h)
```
6. wikidata_sql2csv (1h)
6. wikidata_sql2csv (0:15h)
```
geo_tags (0:02h)
page (0:40h)
wb_items_per_site (0:20h)
geo_tags (0:01h)
page (0:09h)
wb_items_per_site (0:07h)
```
7. wikipedia\_import, wikidata\_import (0:40h)
64 changes: 64 additions & 0 deletions bin/filter_wikidata_geo_tags.py
@@ -0,0 +1,64 @@
#!/usr/bin/env python3

'''
Input from STDIN
# MySQL schema inside the sql.gz file:
#
# CREATE TABLE `geo_tags` (
# `gt_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
# `gt_page_id` int(10) unsigned NOT NULL,
# `gt_globe` varbinary(32) NOT NULL,
# `gt_primary` tinyint(1) NOT NULL,
# `gt_lat` decimal(11,8) DEFAULT NULL,
# `gt_lon` decimal(11,8) DEFAULT NULL,
# `gt_dim` int(11) DEFAULT NULL,
# `gt_type` varbinary(32) DEFAULT NULL,
# `gt_name` varbinary(255) DEFAULT NULL,
# `gt_country` binary(2) DEFAULT NULL,
# `gt_region` varbinary(3) DEFAULT NULL,
Output to STDOUT: gt_page_id, gt_lat, gt_lon
'''

import sys
import csv

reader = csv.DictReader(sys.stdin, fieldnames=[
    'gt_id',
    'gt_page_id',
    'gt_globe',
    'gt_primary',
    'gt_lat',
    'gt_lon',
    'gt_dim',
    'gt_type',
    'gt_name',
    'gt_country',
    'gt_region'
])

for row in reader:
    # There are places e.g. on the moon with coordinates
    if (row['gt_globe'] != 'earth'):
        continue

    if (row['gt_primary'] != '1'):
        continue

    lat = float(row['gt_lat'])
    lon = float(row['gt_lon'])

    if (lat == 0 and lon == 0):
        # print('skipping 0,0', file=sys.stderr)
        continue

    if (lat < -90 or lat > 90 or lon < -180 or lon > 180):
        # print('skipping out of bounds', file=sys.stderr)
        # print(lat, file=sys.stderr)
        # print(lon, file=sys.stderr)
        continue

    lat = round(lat, 5)
    lon = round(lon, 5)

    print(row['gt_page_id'] + ',' + str(lat) + ',' + str(lon))
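
One way to try the new filter in isolation, assuming it is run from the repository root and the script is executable, is the same invocation that tests/run.sh (later in this commit) uses against the bundled fixture:

```
# Feed the test fixture through the filter, as tests/run.sh does
cat tests/filter_wikidata_geo_tags.test1.txt | bin/filter_wikidata_geo_tags.py
# Output should match tests/filter_wikidata_geo_tags.test1expected.txt:
# the moon row is dropped and coordinates are rounded to 5 decimal places,
# e.g. the first line printed is 5009,25.13333,56.33333
```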
55 changes: 55 additions & 0 deletions bin/filter_wikidata_page.py
@@ -0,0 +1,55 @@
#!/usr/bin/env python3

'''
Input from STDIN
# MySQL schema inside the sql.gz file:
#
# CREATE TABLE `page` (
# `page_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
# `page_namespace` int(11) NOT NULL,
# `page_title` varbinary(255) NOT NULL,
# `page_restrictions` tinyblob DEFAULT NULL,
# `page_is_redirect` tinyint(3) unsigned NOT NULL DEFAULT 0,
# `page_is_new` tinyint(3) unsigned NOT NULL DEFAULT 0,
# `page_random` double unsigned NOT NULL,
# `page_touched` binary(14) NOT NULL,
# `page_links_updated` varbinary(14) DEFAULT NULL,
# `page_latest` int(10) unsigned NOT NULL,
# `page_len` int(10) unsigned NOT NULL,
# `page_content_model` varbinary(32) DEFAULT NULL,
# `page_lang` varbinary(35) DEFAULT NULL,
# page_lang isn't interesting, 'NULL' 99.999% of the time
Output to STDOUT: page_id, page_title
'''

import sys
import csv

reader = csv.DictReader(sys.stdin, fieldnames=[
    'page_id',
    'page_namespace',
    'page_title',
    'page_restrictions',
    'page_is_redirect',
    'page_is_new',
    'page_random',
    'page_touched',
    'page_links_updated',
    'page_latest',
    'page_len',
    'page_content_model',
    'page_lang'
])

for row in reader:
    # 0 are articles (99% of the input lines)
    if (row['page_namespace'] != '0'):
        continue

    # Some are special pages, not articles
    if (row['page_title'][0] != 'Q'):
        continue

    print(row['page_id'] + ',' + row['page_title'])
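
A minimal sketch of what the filter keeps, using a single hypothetical input row (not taken from a real dump): only namespace-0 pages whose title starts with Q are passed through, printed as page_id,page_title.

```
# Hypothetical one-row input in the CSV layout produced by bin/mysqldump_to_csv.py
printf '123,0,Q42,,0,0,0.5,20230101000000,NULL,1,100,wikibase-item,NULL\n' | \
  bin/filter_wikidata_page.py
# prints: 123,Q42
```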
52 changes: 52 additions & 0 deletions bin/filter_wikidata_wb_items_per_site.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python3

'''
Input from STDIN
# MySQL schema inside the sql.gz file:
#
# CREATE TABLE `wb_items_per_site` (
# `ips_row_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
# `ips_item_id` int(10) unsigned NOT NULL,
# `ips_site_id` varbinary(32) NOT NULL,
# `ips_site_page` varbinary(310) NOT NULL,
Output to STDOUT: item_id, site_id, site_page (title)
'''

import os
import sys
import csv

def get_languages():
    with open('config/languages.txt', 'r') as file:
        languages = file.readlines()
    languages = map(lambda line: line.strip('\n'), languages)
    languages = filter(lambda line: not line.startswith('#'), languages)
    return languages

# TODO: this ignores the environment variable that might be a subset
languages_set = set(get_languages())
if 'LANGUAGES' in os.environ:
    languages_set = set(os.environ['LANGUAGES'].split(','))

# print(languages_set, file=sys.stderr)


reader = csv.DictReader(sys.stdin, fieldnames=[
    'ips_row_id',
    'ips_item_id',
    'ips_site_id',
    'ips_site_page'
])
writer = csv.DictWriter(sys.stdout, fieldnames=['item_id', 'site_id', 'title'], dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    title = row['ips_site_page'].replace('\r', '')
    if len(title) == 0:
        continue

    language = row['ips_site_id'].replace('wiki', '')
    if language not in languages_set:
        continue

    writer.writerow({'item_id': row['ips_item_id'], 'site_id': row['ips_site_id'], 'title': title})
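
A minimal sketch of the language filtering with hypothetical input rows, assuming the script is run from the repository root (so config/languages.txt can be opened) and that the LANGUAGES environment variable is used to narrow the language set:

```
# Hypothetical two-row input; with LANGUAGES=de,en only the enwiki sitelink survives
printf '1,64,enwiki,Douglas Adams\n2,64,frwiki,Douglas Adams\n' | \
  LANGUAGES=de,en bin/filter_wikidata_wb_items_per_site.py
# prints: 64,enwiki,Douglas Adams
```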
28 changes: 0 additions & 28 deletions bin/round_coordinates.py

This file was deleted.

5 changes: 0 additions & 5 deletions install_dependencies.sh
@@ -27,12 +27,7 @@ sudo -u postgres createuser -s $USER


sudo apt-get install -y wget coreutils nodejs jq moreutils pigz

# https://github.com/wireservice/csvkit
# https://csvkit.readthedocs.io
sudo apt-get install -y python3-dev python3-pip python3-setuptools build-essential
pip install csvkit
sudo ln -s ~/.local/bin/csvcut /usr/local/bin/csvcut

# https://wdtaxonomy.readthedocs.io/
sudo apt-get install -y nodejs
19 changes: 5 additions & 14 deletions steps/wikidata_sql2csv.sh
Expand Up @@ -37,10 +37,7 @@ echo "wikidata_sql2csv geo_tags"
unpigz -c $DOWNLOADED_PATH/geo_tags.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
sed 's/\r\?//g' | \
grep ',earth,1,' | \
csvcut -c 2,5,6 | \
bin/round_coordinates.py | \
bin/filter_wikidata_geo_tags.py | \
pigz -9 \
> $CONVERTED_PATH/geo_tags.csv.gz

@@ -87,11 +84,7 @@ echo "wikidata_sql2csv page"
unpigz -c $DOWNLOADED_PATH/page.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
sed 's/\r\?//g' | \
csvcut -c 1,3,2 | \
grep -e ',0$' | \
sed 's/,0$//' | \
grep ',Q' | \
bin/filter_wikidata_page.py | \
pigz -9 \
> $CONVERTED_PATH/page.csv.gz

@@ -129,17 +122,15 @@ echo "wikidata_sql2csv wb_items_per_site"
# `ips_site_page` varbinary(310) NOT NULL,

# Only considering languages we need, cuts down 80m lines to 52m
LISTLANG=${LANGUAGES_ARRAY[@]}
# LISTLANG=${LANGUAGES_ARRAY[@]}
# ar bg ca cs da de en es
LANG_E_REGEX=",\(${LISTLANG// /\\|}\)wiki,"
# LANG_E_REGEX=",\(${LISTLANG// /\\|}\)wiki,"
# ,\(ar\|bg\|ca\|cs\|da\|de\|en...\)wiki,

unpigz -c $DOWNLOADED_PATH/wb_items_per_site.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
sed 's/\r\?//g' | \
grep -e "$LANG_E_REGEX" | \
csvcut -c 2,3,4 | \
bin/filter_wikidata_wb_items_per_site.py | \
pigz -9 \
> $CONVERTED_PATH/wb_items_per_site.csv.gz

7 changes: 7 additions & 0 deletions tests/filter_wikidata_geo_tags.test1.txt
@@ -0,0 +1,7 @@
158103,15923968,moon,1,29.63771000,111.17787000,,,,,,,NULL
158108,5009,earth,1,25.13333300,56.33333300,,,,,,,NULL
158109,5010,earth,1,-34.35805556,18.47194444,,,,,,,NULL
158112,5018,earth,1,54.08333333,13.38333333,,,,,,,NULL
158113,5020,earth,1,48.76194444,8.24083333,,,,,,,NULL
158120,5030,earth,1,54.67638889,13.43777778,,,,,,,NULL
158124,5034,earth,1,55.92140000,-3.53665000,,,,,,,NULL
6 changes: 6 additions & 0 deletions tests/filter_wikidata_geo_tags.test1expected.txt
@@ -0,0 +1,6 @@
5009,25.13333,56.33333
5010,-34.35806,18.47194
5018,54.08333,13.38333
5020,48.76194,8.24083
5030,54.67639,13.43778
5034,55.9214,-3.53665
5 changes: 0 additions & 5 deletions tests/round_coordinates.test1.txt

This file was deleted.

4 changes: 0 additions & 4 deletions tests/round_coordinates.test1expected.txt

This file was deleted.

4 changes: 2 additions & 2 deletions tests/run.sh
@@ -1,3 +1,3 @@
#!/bin/bash
cat tests/round_coordinates.test1.txt | bin/round_coordinates.py > out.txt
diff --brief out.txt tests/round_coordinates.test1expected.txt || exit 1
cat tests/filter_wikidata_geo_tags.test1.txt | bin/filter_wikidata_geo_tags.py > out.txt
diff --brief out.txt tests/filter_wikidata_geo_tags.test1expected.txt || exit 1
