From 1334b540caf2507615afb31a4d62503e845ce624 Mon Sep 17 00:00:00 2001
From: mtmail
Date: Tue, 18 Jul 2023 21:07:22 +0200
Subject: [PATCH] Remove csvcut (#66)

* replace csvcut with python scripts
---
 README.md                                     | 21 +++---
 bin/filter_wikidata_geo_tags.py               | 64 +++++++++++++++++++
 bin/filter_wikidata_page.py                   | 55 ++++++++++++++++
 bin/filter_wikidata_wb_items_per_site.py      | 52 +++++++++++++++
 bin/round_coordinates.py                      | 28 --------
 install_dependencies.sh                       |  5 --
 steps/wikidata_sql2csv.sh                     | 19 ++----
 tests/filter_wikidata_geo_tags.test1.txt      |  7 ++
 ...filter_wikidata_geo_tags.test1expected.txt |  6 ++
 tests/round_coordinates.test1.txt             |  5 --
 tests/round_coordinates.test1expected.txt     |  4 --
 tests/run.sh                                  |  4 +-
 12 files changed, 200 insertions(+), 70 deletions(-)
 create mode 100755 bin/filter_wikidata_geo_tags.py
 create mode 100755 bin/filter_wikidata_page.py
 create mode 100755 bin/filter_wikidata_wb_items_per_site.py
 delete mode 100755 bin/round_coordinates.py
 create mode 100644 tests/filter_wikidata_geo_tags.test1.txt
 create mode 100644 tests/filter_wikidata_geo_tags.test1expected.txt
 delete mode 100644 tests/round_coordinates.test1.txt
 delete mode 100644 tests/round_coordinates.test1expected.txt

diff --git a/README.md b/README.md
index b667cb4..c03959a 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ in the results match the search terms).
 
 Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per
 month. To run one build you need 420GB of disc space (of which 360GB is the PostgreSQL database). The scripts process
-39 languages and output 4 files. Runtime is approximately 14 hours on a 4 core, 4GB RAM machine with SSD
+39 languages and output 4 files. Runtime is approximately 13 hours on a 4 core, 4GB RAM machine with SSD
 discs.
 
 ```
@@ -264,24 +264,21 @@ uncommon for an export starting Jan/1st to only be fully ready Jan/20th.
      down the time (and space) needed in the database (database used to be 1TB before this step).
-   - Command-line tools are great for processing sequential data but piping data through 4
-     tools could be replaced by a single custom script later.
-
    - Most time is spent on the Pagelinks table
    ```
-   [language en] Page table      (0:22h)
-   [language en] Pagelinks table (3:00h)
-   [language en] langlinks table (0:05h)
-   [language en] redirect table  (0:02h)
+   [language en] Page table      (0:06h)
+   [language en] Pagelinks table (1:10h)
+   [language en] langlinks table (0:01h)
+   [language en] redirect table  (0:01h)
    ```
-6. wikidata_sql2csv (1h)
+6. wikidata_sql2csv (0:15h)
    ```
-   geo_tags          (0:02h)
-   page              (0:40h)
-   wb_items_per_site (0:20h)
+   geo_tags          (0:01h)
+   page              (0:09h)
+   wb_items_per_site (0:07h)
    ```
 7. wikipedia\_import, wikidata\_import (0:40h)
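The three new scripts introduced below all share one streaming pattern: read the CSV emitted by bin/mysqldump_to_csv.py from STDIN, drop rows that are not needed, and print a column subset to STDOUT. A minimal sketch of that pattern; the field names here are placeholders for illustration, not one of the real tables:

```
#!/usr/bin/env python3
# Sketch of the shared filter pattern; 'id', 'flag', 'value' are
# placeholder column names, not a real dump table.
import csv
import sys

reader = csv.DictReader(sys.stdin, fieldnames=['id', 'flag', 'value'])

for row in reader:
    # cheap string comparison first, so most rows exit early
    if row['flag'] != '1':
        continue
    print(row['id'] + ',' + row['value'])
```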
diff --git a/bin/filter_wikidata_geo_tags.py b/bin/filter_wikidata_geo_tags.py
new file mode 100755
index 0000000..d368223
--- /dev/null
+++ b/bin/filter_wikidata_geo_tags.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+'''
+Input from STDIN
+# MySQL schema inside the sql.gz file:
+#
+# CREATE TABLE `geo_tags` (
+#   `gt_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
+#   `gt_page_id` int(10) unsigned NOT NULL,
+#   `gt_globe` varbinary(32) NOT NULL,
+#   `gt_primary` tinyint(1) NOT NULL,
+#   `gt_lat` decimal(11,8) DEFAULT NULL,
+#   `gt_lon` decimal(11,8) DEFAULT NULL,
+#   `gt_dim` int(11) DEFAULT NULL,
+#   `gt_type` varbinary(32) DEFAULT NULL,
+#   `gt_name` varbinary(255) DEFAULT NULL,
+#   `gt_country` binary(2) DEFAULT NULL,
+#   `gt_region` varbinary(3) DEFAULT NULL,
+
+Output to STDOUT: gt_page_id, gt_lat, gt_lon
+'''
+
+import sys
+import csv
+
+reader = csv.DictReader(sys.stdin, fieldnames=[
+            'gt_id',
+            'gt_page_id',
+            'gt_globe',
+            'gt_primary',
+            'gt_lat',
+            'gt_lon',
+            'gt_dim',
+            'gt_type',
+            'gt_name',
+            'gt_country',
+            'gt_region'
+         ])
+
+for row in reader:
+    # There are places e.g. on the moon with coordinates
+    if (row['gt_globe'] != 'earth'):
+        continue
+
+    if (row['gt_primary'] != '1'):
+        continue
+
+    lat = float(row['gt_lat'])
+    lon = float(row['gt_lon'])
+
+    if (lat == 0 and lon == 0):
+        # print('skipping 0,0', file=sys.stderr)
+        continue
+
+    if (lat < -90 or lat > 90 or lon < -180 or lon > 180):
+        # print('skipping out of bounds', file=sys.stderr)
+        # print(lat, file=sys.stderr)
+        # print(lon, file=sys.stderr)
+        continue
+
+    lat = round(lat, 5)
+    lon = round(lon, 5)
+
+    print(row['gt_page_id'] + ',' + str(lat) + ',' + str(lon))
diff --git a/bin/filter_wikidata_page.py b/bin/filter_wikidata_page.py
new file mode 100755
index 0000000..b00a650
--- /dev/null
+++ b/bin/filter_wikidata_page.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+'''
+Input from STDIN
+# MySQL schema inside the sql.gz file:
+#
+# CREATE TABLE `page` (
+#   `page_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
+#   `page_namespace` int(11) NOT NULL,
+#   `page_title` varbinary(255) NOT NULL,
+#   `page_restrictions` tinyblob DEFAULT NULL,
+#   `page_is_redirect` tinyint(3) unsigned NOT NULL DEFAULT 0,
+#   `page_is_new` tinyint(3) unsigned NOT NULL DEFAULT 0,
+#   `page_random` double unsigned NOT NULL,
+#   `page_touched` binary(14) NOT NULL,
+#   `page_links_updated` varbinary(14) DEFAULT NULL,
+#   `page_latest` int(10) unsigned NOT NULL,
+#   `page_len` int(10) unsigned NOT NULL,
+#   `page_content_model` varbinary(32) DEFAULT NULL,
+#   `page_lang` varbinary(35) DEFAULT NULL,
+
+# page_lang isn't interesting, 'NULL' 99.999% of the time
+
+Output to STDOUT: page_id, page_title
+'''
+
+import sys
+import csv
+
+reader = csv.DictReader(sys.stdin, fieldnames=[
+            'page_id',
+            'page_namespace',
+            'page_title',
+            'page_restrictions',
+            'page_is_redirect',
+            'page_is_new',
+            'page_random',
+            'page_touched',
+            'page_links_updated',
+            'page_latest',
+            'page_len',
+            'page_content_model',
+            'page_lang'
+         ])
+
+for row in reader:
+    # namespace 0 are articles (99% of the input lines)
+    if (row['page_namespace'] != '0'):
+        continue
+
+    # Some are special pages, not articles
+    if (row['page_title'][0] != 'Q'):
+        continue
+
+    print(row['page_id'] + ',' + row['page_title'])
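To make the page filter's keep/drop rule concrete: in the wikidatawiki dump, only namespace 0 rows whose title is a Q-id (the Wikidata items) survive. A self-contained check of the same rule, with invented sample rows:

```
# Same predicate as bin/filter_wikidata_page.py, applied to made-up rows
rows = [
    {'page_id': '123', 'page_namespace': '0', 'page_title': 'Q42'},       # item page: kept
    {'page_id': '124', 'page_namespace': '0', 'page_title': 'Main_Page'}, # no Q-id: dropped
    {'page_id': '125', 'page_namespace': '4', 'page_title': 'Q99'},       # wrong namespace: dropped
]

kept = [r for r in rows
        if r['page_namespace'] == '0' and r['page_title'][0] == 'Q']
assert [r['page_id'] for r in kept] == ['123']
```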
diff --git a/bin/filter_wikidata_wb_items_per_site.py b/bin/filter_wikidata_wb_items_per_site.py
new file mode 100755
index 0000000..047e4f9
--- /dev/null
+++ b/bin/filter_wikidata_wb_items_per_site.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+'''
+Input from STDIN
+# MySQL schema inside the sql.gz file:
+#
+# CREATE TABLE `wb_items_per_site` (
+#   `ips_row_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
+#   `ips_item_id` int(10) unsigned NOT NULL,
+#   `ips_site_id` varbinary(32) NOT NULL,
+#   `ips_site_page` varbinary(310) NOT NULL,
+
+Output to STDOUT: item_id, site_id, site_page (title)
+'''
+
+import os
+import sys
+import csv
+
+def get_languages():
+    with open('config/languages.txt', 'r') as file:
+        languages = file.readlines()
+        languages = map(lambda line: line.strip('\n'), languages)
+        languages = filter(lambda line: not line.startswith('#'), languages)
+        return languages
+
+# Default to all configured languages; the LANGUAGES environment
+# variable, if set, narrows this to a subset (e.g. 'de,en')
+languages_set = set(get_languages())
+if 'LANGUAGES' in os.environ:
+    languages_set = set(os.environ['LANGUAGES'].split(','))
+
+# print(languages_set, file=sys.stderr)
+
+
+reader = csv.DictReader(sys.stdin, fieldnames=[
+            'ips_row_id',
+            'ips_item_id',
+            'ips_site_id',
+            'ips_site_page'
+         ])
+writer = csv.DictWriter(sys.stdout, fieldnames=['item_id', 'site_id', 'title'], dialect='unix', quoting=csv.QUOTE_MINIMAL)
+
+for row in reader:
+    title = row['ips_site_page'].replace('\r', '')
+    if len(title) == 0:
+        continue
+
+    language = row['ips_site_id'].replace('wiki', '')
+    if language not in languages_set:
+        continue
+
+    writer.writerow({'item_id': row['ips_item_id'], 'site_id': row['ips_site_id'], 'title': title})
diff --git a/bin/round_coordinates.py b/bin/round_coordinates.py
deleted file mode 100755
index 24da2a2..0000000
--- a/bin/round_coordinates.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env python3
-
-'''
-'''
-
-import sys
-import csv
-
-reader = csv.DictReader(sys.stdin, fieldnames=['page_id', 'lat', 'lon'])
-
-for row in reader:
-    lat = float(row['lat'])
-    lon = float(row['lon'])
-
-    if (row['lat'] == 0 and row['lon'] == 0):
-        # print('skipping 0,0', file=sys.stderr)
-        continue
-
-    if (lat < -90 or lat > 90 or lon < -180 or lon > 180):
-        # print('skipping out of bounds', file=sys.stderr)
-        # print(lat, file=sys.stderr)
-        # print(lon, file=sys.stderr)
-        continue
-
-    lat = round(lat, 5)
-    lon = round(lon, 5)
-
-    print(row['page_id'] + ',' + str(lat) + ',' + str(lon))
diff --git a/install_dependencies.sh b/install_dependencies.sh
index 0bb2ca9..9f42a63 100755
--- a/install_dependencies.sh
+++ b/install_dependencies.sh
@@ -27,12 +27,7 @@ sudo -u postgres createuser -s $USER
 
 sudo apt-get install -y wget coreutils nodejs jq moreutils pigz
 
-
-# https://github.com/wireservice/csvkit
-# https://csvkit.readthedocs.io
 sudo apt-get install -y python3-dev python3-pip python3-setuptools build-essential
-pip install csvkit
-sudo ln -s ~/.local/bin/csvcut /usr/local/bin/csvcut
 
 # https://wdtaxonomy.readthedocs.io/
 sudo apt-get install -y nodejs
diff --git a/steps/wikidata_sql2csv.sh b/steps/wikidata_sql2csv.sh
index e27c154..695d211 100755
--- a/steps/wikidata_sql2csv.sh
+++ b/steps/wikidata_sql2csv.sh
@@ -37,10 +37,7 @@ echo "wikidata_sql2csv geo_tags"
 unpigz -c $DOWNLOADED_PATH/geo_tags.sql.gz | \
 python3 bin/mysqldump_to_csv.py | \
 sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-grep ',earth,1,' | \
-csvcut -c 2,5,6 | \
-bin/round_coordinates.py | \
+bin/filter_wikidata_geo_tags.py | \
 pigz -9 \
 > $CONVERTED_PATH/geo_tags.csv.gz
 
@@ -87,11 +84,7 @@ echo "wikidata_sql2csv page"
 unpigz -c $DOWNLOADED_PATH/page.sql.gz | \
 python3 bin/mysqldump_to_csv.py | \
 sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-csvcut -c 1,3,2 | \
-grep -e ',0$' | \
-sed 's/,0$//' | \
-grep ',Q' | \
+bin/filter_wikidata_page.py | \
 pigz -9 \
 > $CONVERTED_PATH/page.csv.gz
 
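The wb_items_per_site hunk below hands the language filtering (formerly the grep regex) to the Python script above. A smoke-test sketch, assuming it is run from the repository root so config/languages.txt is readable; the two sample rows are invented:

```
import os
import subprocess

sample = '1,17,"enwiki","Japan"\n2,18,"frwiki","Japon"\n'
env = dict(os.environ, LANGUAGES='de,en')  # override config/languages.txt with a subset

out = subprocess.run(
    ['bin/filter_wikidata_wb_items_per_site.py'],
    input=sample, env=env, capture_output=True, text=True, check=True,
).stdout

assert out == '17,enwiki,Japan\n'  # frwiki row filtered out
```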
@@ -129,17 +122,15 @@ echo "wikidata_sql2csv wb_items_per_site"
 #   `ips_site_page` varbinary(310) NOT NULL,
 
 # Only considering languages we need, cuts down 80m lines to 52m
-LISTLANG=${LANGUAGES_ARRAY[@]}
+# LISTLANG=${LANGUAGES_ARRAY[@]}
 # ar bg ca cs da de en es
-LANG_E_REGEX=",\(${LISTLANG// /\\|}\)wiki,"
+# LANG_E_REGEX=",\(${LISTLANG// /\\|}\)wiki,"
 # ,\(ar\|bg\|ca\|cs\|da\|de\|en...\)wiki,
 
 unpigz -c $DOWNLOADED_PATH/wb_items_per_site.sql.gz | \
 python3 bin/mysqldump_to_csv.py | \
 sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-grep -e "$LANG_E_REGEX" | \
-csvcut -c 2,3,4 | \
+bin/filter_wikidata_wb_items_per_site.py | \
 pigz -9 \
 > $CONVERTED_PATH/wb_items_per_site.csv.gz
diff --git a/tests/filter_wikidata_geo_tags.test1.txt b/tests/filter_wikidata_geo_tags.test1.txt
new file mode 100644
index 0000000..5141b6e
--- /dev/null
+++ b/tests/filter_wikidata_geo_tags.test1.txt
@@ -0,0 +1,7 @@
+158103,15923968,moon,1,29.63771000,111.17787000,,,,,,,NULL
+158108,5009,earth,1,25.13333300,56.33333300,,,,,,,NULL
+158109,5010,earth,1,-34.35805556,18.47194444,,,,,,,NULL
+158112,5018,earth,1,54.08333333,13.38333333,,,,,,,NULL
+158113,5020,earth,1,48.76194444,8.24083333,,,,,,,NULL
+158120,5030,earth,1,54.67638889,13.43777778,,,,,,,NULL
+158124,5034,earth,1,55.92140000,-3.53665000,,,,,,,NULL
\ No newline at end of file
diff --git a/tests/filter_wikidata_geo_tags.test1expected.txt b/tests/filter_wikidata_geo_tags.test1expected.txt
new file mode 100644
index 0000000..6ba888b
--- /dev/null
+++ b/tests/filter_wikidata_geo_tags.test1expected.txt
@@ -0,0 +1,6 @@
+5009,25.13333,56.33333
+5010,-34.35806,18.47194
+5018,54.08333,13.38333
+5020,48.76194,8.24083
+5030,54.67639,13.43778
+5034,55.9214,-3.53665
diff --git a/tests/round_coordinates.test1.txt b/tests/round_coordinates.test1.txt
deleted file mode 100644
index 9f45a61..0000000
--- a/tests/round_coordinates.test1.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-4175,43.1924,-81.3158
-4176,-91,140.1
-4180,-26.0,121.0
-4181,43.08333333,2.41666667
-4187,51.76055556,14.33416667
\ No newline at end of file
diff --git a/tests/round_coordinates.test1expected.txt b/tests/round_coordinates.test1expected.txt
deleted file mode 100644
index f3c877d..0000000
--- a/tests/round_coordinates.test1expected.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-4175,43.1924,-81.3158
-4180,-26.0,121.0
-4181,43.08333,2.41667
-4187,51.76056,14.33417
diff --git a/tests/run.sh b/tests/run.sh
index e4c0918..e0b7dd6 100755
--- a/tests/run.sh
+++ b/tests/run.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
-cat tests/round_coordinates.test1.txt | bin/round_coordinates.py > out.txt
-diff --brief out.txt tests/round_coordinates.test1expected.txt || exit 1
+cat tests/filter_wikidata_geo_tags.test1.txt | bin/filter_wikidata_geo_tags.py > out.txt
+diff --brief out.txt tests/filter_wikidata_geo_tags.test1expected.txt || exit 1
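One detail worth noting about the expected output above: round() followed by str() drops trailing zeros, which is why 55.92140000 becomes 55.9214 rather than 55.92140. A quick check using values from the fixture:

```
# values taken from tests/filter_wikidata_geo_tags.test1.txt and its expected output
for raw, want in [
    ('25.13333300', '25.13333'),
    ('13.43777778', '13.43778'),
    ('55.92140000', '55.9214'),   # trailing zero dropped by str()
]:
    assert str(round(float(raw), 5)) == want
```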