From 1334b540caf2507615afb31a4d62503e845ce624 Mon Sep 17 00:00:00 2001
From: mtmail
Date: Tue, 18 Jul 2023 21:07:22 +0200
Subject: [PATCH] Remove csvcut (#66)

* replace csvcut with python scripts
---
 README.md                                     | 21 +++---
 bin/filter_wikidata_geo_tags.py               | 64 +++++++++++++++++++
 bin/filter_wikidata_page.py                   | 55 ++++++++++++++++
 bin/filter_wikidata_wb_items_per_site.py      | 52 +++++++++++++++
 bin/round_coordinates.py                      | 28 --------
 install_dependencies.sh                       |  5 --
 steps/wikidata_sql2csv.sh                     | 19 ++----
 tests/filter_wikidata_geo_tags.test1.txt      |  7 ++
 ...filter_wikidata_geo_tags.test1expected.txt |  6 ++
 tests/round_coordinates.test1.txt             |  5 --
 tests/round_coordinates.test1expected.txt     |  4 --
 tests/run.sh                                  |  4 +-
 12 files changed, 200 insertions(+), 70 deletions(-)
 create mode 100755 bin/filter_wikidata_geo_tags.py
 create mode 100755 bin/filter_wikidata_page.py
 create mode 100755 bin/filter_wikidata_wb_items_per_site.py
 delete mode 100755 bin/round_coordinates.py
 create mode 100644 tests/filter_wikidata_geo_tags.test1.txt
 create mode 100644 tests/filter_wikidata_geo_tags.test1expected.txt
 delete mode 100644 tests/round_coordinates.test1.txt
 delete mode 100644 tests/round_coordinates.test1expected.txt

diff --git a/README.md b/README.md
index b667cb4..c03959a 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ in the results match the search terms).
 
 Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per
 month. To run one build you need 420GB of disc space (of which 360GB is the PostgreSQL database). The scripts process
-39 languages and output 4 files. Runtime is approximately 14 hours on a 4 core, 4GB RAM machine with SSD
+39 languages and output 4 files. Runtime is approximately 13 hours on a 4 core, 4GB RAM machine with SSD
 discs.
 
 ```
@@ -264,24 +264,21 @@ uncommon for an export starting Jan/1st to only be fully ready Jan/20th.
      down the time (and space) needed in the database (database used to be 1TB before this step).
-   - Command-line tools are great for processing sequential data but piping data through 4
-     tools could be replaced by a single custom script later.
-
    - Most time is spent on the Pagelinks table
    ```
-   [language en] Page table      (0:22h)
-   [language en] Pagelinks table (3:00h)
-   [language en] langlinks table (0:05h)
-   [language en] redirect table  (0:02h)
+   [language en] Page table      (0:06h)
+   [language en] Pagelinks table (1:10h)
+   [language en] langlinks table (0:01h)
+   [language en] redirect table  (0:01h)
    ```
-6. wikidata_sql2csv (1h)
+6. wikidata_sql2csv (0:15h)
    ```
-   geo_tags          (0:02h)
-   page              (0:40h)
-   wb_items_per_site (0:20h)
+   geo_tags          (0:01h)
+   page              (0:09h)
+   wb_items_per_site (0:07h)
    ```
 7. wikipedia\_import, wikidata\_import (0:40h)
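The three new scripts introduced below all share one streaming pattern: read the CSV emitted by bin/mysqldump_to_csv.py from STDIN, drop rows that are not needed, and print a column subset to STDOUT. A minimal sketch of that pattern; the field names here are placeholders for illustration, not one of the real tables:

```
#!/usr/bin/env python3
# Sketch of the shared filter pattern; 'id', 'flag', 'value' are
# placeholder column names, not a real dump table.
import csv
import sys

reader = csv.DictReader(sys.stdin, fieldnames=['id', 'flag', 'value'])

for row in reader:
    # cheap string comparison first, so most rows exit early
    if row['flag'] != '1':
        continue
    print(row['id'] + ',' + row['value'])
```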
diff --git a/bin/filter_wikidata_geo_tags.py b/bin/filter_wikidata_geo_tags.py
new file mode 100755
index 0000000..d368223
--- /dev/null
+++ b/bin/filter_wikidata_geo_tags.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+'''
+Input from STDIN
+# MySQL schema inside the sql.gz file:
+#
+# CREATE TABLE `geo_tags` (
+#   `gt_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
+#   `gt_page_id` int(10) unsigned NOT NULL,
+#   `gt_globe` varbinary(32) NOT NULL,
+#   `gt_primary` tinyint(1) NOT NULL,
+#   `gt_lat` decimal(11,8) DEFAULT NULL,
+#   `gt_lon` decimal(11,8) DEFAULT NULL,
+#   `gt_dim` int(11) DEFAULT NULL,
+#   `gt_type` varbinary(32) DEFAULT NULL,
+#   `gt_name` varbinary(255) DEFAULT NULL,
+#   `gt_country` binary(2) DEFAULT NULL,
+#   `gt_region` varbinary(3) DEFAULT NULL,
+
+Output to STDOUT: gt_page_id, gt_lat, gt_lon
+'''
+
+import sys
+import csv
+
+reader = csv.DictReader(sys.stdin, fieldnames=[
+            'gt_id',
+            'gt_page_id',
+            'gt_globe',
+            'gt_primary',
+            'gt_lat',
+            'gt_lon',
+            'gt_dim',
+            'gt_type',
+            'gt_name',
+            'gt_country',
+            'gt_region'
+         ])
+
+for row in reader:
+    # There are places e.g. on the moon with coordinates
+    if (row['gt_globe'] != 'earth'):
+        continue
+
+    if (row['gt_primary'] != '1'):
+        continue
+
+    lat = float(row['gt_lat'])
+    lon = float(row['gt_lon'])
+
+    if (lat == 0 and lon == 0):
+        # print('skipping 0,0', file=sys.stderr)
+        continue
+
+    if (lat < -90 or lat > 90 or lon < -180 or lon > 180):
+        # print('skipping out of bounds', file=sys.stderr)
+        # print(lat, file=sys.stderr)
+        # print(lon, file=sys.stderr)
+        continue
+
+    lat = round(lat, 5)
+    lon = round(lon, 5)
+
+    print(row['gt_page_id'] + ',' + str(lat) + ',' + str(lon))
diff --git a/bin/filter_wikidata_page.py b/bin/filter_wikidata_page.py
new file mode 100755
index 0000000..b00a650
--- /dev/null
+++ b/bin/filter_wikidata_page.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+'''
+Input from STDIN
+# MySQL schema inside the sql.gz file:
+#
+# CREATE TABLE `page` (
+#   `page_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
+#   `page_namespace` int(11) NOT NULL,
+#   `page_title` varbinary(255) NOT NULL,
+#   `page_restrictions` tinyblob DEFAULT NULL,
+#   `page_is_redirect` tinyint(3) unsigned NOT NULL DEFAULT 0,
+#   `page_is_new` tinyint(3) unsigned NOT NULL DEFAULT 0,
+#   `page_random` double unsigned NOT NULL,
+#   `page_touched` binary(14) NOT NULL,
+#   `page_links_updated` varbinary(14) DEFAULT NULL,
+#   `page_latest` int(10) unsigned NOT NULL,
+#   `page_len` int(10) unsigned NOT NULL,
+#   `page_content_model` varbinary(32) DEFAULT NULL,
+#   `page_lang` varbinary(35) DEFAULT NULL,
+
+# page_lang isn't interesting, 'NULL' 99.999% of the time
+
+Output to STDOUT: page_id, page_title
+'''
+
+import sys
+import csv
+
+reader = csv.DictReader(sys.stdin, fieldnames=[
+            'page_id',
+            'page_namespace',
+            'page_title',
+            'page_restrictions',
+            'page_is_redirect',
+            'page_is_new',
+            'page_random',
+            'page_touched',
+            'page_links_updated',
+            'page_latest',
+            'page_len',
+            'page_content_model',
+            'page_lang'
+         ])
+
+for row in reader:
+    # namespace 0 are articles (99% of the input lines)
+    if (row['page_namespace'] != '0'):
+        continue
+
+    # Some are special pages, not articles
+    if (row['page_title'][0] != 'Q'):
+        continue
+
+    print(row['page_id'] + ',' + row['page_title'])
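To make the page filter's keep/drop rule concrete: in the wikidatawiki dump, only namespace 0 rows whose title is a Q-id (the Wikidata items) survive. A self-contained check of the same rule, with invented sample rows:

```
# Same predicate as bin/filter_wikidata_page.py, applied to made-up rows
rows = [
    {'page_id': '123', 'page_namespace': '0', 'page_title': 'Q42'},       # item page: kept
    {'page_id': '124', 'page_namespace': '0', 'page_title': 'Main_Page'}, # no Q-id: dropped
    {'page_id': '125', 'page_namespace': '4', 'page_title': 'Q99'},       # wrong namespace: dropped
]

kept = [r for r in rows
        if r['page_namespace'] == '0' and r['page_title'][0] == 'Q']
assert [r['page_id'] for r in kept] == ['123']
```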
diff --git a/bin/filter_wikidata_wb_items_per_site.py b/bin/filter_wikidata_wb_items_per_site.py
new file mode 100755
index 0000000..047e4f9
--- /dev/null
+++ b/bin/filter_wikidata_wb_items_per_site.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+'''
+Input from STDIN
+# MySQL schema inside the sql.gz file:
+#
+# CREATE TABLE `wb_items_per_site` (
+#   `ips_row_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
+#   `ips_item_id` int(10) unsigned NOT NULL,
+#   `ips_site_id` varbinary(32) NOT NULL,
+#   `ips_site_page` varbinary(310) NOT NULL,
+
+Output to STDOUT: item_id, site_id, site_page (title)
+'''
+
+import os
+import sys
+import csv
+
+def get_languages():
+    with open('config/languages.txt', 'r') as file:
+        languages = file.readlines()
+        languages = map(lambda line: line.strip('\n'), languages)
+        languages = filter(lambda line: not line.startswith('#'), languages)
+        return languages
+
+# Default to all configured languages; the LANGUAGES environment
+# variable, if set, narrows this to a subset (e.g. 'de,en')
+languages_set = set(get_languages())
+if 'LANGUAGES' in os.environ:
+    languages_set = set(os.environ['LANGUAGES'].split(','))
+
+# print(languages_set, file=sys.stderr)
+
+
+reader = csv.DictReader(sys.stdin, fieldnames=[
+            'ips_row_id',
+            'ips_item_id',
+            'ips_site_id',
+            'ips_site_page'
+         ])
+writer = csv.DictWriter(sys.stdout, fieldnames=['item_id', 'site_id', 'title'], dialect='unix', quoting=csv.QUOTE_MINIMAL)
+
+for row in reader:
+    title = row['ips_site_page'].replace('\r', '')
+    if len(title) == 0:
+        continue
+
+    language = row['ips_site_id'].replace('wiki', '')
+    if language not in languages_set:
+        continue
+
+    writer.writerow({'item_id': row['ips_item_id'], 'site_id': row['ips_site_id'], 'title': title})
diff --git a/bin/round_coordinates.py b/bin/round_coordinates.py
deleted file mode 100755
index 24da2a2..0000000
--- a/bin/round_coordinates.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env python3
-
-'''
-'''
-
-import sys
-import csv
-
-reader = csv.DictReader(sys.stdin, fieldnames=['page_id', 'lat', 'lon'])
-
-for row in reader:
-    lat = float(row['lat'])
-    lon = float(row['lon'])
-
-    if (row['lat'] == 0 and row['lon'] == 0):
-        # print('skipping 0,0', file=sys.stderr)
-        continue
-
-    if (lat < -90 or lat > 90 or lon < -180 or lon > 180):
-        # print('skipping out of bounds', file=sys.stderr)
-        # print(lat, file=sys.stderr)
-        # print(lon, file=sys.stderr)
-        continue
-
-    lat = round(lat, 5)
-    lon = round(lon, 5)
-
-    print(row['page_id'] + ',' + str(lat) + ',' + str(lon))
diff --git a/install_dependencies.sh b/install_dependencies.sh
index 0bb2ca9..9f42a63 100755
--- a/install_dependencies.sh
+++ b/install_dependencies.sh
@@ -27,12 +27,7 @@ sudo -u postgres createuser -s $USER
 
 sudo apt-get install -y wget coreutils nodejs jq moreutils pigz
 
-
-# https://github.com/wireservice/csvkit
-# https://csvkit.readthedocs.io
 sudo apt-get install -y python3-dev python3-pip python3-setuptools build-essential
-pip install csvkit
-sudo ln -s ~/.local/bin/csvcut /usr/local/bin/csvcut
 
 # https://wdtaxonomy.readthedocs.io/
 sudo apt-get install -y nodejs
diff --git a/steps/wikidata_sql2csv.sh b/steps/wikidata_sql2csv.sh
index e27c154..695d211 100755
--- a/steps/wikidata_sql2csv.sh
+++ b/steps/wikidata_sql2csv.sh
@@ -37,10 +37,7 @@ echo "wikidata_sql2csv geo_tags"
 unpigz -c $DOWNLOADED_PATH/geo_tags.sql.gz | \
 python3 bin/mysqldump_to_csv.py | \
 sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-grep ',earth,1,' | \
-csvcut -c 2,5,6 | \
-bin/round_coordinates.py | \
+bin/filter_wikidata_geo_tags.py | \
 pigz -9 \
 > $CONVERTED_PATH/geo_tags.csv.gz
 
@@ -87,11 +84,7 @@ echo "wikidata_sql2csv page"
 unpigz -c $DOWNLOADED_PATH/page.sql.gz | \
 python3 bin/mysqldump_to_csv.py | \
 sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-csvcut -c 1,3,2 | \
-grep -e ',0$' | \
-sed 's/,0$//' | \
-grep ',Q' | \
+bin/filter_wikidata_page.py | \
 pigz -9 \
 > $CONVERTED_PATH/page.csv.gz
 
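The wb_items_per_site hunk below hands the language filtering (formerly the grep regex) to the Python script above. A smoke-test sketch, assuming it is run from the repository root so config/languages.txt is readable; the two sample rows are invented:

```
import os
import subprocess

sample = '1,17,"enwiki","Japan"\n2,18,"frwiki","Japon"\n'
env = dict(os.environ, LANGUAGES='de,en')  # override config/languages.txt with a subset

out = subprocess.run(
    ['bin/filter_wikidata_wb_items_per_site.py'],
    input=sample, env=env, capture_output=True, text=True, check=True,
).stdout

assert out == '17,enwiki,Japan\n'  # frwiki row filtered out
```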
@@ -129,17 +122,15 @@ echo "wikidata_sql2csv wb_items_per_site"
 #   `ips_site_page` varbinary(310) NOT NULL,
 
 # Only considering languages we need, cuts down 80m lines to 52m
-LISTLANG=${LANGUAGES_ARRAY[@]}
+# LISTLANG=${LANGUAGES_ARRAY[@]}
 # ar bg ca cs da de en es
-LANG_E_REGEX=",\(${LISTLANG// /\\|}\)wiki,"
+# LANG_E_REGEX=",\(${LISTLANG// /\\|}\)wiki,"
 # ,\(ar\|bg\|ca\|cs\|da\|de\|en...\)wiki,
 
 unpigz -c $DOWNLOADED_PATH/wb_items_per_site.sql.gz | \
 python3 bin/mysqldump_to_csv.py | \
 sed 's/\x0//g' | \
-sed 's/\r\?//g' | \
-grep -e "$LANG_E_REGEX" | \
-csvcut -c 2,3,4 | \
+bin/filter_wikidata_wb_items_per_site.py | \
 pigz -9 \
 > $CONVERTED_PATH/wb_items_per_site.csv.gz
diff --git a/tests/filter_wikidata_geo_tags.test1.txt b/tests/filter_wikidata_geo_tags.test1.txt
new file mode 100644
index 0000000..5141b6e
--- /dev/null
+++ b/tests/filter_wikidata_geo_tags.test1.txt
@@ -0,0 +1,7 @@
+158103,15923968,moon,1,29.63771000,111.17787000,,,,,,,NULL
+158108,5009,earth,1,25.13333300,56.33333300,,,,,,,NULL
+158109,5010,earth,1,-34.35805556,18.47194444,,,,,,,NULL
+158112,5018,earth,1,54.08333333,13.38333333,,,,,,,NULL
+158113,5020,earth,1,48.76194444,8.24083333,,,,,,,NULL
+158120,5030,earth,1,54.67638889,13.43777778,,,,,,,NULL
+158124,5034,earth,1,55.92140000,-3.53665000,,,,,,,NULL
\ No newline at end of file
diff --git a/tests/filter_wikidata_geo_tags.test1expected.txt b/tests/filter_wikidata_geo_tags.test1expected.txt
new file mode 100644
index 0000000..6ba888b
--- /dev/null
+++ b/tests/filter_wikidata_geo_tags.test1expected.txt
@@ -0,0 +1,6 @@
+5009,25.13333,56.33333
+5010,-34.35806,18.47194
+5018,54.08333,13.38333
+5020,48.76194,8.24083
+5030,54.67639,13.43778
+5034,55.9214,-3.53665
diff --git a/tests/round_coordinates.test1.txt b/tests/round_coordinates.test1.txt
deleted file mode 100644
index 9f45a61..0000000
--- a/tests/round_coordinates.test1.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-4175,43.1924,-81.3158
-4176,-91,140.1
-4180,-26.0,121.0
-4181,43.08333333,2.41666667
-4187,51.76055556,14.33416667
\ No newline at end of file
diff --git a/tests/round_coordinates.test1expected.txt b/tests/round_coordinates.test1expected.txt
deleted file mode 100644
index f3c877d..0000000
--- a/tests/round_coordinates.test1expected.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-4175,43.1924,-81.3158
-4180,-26.0,121.0
-4181,43.08333,2.41667
-4187,51.76056,14.33417
diff --git a/tests/run.sh b/tests/run.sh
index e4c0918..e0b7dd6 100755
--- a/tests/run.sh
+++ b/tests/run.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
-cat tests/round_coordinates.test1.txt | bin/round_coordinates.py > out.txt
-diff --brief out.txt tests/round_coordinates.test1expected.txt || exit 1
+cat tests/filter_wikidata_geo_tags.test1.txt | bin/filter_wikidata_geo_tags.py > out.txt
+diff --brief out.txt tests/filter_wikidata_geo_tags.test1expected.txt || exit 1
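One detail worth noting about the expected output above: round() followed by str() drops trailing zeros, which is why 55.92140000 becomes 55.9214 rather than 55.92140. A quick check using values from the fixture:

```
# values taken from tests/filter_wikidata_geo_tags.test1.txt and its expected output
for raw, want in [
    ('25.13333300', '25.13333'),
    ('13.43777778', '13.43778'),
    ('55.92140000', '55.9214'),   # trailing zero dropped by str()
]:
    assert str(round(float(raw), 5)) == want
```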