Commit 1334b54

Remove csvcut (#66)
* replace csvcut with python scripts
mtmail authored Jul 18, 2023
1 parent a581bf6 commit 1334b54
Showing 12 changed files with 200 additions and 70 deletions.
21 changes: 9 additions & 12 deletions README.md
@@ -22,7 +22,7 @@ in the results match the search terms).
Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month.

To run one build you need 420GB of disc space (of which 360GB is the Postgresql database). The scripts process
39 languages and output 4 files. Runtime is approximately 14 hours on a 4 core, 4GB RAM machine with SSD
39 languages and output 4 files. Runtime is approximately 13 hours on a 4 core, 4GB RAM machine with SSD
discs.

```
@@ -264,24 +264,21 @@ uncommon for an export starting Jan/1st to only be fully ready Jan/20th.
down the time (and space) needed in the database (database used to be 1TB before
this step).
Command-line tools are great for processing sequential data, but piping data through 4
tools could be replaced by a single custom script later.
Most time is spent on the Pagelinks table:
```
[language en] Page table (0:22h)
[language en] Pagelinks table (3:00h)
[language en] langlinks table (0:05h)
[language en] redirect table (0:02h)
[language en] Page table (0:06h)
[language en] Pagelinks table (1:10h)
[language en] langlinks table (0:01h)
[language en] redirect table (0:01h)
```
6. wikidata_sql2csv (1h)
6. wikidata_sql2csv (0:15h)
```
geo_tags (0:02h)
page (0:40h)
wb_items_per_site (0:20h)
geo_tags (0:01h)
page (0:09h)
wb_items_per_site (0:07h)
```
7. wikipedia\_import, wikidata\_import (0:40h)
64 changes: 64 additions & 0 deletions bin/filter_wikidata_geo_tags.py
@@ -0,0 +1,64 @@
#!/usr/bin/env python3

'''
Input from STDIN
# MySQL schema inside the sql.gz file:
#
# CREATE TABLE `geo_tags` (
# `gt_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
# `gt_page_id` int(10) unsigned NOT NULL,
# `gt_globe` varbinary(32) NOT NULL,
# `gt_primary` tinyint(1) NOT NULL,
# `gt_lat` decimal(11,8) DEFAULT NULL,
# `gt_lon` decimal(11,8) DEFAULT NULL,
# `gt_dim` int(11) DEFAULT NULL,
# `gt_type` varbinary(32) DEFAULT NULL,
# `gt_name` varbinary(255) DEFAULT NULL,
# `gt_country` binary(2) DEFAULT NULL,
# `gt_region` varbinary(3) DEFAULT NULL,
Output to STDOUT: gt_page_id, gt_lat, gt_lon
'''

import sys
import csv

reader = csv.DictReader(sys.stdin, fieldnames=[
    'gt_id',
    'gt_page_id',
    'gt_globe',
    'gt_primary',
    'gt_lat',
    'gt_lon',
    'gt_dim',
    'gt_type',
    'gt_name',
    'gt_country',
    'gt_region'
])

for row in reader:
    # There are places e.g. on the moon with coordinates
    if (row['gt_globe'] != 'earth'):
        continue

    if (row['gt_primary'] != '1'):
        continue

    lat = float(row['gt_lat'])
    lon = float(row['gt_lon'])

    if (lat == 0 and lon == 0):
        # print('skipping 0,0', file=sys.stderr)
        continue

    if (lat < -90 or lat > 90 or lon < -180 or lon > 180):
        # print('skipping out of bounds', file=sys.stderr)
        # print(lat, file=sys.stderr)
        # print(lon, file=sys.stderr)
        continue

    lat = round(lat, 5)
    lon = round(lon, 5)

    print(row['gt_page_id'] + ',' + str(lat) + ',' + str(lon))
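
One way to try the new filter in isolation, assuming it is run from the repository root and the script is executable, is the same invocation that tests/run.sh (later in this commit) uses against the bundled fixture:

```
# Feed the test fixture through the filter, as tests/run.sh does
cat tests/filter_wikidata_geo_tags.test1.txt | bin/filter_wikidata_geo_tags.py
# Output should match tests/filter_wikidata_geo_tags.test1expected.txt:
# the moon row is dropped and coordinates are rounded to 5 decimal places,
# e.g. the first line printed is 5009,25.13333,56.33333
```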
55 changes: 55 additions & 0 deletions bin/filter_wikidata_page.py
@@ -0,0 +1,55 @@
#!/usr/bin/env python3

'''
Input from STDIN
# MySQL schema inside the sql.gz file:
#
# CREATE TABLE `page` (
# `page_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
# `page_namespace` int(11) NOT NULL,
# `page_title` varbinary(255) NOT NULL,
# `page_restrictions` tinyblob DEFAULT NULL,
# `page_is_redirect` tinyint(3) unsigned NOT NULL DEFAULT 0,
# `page_is_new` tinyint(3) unsigned NOT NULL DEFAULT 0,
# `page_random` double unsigned NOT NULL,
# `page_touched` binary(14) NOT NULL,
# `page_links_updated` varbinary(14) DEFAULT NULL,
# `page_latest` int(10) unsigned NOT NULL,
# `page_len` int(10) unsigned NOT NULL,
# `page_content_model` varbinary(32) DEFAULT NULL,
# `page_lang` varbinary(35) DEFAULT NULL,
# page_lang isn't interesting, 'NULL' 99.999% of the time
Output to STDOUT: page_id, page_title
'''

import sys
import csv

reader = csv.DictReader(sys.stdin, fieldnames=[
    'page_id',
    'page_namespace',
    'page_title',
    'page_restrictions',
    'page_is_redirect',
    'page_is_new',
    'page_random',
    'page_touched',
    'page_links_updated',
    'page_latest',
    'page_len',
    'page_content_model',
    'page_lang'
])

for row in reader:
    # 0 are articles (99% of the input lines)
    if (row['page_namespace'] != '0'):
        continue

    # Some are special pages, not articles
    if (row['page_title'][0] != 'Q'):
        continue

    print(row['page_id'] + ',' + row['page_title'])
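
A minimal sketch of what the filter keeps, using a single hypothetical input row (not taken from a real dump): only namespace-0 pages whose title starts with Q are passed through, printed as page_id,page_title.

```
# Hypothetical one-row input in the CSV layout produced by bin/mysqldump_to_csv.py
printf '123,0,Q42,,0,0,0.5,20230101000000,NULL,1,100,wikibase-item,NULL\n' | \
  bin/filter_wikidata_page.py
# prints: 123,Q42
```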
52 changes: 52 additions & 0 deletions bin/filter_wikidata_wb_items_per_site.py
@@ -0,0 +1,52 @@
#!/usr/bin/env python3

'''
Input from STDIN
# MySQL schema inside the sql.gz file:
#
# CREATE TABLE `wb_items_per_site` (
# `ips_row_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
# `ips_item_id` int(10) unsigned NOT NULL,
# `ips_site_id` varbinary(32) NOT NULL,
# `ips_site_page` varbinary(310) NOT NULL,
Output to STDOUT: item_id, site_id, site_page (title)
'''

import os
import sys
import csv

def get_languages():
    with open('config/languages.txt', 'r') as file:
        languages = file.readlines()
    languages = map(lambda line: line.strip('\n'), languages)
    languages = filter(lambda line: not line.startswith('#'), languages)
    return languages

# TODO: this ignores the environment variable that might be a subset
languages_set = set(get_languages())
if 'LANGUAGES' in os.environ:
    languages_set = set(os.environ['LANGUAGES'].split(','))

# print(languages_set, file=sys.stderr)


reader = csv.DictReader(sys.stdin, fieldnames=[
    'ips_row_id',
    'ips_item_id',
    'ips_site_id',
    'ips_site_page'
])
writer = csv.DictWriter(sys.stdout, fieldnames=['item_id', 'site_id', 'title'], dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    title = row['ips_site_page'].replace('\r', '')
    if len(title) == 0:
        continue

    language = row['ips_site_id'].replace('wiki', '')
    if language not in languages_set:
        continue

    writer.writerow({'item_id': row['ips_item_id'], 'site_id': row['ips_site_id'], 'title': title})
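
A minimal sketch of the language filtering with hypothetical input rows, assuming the script is run from the repository root (so config/languages.txt can be opened) and that the LANGUAGES environment variable is used to narrow the language set:

```
# Hypothetical two-row input; with LANGUAGES=de,en only the enwiki sitelink survives
printf '1,64,enwiki,Douglas Adams\n2,64,frwiki,Douglas Adams\n' | \
  LANGUAGES=de,en bin/filter_wikidata_wb_items_per_site.py
# prints: 64,enwiki,Douglas Adams
```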
28 changes: 0 additions & 28 deletions bin/round_coordinates.py

This file was deleted.

5 changes: 0 additions & 5 deletions install_dependencies.sh
@@ -27,12 +27,7 @@ sudo -u postgres createuser -s $USER


sudo apt-get install -y wget coreutils nodejs jq moreutils pigz

# https://github.com/wireservice/csvkit
# https://csvkit.readthedocs.io
sudo apt-get install -y python3-dev python3-pip python3-setuptools build-essential
pip install csvkit
sudo ln -s ~/.local/bin/csvcut /usr/local/bin/csvcut

# https://wdtaxonomy.readthedocs.io/
sudo apt-get install -y nodejs
19 changes: 5 additions & 14 deletions steps/wikidata_sql2csv.sh
Expand Up @@ -37,10 +37,7 @@ echo "wikidata_sql2csv geo_tags"
unpigz -c $DOWNLOADED_PATH/geo_tags.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
sed 's/\r\?//g' | \
grep ',earth,1,' | \
csvcut -c 2,5,6 | \
bin/round_coordinates.py | \
bin/filter_wikidata_geo_tags.py | \
pigz -9 \
> $CONVERTED_PATH/geo_tags.csv.gz

@@ -87,11 +84,7 @@ echo "wikidata_sql2csv page"
unpigz -c $DOWNLOADED_PATH/page.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
sed 's/\r\?//g' | \
csvcut -c 1,3,2 | \
grep -e ',0$' | \
sed 's/,0$//' | \
grep ',Q' | \
bin/filter_wikidata_page.py | \
pigz -9 \
> $CONVERTED_PATH/page.csv.gz

@@ -129,17 +122,15 @@ echo "wikidata_sql2csv wb_items_per_site"
# `ips_site_page` varbinary(310) NOT NULL,

# Only considering languages we need, cuts down 80m lines to 52m
LISTLANG=${LANGUAGES_ARRAY[@]}
# LISTLANG=${LANGUAGES_ARRAY[@]}
# ar bg ca cs da de en es
LANG_E_REGEX=",\(${LISTLANG// /\\|}\)wiki,"
# LANG_E_REGEX=",\(${LISTLANG// /\\|}\)wiki,"
# ,\(ar\|bg\|ca\|cs\|da\|de\|en...\)wiki,

unpigz -c $DOWNLOADED_PATH/wb_items_per_site.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
sed 's/\x0//g' | \
sed 's/\r\?//g' | \
grep -e "$LANG_E_REGEX" | \
csvcut -c 2,3,4 | \
bin/filter_wikidata_wb_items_per_site.py | \
pigz -9 \
> $CONVERTED_PATH/wb_items_per_site.csv.gz

7 changes: 7 additions & 0 deletions tests/filter_wikidata_geo_tags.test1.txt
@@ -0,0 +1,7 @@
158103,15923968,moon,1,29.63771000,111.17787000,,,,,,,NULL
158108,5009,earth,1,25.13333300,56.33333300,,,,,,,NULL
158109,5010,earth,1,-34.35805556,18.47194444,,,,,,,NULL
158112,5018,earth,1,54.08333333,13.38333333,,,,,,,NULL
158113,5020,earth,1,48.76194444,8.24083333,,,,,,,NULL
158120,5030,earth,1,54.67638889,13.43777778,,,,,,,NULL
158124,5034,earth,1,55.92140000,-3.53665000,,,,,,,NULL
6 changes: 6 additions & 0 deletions tests/filter_wikidata_geo_tags.test1expected.txt
@@ -0,0 +1,6 @@
5009,25.13333,56.33333
5010,-34.35806,18.47194
5018,54.08333,13.38333
5020,48.76194,8.24083
5030,54.67639,13.43778
5034,55.9214,-3.53665
5 changes: 0 additions & 5 deletions tests/round_coordinates.test1.txt

This file was deleted.

4 changes: 0 additions & 4 deletions tests/round_coordinates.test1expected.txt

This file was deleted.

4 changes: 2 additions & 2 deletions tests/run.sh
@@ -1,3 +1,3 @@
#!/bin/bash
cat tests/round_coordinates.test1.txt | bin/round_coordinates.py > out.txt
diff --brief out.txt tests/round_coordinates.test1expected.txt || exit 1
cat tests/filter_wikidata_geo_tags.test1.txt | bin/filter_wikidata_geo_tags.py > out.txt
diff --brief out.txt tests/filter_wikidata_geo_tags.test1expected.txt || exit 1
