-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* replace csvcut with python scripts
- Loading branch information
Showing
12 changed files
with
200 additions
and
70 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#!/usr/bin/env python3 | ||
|
||
''' | ||
Input from STDIN | ||
# MySQL schema inside the sql.gz file: | ||
# | ||
# CREATE TABLE `geo_tags` ( | ||
# `gt_id` int(10) unsigned NOT NULL AUTO_INCREMENT, | ||
# `gt_page_id` int(10) unsigned NOT NULL, | ||
# `gt_globe` varbinary(32) NOT NULL, | ||
# `gt_primary` tinyint(1) NOT NULL, | ||
# `gt_lat` decimal(11,8) DEFAULT NULL, | ||
# `gt_lon` decimal(11,8) DEFAULT NULL, | ||
# `gt_dim` int(11) DEFAULT NULL, | ||
# `gt_type` varbinary(32) DEFAULT NULL, | ||
# `gt_name` varbinary(255) DEFAULT NULL, | ||
# `gt_country` binary(2) DEFAULT NULL, | ||
# `gt_region` varbinary(3) DEFAULT NULL, | ||
Output to STDOUT: gt_page_id, gt_lat, gt_lon | ||
''' | ||
|
||
import sys | ||
import csv | ||
|
||
# Column order of the `geo_tags` table rows as they arrive on STDIN.
GEO_TAG_FIELDS = [
    'gt_id',
    'gt_page_id',
    'gt_globe',
    'gt_primary',
    'gt_lat',
    'gt_lon',
    'gt_dim',
    'gt_type',
    'gt_name',
    'gt_country',
    'gt_region',
]


def filter_geo_tags(lines):
    """Yield 'gt_page_id,gt_lat,gt_lon' strings for usable geo_tags rows.

    `lines` is any iterable of CSV text lines (e.g. sys.stdin) whose columns
    follow GEO_TAG_FIELDS; extra trailing columns are ignored.

    A row is kept only if it is the primary coordinate of a page on earth,
    both coordinates parse as numbers, the point is not exactly 0,0 and it
    lies within -90..90 latitude and -180..180 longitude. Coordinates are
    rounded to 5 decimal places.
    """
    for row in csv.DictReader(lines, fieldnames=GEO_TAG_FIELDS):
        # There are places e.g. on the moon with coordinates
        if row['gt_globe'] != 'earth':
            continue

        if row['gt_primary'] != '1':
            continue

        # gt_lat / gt_lon are nullable in the schema, so the value can be
        # 'NULL', empty, or missing entirely (None) on a truncated row.
        # Skip such rows instead of aborting the whole run.
        try:
            lat = float(row['gt_lat'])
            lon = float(row['gt_lon'])
        except (TypeError, ValueError):
            continue

        if lat == 0 and lon == 0:
            continue

        # Written as a positive range check so that NaN (for which every
        # comparison is False) is rejected together with out-of-bounds values.
        if not (-90 <= lat <= 90 and -180 <= lon <= 180):
            continue

        lat = round(lat, 5)
        lon = round(lon, 5)

        yield row['gt_page_id'] + ',' + str(lat) + ',' + str(lon)


if __name__ == '__main__':
    for output_line in filter_geo_tags(sys.stdin):
        print(output_line)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#!/usr/bin/env python3 | ||
|
||
''' | ||
Input from STDIN | ||
# MySQL schema inside the sql.gz file: | ||
# | ||
# CREATE TABLE `page` ( | ||
# `page_id` int(10) unsigned NOT NULL AUTO_INCREMENT, | ||
# `page_namespace` int(11) NOT NULL, | ||
# `page_title` varbinary(255) NOT NULL, | ||
# `page_restrictions` tinyblob DEFAULT NULL, | ||
# `page_is_redirect` tinyint(3) unsigned NOT NULL DEFAULT 0, | ||
# `page_is_new` tinyint(3) unsigned NOT NULL DEFAULT 0, | ||
# `page_random` double unsigned NOT NULL, | ||
# `page_touched` binary(14) NOT NULL, | ||
# `page_links_updated` varbinary(14) DEFAULT NULL, | ||
# `page_latest` int(10) unsigned NOT NULL, | ||
# `page_len` int(10) unsigned NOT NULL, | ||
# `page_content_model` varbinary(32) DEFAULT NULL, | ||
# `page_lang` varbinary(35) DEFAULT NULL, | ||
# page_lang isn't interesting, 'NULL' 99.999% of the time | ||
Output to STDOUT: page_id, page_title | ||
''' | ||
|
||
import sys | ||
import csv | ||
|
||
# Column order of the `page` table rows as they arrive on STDIN.
PAGE_FIELDS = [
    'page_id',
    'page_namespace',
    'page_title',
    'page_restrictions',
    'page_is_redirect',
    'page_is_new',
    'page_random',
    'page_touched',
    'page_links_updated',
    'page_latest',
    'page_len',
    'page_content_model',
    'page_lang',
]


def filter_pages(lines):
    """Yield 'page_id,page_title' strings for article pages titled 'Q...'.

    `lines` is any iterable of CSV text lines (e.g. sys.stdin) whose columns
    follow PAGE_FIELDS. Rows outside namespace 0 and rows whose title does
    not start with 'Q' are dropped.
    """
    for row in csv.DictReader(lines, fieldnames=PAGE_FIELDS):
        # 0 are articles (99% of the input lines)
        if row['page_namespace'] != '0':
            continue

        # Some are special pages, not articles. The title can also be empty
        # or missing (None) on a truncated row; indexing [0] would raise
        # there, so test for that before looking at the prefix.
        title = row['page_title']
        if not title or not title.startswith('Q'):
            continue

        yield row['page_id'] + ',' + title


if __name__ == '__main__':
    for output_line in filter_pages(sys.stdin):
        print(output_line)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#!/usr/bin/env python3 | ||
|
||
''' | ||
Input from STDIN | ||
# MySQL schema inside the sql.gz file: | ||
# | ||
# CREATE TABLE `wb_items_per_site` ( | ||
# `ips_row_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, | ||
# `ips_item_id` int(10) unsigned NOT NULL, | ||
# `ips_site_id` varbinary(32) NOT NULL, | ||
# `ips_site_page` varbinary(310) NOT NULL, | ||
Output to STDOUT: item_id, site_id, site_page (title) | ||
''' | ||
|
||
import os | ||
import sys | ||
import csv | ||
|
||
def get_languages():
    """Return the language codes listed in config/languages.txt as a list.

    The file is read relative to the current working directory, one code per
    line. Surrounding whitespace is stripped; blank lines and lines starting
    with '#' are skipped. A blank line must not yield an empty code, because
    an empty code would match the bare site id 'wiki' once the 'wiki' part is
    removed by the caller.

    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened.
    """
    with open('config/languages.txt', 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Build a real list (not a lazy map/filter) so the result can be
        # iterated more than once and does not depend on the open file.
        return [
            line for line in stripped_lines
            if line and not line.startswith('#')
        ]
|
||
def selected_languages():
    """Return the set of language codes whose site links should be kept.

    If the LANGUAGES environment variable is set, its comma-separated value
    is used. Otherwise the list comes from get_languages(). The config file
    is only read when it is actually needed, so a LANGUAGES override works
    even where config/languages.txt is not available.
    """
    if 'LANGUAGES' in os.environ:
        return set(os.environ['LANGUAGES'].split(','))
    return set(get_languages())


def filter_site_links(lines, languages, out):
    """Write 'item_id,site_id,title' CSV rows for the wanted languages to `out`.

    `lines` is any iterable of CSV text lines (e.g. sys.stdin) with the
    columns ips_row_id, ips_item_id, ips_site_id, ips_site_page.
    `languages` is a container of language codes; a row is kept when its
    site id with 'wiki' removed is in that container.
    `out` is a writable text stream (e.g. sys.stdout). No header row is
    written; fields are quoted only where CSV requires it.
    """
    reader = csv.DictReader(lines, fieldnames=[
        'ips_row_id',
        'ips_item_id',
        'ips_site_id',
        'ips_site_page',
    ])
    writer = csv.DictWriter(
        out,
        fieldnames=['item_id', 'site_id', 'title'],
        dialect='unix',
        quoting=csv.QUOTE_MINIMAL,
    )

    for row in reader:
        site_id = row['ips_site_id']
        site_page = row['ips_site_page']
        # DictReader fills missing columns with None on truncated rows;
        # skip those instead of crashing on None.replace().
        if site_id is None or site_page is None:
            continue

        title = site_page.replace('\r', '')
        if not title:
            continue

        language = site_id.replace('wiki', '')
        if language not in languages:
            continue

        writer.writerow({
            'item_id': row['ips_item_id'],
            'site_id': site_id,
            'title': title,
        })


if __name__ == '__main__':
    filter_site_links(sys.stdin, selected_languages(), sys.stdout)
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
158103,15923968,moon,1,29.63771000,111.17787000,,,,,,,NULL | ||
158108,5009,earth,1,25.13333300,56.33333300,,,,,,,NULL | ||
158109,5010,earth,1,-34.35805556,18.47194444,,,,,,,NULL | ||
158112,5018,earth,1,54.08333333,13.38333333,,,,,,,NULL | ||
158113,5020,earth,1,48.76194444,8.24083333,,,,,,,NULL | ||
158120,5030,earth,1,54.67638889,13.43777778,,,,,,,NULL | ||
158124,5034,earth,1,55.92140000,-3.53665000,,,,,,,NULL |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
5009,25.13333,56.33333 | ||
5010,-34.35806,18.47194 | ||
5018,54.08333,13.38333 | ||
5020,48.76194,8.24083 | ||
5030,54.67639,13.43778 | ||
5034,55.9214,-3.53665 |
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
#!/bin/bash | ||
cat tests/round_coordinates.test1.txt | bin/round_coordinates.py > out.txt | ||
diff --brief out.txt tests/round_coordinates.test1expected.txt || exit 1 | ||
cat tests/filter_wikidata_geo_tags.test1.txt | bin/filter_wikidata_geo_tags.py > out.txt | ||
diff --brief out.txt tests/filter_wikidata_geo_tags.test1expected.txt || exit 1 |