Skip to content

Commit

Permalink
new lib/languages.py, filter languages in filter_langlinks.py (#74)
Browse files Browse the repository at this point in the history
  • Loading branch information
mtmail authored Jul 21, 2023
1 parent a757a2e commit e02e96b
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 29 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
16 changes: 15 additions & 1 deletion bin/filter_langlinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,31 @@
Output to STDOUT: ll_title, ll_from_page_id, ll_lang
'''

import os
import sys

parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from lib.languages import Languages;

languages_set = set(Languages.get_languages())


# We don't need CSV parsing here because the first two columns never
# contain commas.
for line in sys.stdin:
line = line.rstrip().replace('\r', '')

columns = line.split(',', 2)

# ll_lang, e.g. 'en'
language = columns[1]
if language not in languages_set:
continue

# The langlinks table contains titles with spaces, e.g. 'one (two)', while the pages and
# pagelinkcount tables contain titles with underscores, e.g. 'one_(two)'
title = columns[2].replace(' ', '_')

print(','.join([title, columns[0], columns[1]]))
print(','.join([title, columns[0], language]))
17 changes: 6 additions & 11 deletions bin/filter_wikidata_wb_items_per_site.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,13 @@
import sys
import csv

def get_languages():
    """Return the language codes listed in ``config/languages.txt``.

    The path is relative to the current working directory. Each line of the
    file is one entry: the trailing newline is stripped and lines starting
    with ``#`` are skipped. This function does not consult the ``LANGUAGES``
    environment variable.

    Returns:
        A list of strings. A real list (rather than a one-shot ``filter``
        iterator) is returned so the result can be iterated more than once.

    Raises:
        OSError: if the config file cannot be opened.
    """
    with open('config/languages.txt', 'r') as file:
        # Only '\n' is stripped, so any other whitespace on a line is kept.
        lines = (line.strip('\n') for line in file)
        return [line for line in lines if not line.startswith('#')]

# TODO: this ignores the environment variable that might be a subset
languages_set = set(get_languages())
if 'LANGUAGES' in os.environ:
languages_set = set(os.environ['LANGUAGES'].split(','))
# Add the parent directory to sys.path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from lib.languages import Languages;

languages_set = set(Languages.get_languages())
# print(languages_set, file=sys.stderr)


Expand Down
14 changes: 14 additions & 0 deletions lib/languages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import os

class Languages:
    """Resolve the list of language codes the scripts should process."""

    @staticmethod
    def get_languages():
        """Return the configured language codes as a list of strings.

        If the ``LANGUAGES`` environment variable is set, its value is split
        on commas and returned unchanged. Otherwise the codes are read from
        ``config/languages.txt`` (relative to the current working directory),
        one entry per line, skipping lines that start with ``#``.

        Declared as a static method so it works both as
        ``Languages.get_languages()`` and on an instance.

        Raises:
            OSError: if ``LANGUAGES`` is unset and the config file cannot be
                opened.
        """
        if 'LANGUAGES' in os.environ:
            return os.environ['LANGUAGES'].split(',')

        with open('config/languages.txt', 'r') as file:
            # Only '\n' is stripped, so any other whitespace on a line is kept.
            lines = (line.strip('\n') for line in file)
            return [line for line in lines if not line.startswith('#')]
6 changes: 3 additions & 3 deletions steps/wikidata_sql2csv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ echo "wikidata_sql2csv geo_tags"
# Remove anything globe!=earth, primary!=1
# Round the coordinates
unpigz -c $DOWNLOADED_PATH/geo_tags.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
./bin/mysqldump_to_csv.py | \
bin/filter_wikidata_geo_tags.py | \
pigz -9 \
> $CONVERTED_PATH/geo_tags.csv.gz
Expand Down Expand Up @@ -81,7 +81,7 @@ echo "wikidata_sql2csv page"
# Remove all page_title that don't start with 'Q'

unpigz -c $DOWNLOADED_PATH/page.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
./bin/mysqldump_to_csv.py | \
bin/filter_wikidata_page.py | \
pigz -9 \
> $CONVERTED_PATH/page.csv.gz
Expand Down Expand Up @@ -126,7 +126,7 @@ echo "wikidata_sql2csv wb_items_per_site"
# ,\(ar\|bg\|ca\|cs\|da\|de\|en...\)wiki,

unpigz -c $DOWNLOADED_PATH/wb_items_per_site.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
./bin/mysqldump_to_csv.py | \
bin/filter_wikidata_wb_items_per_site.py | \
pigz -9 \
> $CONVERTED_PATH/wb_items_per_site.csv.gz
Expand Down
8 changes: 4 additions & 4 deletions steps/wikipedia_sql2csv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ do
# Output columns: page_id, page_title

unpigz -c $DOWNLOADED_PATH/${LANG}/page.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
./bin/mysqldump_to_csv.py | \
bin/filter_page.py | \
pigz -9 > $CONVERTED_PATH/$LANG/pages.csv.gz

Expand All @@ -58,7 +58,7 @@ do
# Output columns: pl_title, count

unpigz -c $DOWNLOADED_PATH/${LANG}/pagelinks.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
./bin/mysqldump_to_csv.py | \
bin/filter_pagelinks.py | \
pigz -9 > $CONVERTED_PATH/$LANG/pagelinks.csv.gz

Expand All @@ -76,7 +76,7 @@ do
# output 380MB compressed (1.3GB uncompressed)

unpigz -c $DOWNLOADED_PATH/${LANG}/langlinks.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
./bin/mysqldump_to_csv.py | \
bin/filter_langlinks.py | \
pigz -9 > $CONVERTED_PATH/$LANG/langlinks.csv.gz

Expand All @@ -96,7 +96,7 @@ do
# output 100MB compressed (300MB uncompressed)

unpigz -c $DOWNLOADED_PATH/${LANG}/redirect.sql.gz | \
python3 bin/mysqldump_to_csv.py | \
./bin/mysqldump_to_csv.py | \
bin/filter_redirect.py | \
pigz -9 > $CONVERTED_PATH/$LANG/redirect.csv.gz

Expand Down
10 changes: 5 additions & 5 deletions tests/filter_langlinks.test1.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
2074847,az,Berlin dövlət kitabxanası
291145,az,Berlin döyüşü (1945)
52637892,az,Berlin hücumu (2016)
494808,az,Berlin kafedralı
438617,az,Berlin konqresi
2074847,tr,Berlin dövlət kitabxanası
291145,tr,Berlin döyüşü (1945)
52637892,tr,Berlin hücumu (2016)
494808,tr,Berlin kafedralı
438617,tr,Berlin konqresi
1234,de,"Berlin, Berlin"
10 changes: 5 additions & 5 deletions tests/filter_langlinks.test1expected.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Berlin_dövlət_kitabxanası,2074847,az
Berlin_döyüşü_(1945),291145,az
Berlin_hücumu_(2016),52637892,az
Berlin_kafedralı,494808,az
Berlin_konqresi,438617,az
Berlin_dövlət_kitabxanası,2074847,tr
Berlin_döyüşü_(1945),291145,tr
Berlin_hücumu_(2016),52637892,tr
Berlin_kafedralı,494808,tr
Berlin_konqresi,438617,tr
"Berlin,_Berlin",1234,de
13 changes: 13 additions & 0 deletions tests/run.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,17 @@
#!/bin/bash

OUT=$(python3 -c'from lib.languages import Languages; print(len(Languages.get_languages()))')
if [[ "$OUT" != "39" ]]; then
echo 'expected 39'
exit 1
fi

OUT=$(LANGUAGES=de,fr,it,en python3 -c'from lib.languages import Languages; print(len(Languages.get_languages()))')
if [[ "$OUT" != "4" ]]; then
echo 'expected 4'
exit 1
fi

cat tests/filter_pagelinks.test1.txt | bin/filter_pagelinks.py > out.txt
diff --brief out.txt tests/filter_pagelinks.test1expected.txt || exit 1

Expand Down

0 comments on commit e02e96b

Please sign in to comment.