Skip to content

Commit

Permalink
Use reader/writer, not DictReader/DictWriter for CSV (#72)
Browse files Browse the repository at this point in the history
  • Loading branch information
mtmail authored Jul 21, 2023
1 parent 6812d27 commit c9e6f65
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 87 deletions.
24 changes: 5 additions & 19 deletions bin/filter_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,30 +23,16 @@
import sys
import csv

# Stream a raw MediaWiki `page` table dump from stdin and emit only
# article rows as (page_id, title) CSV on stdout.
# Input columns (`page` table): 0=page_id, 1=page_namespace, 2=page_title,
# 3=page_restrictions, 4=page_is_redirect, 5=page_is_new, 6=page_random,
# 7=page_touched, 8=page_links_updated, 9=page_latest, 10=page_len,
# 11=page_content_model, 12=page_lang.
reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # page_namespace 0 are articles; skip everything else
    if row[1] != '0':
        continue

    # page_title; strip stray carriage returns left over from the dump
    title = row[2].replace('\r', '')
    if len(title) == 0:
        continue

    # page_id, title
    writer.writerow([row[0], title])
14 changes: 7 additions & 7 deletions bin/filter_pagelinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
import sys
import csv

# Plain csv.reader/csv.writer: rows are addressed positionally.
# Input columns (`pagelinks` table): 0=pl_from, 1=pl_namespace,
# 2=pl_title, 3=pl_from_namespace.
reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)

# Similar to 'uniq -c' we look if the title repeats and print a count.
# If the file is unsorted then a title might repeat later in the output. For enwiki though
Expand All @@ -24,20 +24,20 @@
count = 0

for row in reader:
    # pl_namespace 0 are articles; skip links into other namespaces
    if row[1] != '0':
        continue

    # pl_title; strip stray carriage returns left over from the dump
    title = row[2].replace('\r', '')
    if len(title) == 0:
        continue

    # Like 'uniq -c': when the title changes, flush the count
    # accumulated for the previous title and start a new run.
    if prev_title is not None and prev_title != title:
        writer.writerow([prev_title, count])
        count = 0

    prev_title = title
    count += 1

# Flush the final run (the loop above only writes on a title change)
if prev_title is not None:
    writer.writerow([prev_title, count])
18 changes: 6 additions & 12 deletions bin/filter_redirect.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,16 @@
import sys
import csv

# Stream a raw MediaWiki `redirect` table dump from stdin and emit only
# article-namespace redirects as (id, title) CSV on stdout.
# Input columns (`redirect` table): 0=rd_from, 1=rd_namespace,
# 2=rd_title, 3=rd_interwiki, 4=rd_fragment.
reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # rd_namespace 0 are articles
    if row[1] != '0':
        continue

    # rd_title (redirect target); strip stray carriage returns
    title = row[2].replace('\r', '')
    if len(title) == 0:
        continue

    # rd_from (source page id), target title
    writer.writerow([row[0], title])
27 changes: 8 additions & 19 deletions bin/filter_wikidata_geo_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,30 +23,19 @@
import sys
import csv

# Input columns (`geo_tags` table): 0=gt_id, 1=gt_page_id, 2=gt_globe,
# 3=gt_primary, 4=gt_lat, 5=gt_lon, 6=gt_dim, 7=gt_type, 8=gt_name,
# 9=gt_country, 10=gt_region.
reader = csv.reader(sys.stdin)

for row in reader:
    # gt_globe: there are places e.g. on the moon with coordinates
    if row[2] != 'earth':
        continue

    # gt_primary: keep only the page's primary coordinate
    if row[3] != '1':
        continue

    lat = float(row[4])
    lon = float(row[5])

if (lat == 0 and lon == 0):
# print('skipping 0,0', file=sys.stderr)
Expand All @@ -61,4 +50,4 @@
lat = round(lat, 5)
lon = round(lon, 5)

print(row['gt_page_id'] + ',' + str(lat) + ',' + str(lon))
print(row[1] + ',' + str(lat) + ',' + str(lon))
26 changes: 6 additions & 20 deletions bin/filter_wikidata_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,29 +27,15 @@
import sys
import csv

# Stream the wikidata `page` table dump from stdin and emit
# (page_id, page_title) for articles whose title is a Q-id.
# Input columns (`page` table): 0=page_id, 1=page_namespace, 2=page_title,
# then page_restrictions .. page_lang (13 columns total).
reader = csv.reader(sys.stdin)

for row in reader:
    # page_namespace 0 are articles (99% of the input lines)
    if row[1] != '0':
        continue

    # page_title values are actually Q-ids; some are special pages, not
    # articles. startswith also safely rejects an empty title, where the
    # original row[2][0] would raise IndexError.
    if not row[2].startswith('Q'):
        continue

    # page_id, page_title (the Q-id)
    print(row[0] + ',' + row[2])
17 changes: 7 additions & 10 deletions bin/filter_wikidata_wb_items_per_site.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,18 @@ def get_languages():
# print(languages_set, file=sys.stderr)


# Input columns (`wb_items_per_site` table): 0=ips_row_id, 1=ips_item_id,
# 2=ips_site_id, 3=ips_site_page.
reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # ips_site_page is the page title; strip stray carriage returns
    title = row[3].replace('\r', '')
    if len(title) == 0:
        continue

    # ips_site_id is e.g. 'enwiki'; derive the language code and keep
    # only languages returned by get_languages() above
    language = row[2].replace('wiki', '')
    if language not in languages_set:
        continue

    # ips_item_id, ips_site_id, title
    writer.writerow([row[1], row[2], title])

0 comments on commit c9e6f65

Please sign in to comment.