Skip to content

Commit

Permalink
Use reader/writer, not DictReader/DictWriter for CSV (#72)
Browse files Browse the repository at this point in the history
  • Loading branch information
mtmail authored Jul 21, 2023
1 parent 6812d27 commit c9e6f65
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 87 deletions.
24 changes: 5 additions & 19 deletions bin/filter_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,30 +23,16 @@
import sys
import csv

# Stream a raw MediaWiki `page` table dump from stdin and emit only
# article rows as (page_id, title) CSV on stdout.
# Input columns (`page` table): 0=page_id, 1=page_namespace, 2=page_title,
# 3=page_restrictions, 4=page_is_redirect, 5=page_is_new, 6=page_random,
# 7=page_touched, 8=page_links_updated, 9=page_latest, 10=page_len,
# 11=page_content_model, 12=page_lang.
reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # page_namespace 0 are articles; skip everything else
    if row[1] != '0':
        continue

    # page_title; strip stray carriage returns left over from the dump
    title = row[2].replace('\r', '')
    if len(title) == 0:
        continue

    # page_id, title
    writer.writerow([row[0], title])
14 changes: 7 additions & 7 deletions bin/filter_pagelinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
import sys
import csv

# Plain csv.reader/csv.writer: rows are addressed positionally.
# Input columns (`pagelinks` table): 0=pl_from, 1=pl_namespace,
# 2=pl_title, 3=pl_from_namespace.
reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)

# Similar to 'uniq -c' we look if the title repeats and print a count.
# If the file is unsorted then a title might repeat later in the output. For enwiki though
Expand All @@ -24,20 +24,20 @@
count = 0

for row in reader:
    # pl_namespace 0 are articles; skip links into other namespaces
    if row[1] != '0':
        continue

    # pl_title; strip stray carriage returns left over from the dump
    title = row[2].replace('\r', '')
    if len(title) == 0:
        continue

    # Like 'uniq -c': when the title changes, flush the count
    # accumulated for the previous title and start a new run.
    if prev_title is not None and prev_title != title:
        writer.writerow([prev_title, count])
        count = 0

    prev_title = title
    count += 1

# Flush the final run (the loop above only writes on a title change)
if prev_title is not None:
    writer.writerow([prev_title, count])
18 changes: 6 additions & 12 deletions bin/filter_redirect.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,16 @@
import sys
import csv

# Stream a raw MediaWiki `redirect` table dump from stdin and emit only
# article-namespace redirects as (id, title) CSV on stdout.
# Input columns (`redirect` table): 0=rd_from, 1=rd_namespace,
# 2=rd_title, 3=rd_interwiki, 4=rd_fragment.
reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # rd_namespace 0 are articles
    if row[1] != '0':
        continue

    # rd_title (redirect target); strip stray carriage returns
    title = row[2].replace('\r', '')
    if len(title) == 0:
        continue

    # rd_from (source page id), target title
    writer.writerow([row[0], title])
27 changes: 8 additions & 19 deletions bin/filter_wikidata_geo_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,30 +23,19 @@
import sys
import csv

# Input columns (`geo_tags` table): 0=gt_id, 1=gt_page_id, 2=gt_globe,
# 3=gt_primary, 4=gt_lat, 5=gt_lon, 6=gt_dim, 7=gt_type, 8=gt_name,
# 9=gt_country, 10=gt_region.
reader = csv.reader(sys.stdin)

for row in reader:
    # gt_globe: there are places e.g. on the moon with coordinates
    if row[2] != 'earth':
        continue

    # gt_primary: keep only the page's primary coordinate
    if row[3] != '1':
        continue

    lat = float(row[4])
    lon = float(row[5])

if (lat == 0 and lon == 0):
# print('skipping 0,0', file=sys.stderr)
Expand All @@ -61,4 +50,4 @@
lat = round(lat, 5)
lon = round(lon, 5)

print(row['gt_page_id'] + ',' + str(lat) + ',' + str(lon))
print(row[1] + ',' + str(lat) + ',' + str(lon))
26 changes: 6 additions & 20 deletions bin/filter_wikidata_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,29 +27,15 @@
import sys
import csv

# Stream the wikidata `page` table dump from stdin and emit
# (page_id, page_title) for articles whose title is a Q-id.
# Input columns (`page` table): 0=page_id, 1=page_namespace, 2=page_title,
# then page_restrictions .. page_lang (13 columns total).
reader = csv.reader(sys.stdin)

for row in reader:
    # page_namespace 0 are articles (99% of the input lines)
    if row[1] != '0':
        continue

    # page_title values are actually Q-ids; some are special pages, not
    # articles. startswith also safely rejects an empty title, where the
    # original row[2][0] would raise IndexError.
    if not row[2].startswith('Q'):
        continue

    # page_id, page_title (the Q-id)
    print(row[0] + ',' + row[2])
17 changes: 7 additions & 10 deletions bin/filter_wikidata_wb_items_per_site.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,18 @@ def get_languages():
# print(languages_set, file=sys.stderr)


# Input columns (`wb_items_per_site` table): 0=ips_row_id, 1=ips_item_id,
# 2=ips_site_id, 3=ips_site_page.
reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL)

for row in reader:
    # ips_site_page is the page title; strip stray carriage returns
    title = row[3].replace('\r', '')
    if len(title) == 0:
        continue

    # ips_site_id is e.g. 'enwiki'; derive the language code and keep
    # only languages returned by get_languages() above
    language = row[2].replace('wiki', '')
    if language not in languages_set:
        continue

    # ips_item_id, ips_site_id, title
    writer.writerow([row[1], row[2], title])

0 comments on commit c9e6f65

Please sign in to comment.