From f798ce5d7a21dd54c8075c547245e60b87350634 Mon Sep 17 00:00:00 2001 From: mtmail Date: Sun, 5 May 2024 23:41:15 +0200 Subject: [PATCH] Add output headers (#80) * output - add header row to CSV files * documentation: make clear output is tab delimited --- README.md | 44 ++++++++++++++++++++++---------------------- steps/output.sh | 11 +++++++---- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 8dd739f..1775824 100644 --- a/README.md +++ b/README.md @@ -51,8 +51,8 @@ retries (wikidata API being unreliable) was added. ## Output data -`wikimedia_importance.csv.gz` contains about 17 million rows. Number of lines grew 2% between 2022 and 2023. The file -is sorted. +`wikimedia_importance.csv.gz` contains about 17 million rows. Number of lines grew 2% between 2022 and 2023. +The file is tab delimited, not quoted, sorted, and contains a header row. | Column | Type | | ----------- | ---------------- | @@ -89,36 +89,36 @@ Examples of `wikimedia_importance.csv.gz` rows: * Wikipedia contains redirects, so a single wikidata object can have multiple titles even though. Each title has the same importance score. Redirects to non-existing articles are removed. 
``` - en,a,Brandenburg_Gate,0.5531125195487524,Q82425 - en,r,Berlin's_Gate,0.5531125195487524,Q82425 - en,r,Brandenberg_Gate,0.5531125195487524,Q82425 - en,r,Brandenburger_gate,0.5531125195487524,Q82425 - en,r,Brandenburger_Gate,0.5531125195487524,Q82425 - en,r,Brandenburger_Tor,0.5531125195487524,Q82425 - en,r,Brandenburg_gate,0.5531125195487524,Q82425 - en,r,BRANDENBURG_GATE,0.5531125195487524,Q82425 - en,r,Brandenburg_Gates,0.5531125195487524,Q82425 - en,r,Brandenburg_Tor,0.5531125195487524,Q82425 + en a Brandenburg_Gate 0.5531125195487524 Q82425 + en r Berlin's_Gate 0.5531125195487524 Q82425 + en r Brandenberg_Gate 0.5531125195487524 Q82425 + en r Brandenburger_gate 0.5531125195487524 Q82425 + en r Brandenburger_Gate 0.5531125195487524 Q82425 + en r Brandenburger_Tor 0.5531125195487524 Q82425 + en r Brandenburg_gate 0.5531125195487524 Q82425 + en r BRANDENBURG_GATE 0.5531125195487524 Q82425 + en r Brandenburg_Gates 0.5531125195487524 Q82425 + en r Brandenburg_Tor 0.5531125195487524 Q82425 ``` * Wikipedia titles contain underscores instead of space, e.g. 
[Alford,_Massachusetts](https://en.wikipedia.org/wiki/Alford,_Massachusetts) ``` - en,a,"Alford,_Massachusetts",0.36590368314334637,Q2431901 - en,r,"Alford,_ma",0.36590368314334637,Q2431901 - en,r,"Alford,_MA",0.36590368314334637,Q2431901 - en,r,"Alford,_Mass",0.36590368314334637,Q2431901 + en a Alford,_Massachusetts 0.36590368314334637 Q2431901 + en r Alford,_ma 0.36590368314334637 Q2431901 + en r Alford,_MA 0.36590368314334637 Q2431901 + en r Alford,_Mass 0.36590368314334637 Q2431901 ``` * The highest score article is the [United States](https://en.wikipedia.org/wiki/United_States) ``` - pl,a,Stany_Zjednoczone,1,Q30 - en,a,United_States,1,Q30 - ru,a,Соединённые_Штаты_Америки,1,Q30 - hu,a,Amerikai_Egyesült_Államok,1,Q30 - it,a,Stati_Uniti_d'America,1,Q30 - de,a,Vereinigte_Staaten,1,Q30 + pl a Stany_Zjednoczone 1 Q30 + en a United_States 1 Q30 + ru a Соединённые_Штаты_Америки 1 Q30 + hu a Amerikai_Egyesült_Államok 1 Q30 + it a Stati_Uniti_d'America 1 Q30 + de a Vereinigte_Staaten 1 Q30 ... ``` diff --git a/steps/output.sh b/steps/output.sh index 80382bb..db6d5cd 100755 --- a/steps/output.sh +++ b/steps/output.sh @@ -126,10 +126,13 @@ for TABLE in wikipedia_article wikipedia_redirect wikimedia_importance do echo "* $TABLE.csv.gz" - echo "COPY $TABLE TO STDOUT" | \ - psqlcmd | \ - sort | \ - pigz -9 > "$OUTPUT_PATH/$TABLE.csv.gz" + { + echo "COPY (SELECT * FROM $TABLE LIMIT 0) TO STDOUT WITH DELIMITER E'\t' CSV HEADER" | \ + psqlcmd + echo "COPY $TABLE TO STDOUT" | \ + psqlcmd | \ + sort + } | pigz -9 > "$OUTPUT_PATH/$TABLE.csv.gz" # default is 600 chmod 644 "$OUTPUT_PATH/$TABLE.csv.gz"