diff --git a/install_dependencies.sh b/install_dependencies.sh index 0eec93c..14d395e 100755 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -7,6 +7,17 @@ sudo apt-get install -y postgresql-14 sudo -u postgres createuser -s $USER + +# No significant performance increase above 250MB +sudo -u postgres mkdir -p /etc/postgresql/14/main/conf.d/ +echo " +work_mem = 250MB +" | sudo -u postgres tee /etc/postgresql/14/main/conf.d/wikipedia.conf + +sudo systemctl restart postgresql + + + sudo apt-get install -y wget coreutils nodejs jq moreutils pigz sudo apt-get install -y python3-dev python3-pip python3-setuptools build-essential diff --git a/steps/wikipedia_process.sh b/steps/wikipedia_process.sh index a9b0530..1444bee 100755 --- a/steps/wikipedia_process.sh +++ b/steps/wikipedia_process.sh @@ -42,17 +42,19 @@ echo "=====================================================================" echo "Process language tables and associated pagelink counts" echo "=====================================================================" - - echo "set othercounts" +# Creating indexes on title, ll_title didn't have any positive effect on +# query performance and added another 1 hour and 35GB of data. +# echo "CREATE INDEX idx_${LANG}langlinks ON ${LANG}langlinks (ll_lang, ll_title);" | psqlcmd +# echo "CREATE INDEX idx_${LANG}langlinks2 ON ${LANG}langlinks (ll_title);" | psqlcmd +# echo "CREATE INDEX idx_${LANG}page ON ${LANG}page (page_id);" | psqlcmd +# echo "CREATE INDEX idx_${LANG}page2 ON ${LANG}page (page_title);" | psqlcmd for LANG in "${LANGUAGES_ARRAY[@]}" do echo "Language: $LANG" for OTHERLANG in "${LANGUAGES_ARRAY[@]}" do - # Creating indexes on title, ll_title didn't have any positive effect on - # query performance and added another 35GB of data. echo "UPDATE ${LANG}pagelinks SET othercount = othercount + x.count FROM (