Transfer PostgreSQL to BigQuery #78

---
name: Transfer PostgreSQL to BigQuery
on:
  workflow_dispatch:
    inputs:
      heroku_app:
        description: 'Heroku app [pola-app/pola-staging]'
        required: true
        default: 'pola-staging'
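  # A manual run can also be started from the command line, e.g. (assuming the
  # GitHub CLI is installed and authenticated):
  #   gh workflow run bi-transfer_pola_backend_to_bq.yml -f heroku_app=pola-app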
  pull_request:
    branches:
      - master
    paths:
      - 'pola-bi/**'
      - '.github/workflows/bi-transfer_pola_backend_to_bq.yml'
  schedule:
    # * is a special character in YAML so you have to quote this string
    # This workflow will be run at 5:12 on the first day of every month.
    - cron: '12 5 1 * *'
permissions:
  id-token: write  # This is required for requesting the JWT
  contents: read  # This is required for actions/checkout
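  # The id-token permission above lets the job request a GitHub OIDC token, which the
  # google-github-actions/auth step below exchanges for short-lived GCP credentials.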
env:
  GITHUB_REPOSITORY: ${{ github.repository }}
  GITHUB_ORGANIZATION: ${{ github.repository_owner }}
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  GCP_PROJECT_ID: pola-bi-looker
  GCP_PROJECT_NUMBER: 354540873199
  GCP_REGION: europe-west3
  GCP_BUCKET_NAME: pola-app_pola-backend_postgres_csv-files
  GCP_IDENTITY_POOL: github
  GCP_IDENTITY_PROVIDER: pola-backend-repo
jobs:
  deploy-bi:
    name: "Transfer PostgreSQL to BQ"
    runs-on: ubuntu-latest
    env:
      HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
    steps:
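      # Choose the Heroku app (and the matching BigQuery dataset suffix) based on how
      # the workflow was triggered; scheduled and pull request runs default to staging.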
      - name: Set dynamic job variable
        shell: python
        run: |
          import os
          from pathlib import Path

          event_name = os.environ["GITHUB_EVENT_NAME"]
          github_env_file_path = Path(os.environ["GITHUB_ENV"])

          # Scheduled and pull request runs always target staging; manual runs
          # use the heroku_app input.
          if event_name == "schedule":
              heroku_app = "pola-staging"
          elif event_name == "workflow_dispatch":
              heroku_app = os.environ["INPUT_HEROKU_APP"]
          elif event_name == "pull_request":
              heroku_app = "pola-staging"
          else:
              raise Exception(f"Unknown event: {event_name}")

          if heroku_app == "pola-app":
              env_name = "prod"
          elif heroku_app == "pola-staging":
              env_name = "staging"
          else:
              raise Exception(f"Unknown app: {heroku_app}")

          # Expose the derived values to the remaining steps via $GITHUB_ENV.
          with github_env_file_path.open("a") as file:
              file.write(f"ENV_NAME={env_name}\n")
              file.write(f"HEROKU_APP={heroku_app}\n")
              file.write(f"GCP_BIGQUERY_DATASET=pola_backend__{env_name}\n")
        env:
          INPUT_HEROKU_APP: ${{ github.event.inputs.heroku_app }}
          GITHUB_EVENT_NAME: ${{ github.event_name }}
      - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )"
        uses: actions/checkout@v4
        with:
          fetch-depth: 2
      # Use OpenID Connect to authenticate to GCP
      # For details, see:
      # https://cloud.google.com/blog/products/identity-security/enabling-keyless-authentication-from-github-actions
      - id: 'auth'
        name: 'Authenticate to GCP'
        uses: 'google-github-actions/auth@v2'
        with:
          project_id: '${{ env.GCP_PROJECT_ID }}'
          # yamllint disable-line rule:line-length
          workload_identity_provider: 'projects/${{ env.GCP_PROJECT_NUMBER }}/locations/global/workloadIdentityPools/${{ env.GCP_IDENTITY_POOL }}/providers/${{ env.GCP_IDENTITY_PROVIDER }}'
          export_environment_variables: true
          create_credentials_file: true
      - name: 'Set up Cloud SDK'
        uses: 'google-github-actions/setup-gcloud@v2'
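      # The Heroku CLI is needed in the next step to read the app's config vars.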
      - name: Install Heroku CLI
        run: |
          curl https://cli-assets.heroku.com/install.sh | sh
        env:
          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
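      # Read the Postgres connection string from the Heroku app's config and mask it so
      # it never shows up in the job log; later steps pick it up from $GITHUB_ENV.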
      - name: Retrieve PostgreSQL credentials
        run: |
          DATABASE_URL=$(heroku config:get DATABASE_URL --app "${HEROKU_APP}")
          echo "::add-mask::$DATABASE_URL"
          echo "DATABASE_URL=$DATABASE_URL" >> "$GITHUB_ENV"
      - name: "Setup Python"
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
      - name: "Install dependencies"
        run: pip install google-cloud-bigquery google-cloud-storage psycopg2-binary
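      # The transfer script is expected to export each listed table to CSV files in the
      # GCS staging bucket and then load them into the per-environment BigQuery dataset;
      # the exact behaviour is defined in pola-bi/postgres_to_bigquery.py.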
      - name: Run transfer script
        env:
          TABLE_NAMES: |
            ai_pics_aiattachment,
            ai_pics_aipics,
            bi_companies_by_query_group,
            bi_companies_with_count_group,
            bi_new_product_by_hour,
            bi_popular_not_verified_products,
            bi_product_by_time,
            bi_queries_by_time,
            bi_queries_stats_intervals,
            bi_stats_queries_uq_users_by_week,
            company_brand,
            company_company,
            gpc_brick,
            gpc_class,
            gpc_family,
            gpc_segment,
            pola_query,
            pola_searchquery,
            pola_stats,
            product_product,
            report_attachment,
            report_report,
            users_user
        run: |
          python ./pola-bi/postgres_to_bigquery.py --verbose all \
            --database-url "${DATABASE_URL}" \
            --table-names "${TABLE_NAMES}" \
            --staging-url "gs://${GCP_BUCKET_NAME}/" \
            --dataset-id "${GCP_BIGQUERY_DATASET}" \