diff --git a/catalogbuilder/scripts/gen_intake_gfdl_notebook.ipynb b/catalogbuilder/scripts/gen_intake_gfdl_notebook.ipynb deleted file mode 100644 index ef4115d..0000000 --- a/catalogbuilder/scripts/gen_intake_gfdl_notebook.ipynb +++ /dev/null @@ -1,4829 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "f39f9409-ee87-4431-9953-55607daba427", - "metadata": {}, - "source": [ - "This notebook was tested from a GFDL workstation.\n", - "This notebook is an example of using catalog builder from a notebook to generate data catalogs, a.k.a intake-esm catalogs.\n", - "\n", - "How to get here? \n", - "\n", - "Login to your workstation at GFDL.\n", - "module load python/3.9\n", - "conda activate intakebuilder \n", - "(For the above: Note that you can either install your own environment using the following or use an existing environment such as this: conda activate /nbhome/Aparna.Radhakrishnan/conda/envs/intakebuilder )\n", - "\n", - "conda create -n intakebuilder \n", - "conda install intakebuilder -c noaa-gfdl -n intakebuilder\n", - "\n", - "Now, we do a couple of things to make sure your environment is available to jupyter-lab as a kernel.\n", - "\n", - "pip install ipykernel \n", - "python -m ipykernel install --user --name=intakebuilder\n", - "\n", - "Now, start a jupyter-lab session from GFDL workstation: \n", - "\n", - "jupyter-lab \n", - "\n", - "This will give you the URL to the jupyter-lab session running on your localhost. Paste the URL in your web-browser (or via TigerVNC). Paste the notebook cells from this notebook, or locate the notebook from the path where you have downloaded or cloned it via git. Go to Kernel->Change Kernel-> Choose intakebuilder.\n", - "\n", - "Run the notebook and see the results! Extend it and share it with us via a github issue. \n" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "fb3010b8-170f-4462-ad2a-457d1d5415f7", - "metadata": {}, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "Found existing file! Overwrite? (y/n) y\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "writing..\n", - "JSON generated at: /home/a1r/mycatalog.json\n", - "CSV generated at: /home/a1r/mycatalog.csv\n" - ] - } - ], - "source": [ - "from catalogbuilder.scripts import gen_intake_gfdl\n", - "import sys,os\n", - "\n", - "######USER input begins########\n", - "\n", - "#User provides the input directory for which a data catalog needs to be generated.\n", - "#Note that depending on the date and version of the tool, only time-series data are catalogued.\n", - "\n", - "input_path = \"/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/\"\n", - "\n", - "#USER inputs the output path. Based on the following setting, user can expect to see /home/a1r/mycatalog.csv and /home/a1r/mycatalog.json generated as output.\n", - "\n", - "output_path = \"/home/a1r/mycatalog\"\n", - "\n", - "####END OF user input ##########\n", - "sys.argv = ['--INPUT_PATH', input_path, output_path]\n", - "\n", - "try:\n", - " gen_intake_gfdl.main()\n", - "except SystemExit as e:\n", - " if e.code != 0:\n", - " raise" - ] - }, - { - "cell_type": "markdown", - "id": "626eaa1f-d801-4a7d-8fad-2851c9e81070", - "metadata": {}, - "source": [ - "Let's begin our analysis" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "181913cc-4776-4b16-95d6-c6ea1b2cbdad", - "metadata": {}, - "outputs": [], - "source": [ - "import intake_esm, intake\n", - "import matplotlib #do a pip install of tools needed in your env or from the notebook\n", - "from matplotlib import pyplot as plt\n", - "%matplotlib inline\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6665a48b-a335-4fc2-8130-1a4902a428b0", - "metadata": {}, - "outputs": [], - "source": [ - "pip install matplotlib" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "0f83dbc3-3dda-4a43-82e9-fb8726b2cda8", - "metadata": {}, - "outputs": [], - "source": [ - "col_url = \"/home/a1r/mycatalog.json\"\n", - "col = intake.open_esm_datastore(col_url)" - ] - }, - { - "cell_type": "markdown", - "id": "344ada01-6716-4fbd-9cee-878ff815d7dd", - "metadata": {}, - "source": [ - "Explore the catalog" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "1ce0716e-6667-4aeb-8c4b-50a05643b87f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " | activity_id | \n", - "institution_id | \n", - "source_id | \n", - "experiment_id | \n", - "frequency | \n", - "modeling_realm | \n", - "table_id | \n", - "member_id | \n", - "grid_label | \n", - "variable_id | \n", - "temporal_subset | \n", - "chunk_freq | \n", - "grid_label.1 | \n", - "platform | \n", - "dimensions | \n", - "cell_methods | \n", - "path | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "dev | \n", - "NaN | \n", - "am5 | \n", - "c96L65_am5f3b1r0_pdclim1850F | \n", - "3hr | \n", - "atmos_cmip | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "pr | \n", - "0002010100-0002123123 | \n", - "1yr | \n", - "NaN | \n", - "gfdl.ncrc5-deploy-prod-openmp | \n", - "NaN | \n", - "ts | \n", - "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... | \n", - "
1 | \n", - "dev | \n", - "NaN | \n", - "am5 | \n", - "c96L65_am5f3b1r0_pdclim1850F | \n", - "3hr | \n", - "atmos_cmip | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "rlut | \n", - "0002010100-0002123123 | \n", - "1yr | \n", - "NaN | \n", - "gfdl.ncrc5-deploy-prod-openmp | \n", - "NaN | \n", - "ts | \n", - "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... | \n", - "
2 | \n", - "dev | \n", - "NaN | \n", - "am5 | \n", - "c96L65_am5f3b1r0_pdclim1850F | \n", - "3hr | \n", - "atmos_cmip | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "pr | \n", - "0003010100-0003123123 | \n", - "1yr | \n", - "NaN | \n", - "gfdl.ncrc5-deploy-prod-openmp | \n", - "NaN | \n", - "ts | \n", - "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... | \n", - "
3 | \n", - "dev | \n", - "NaN | \n", - "am5 | \n", - "c96L65_am5f3b1r0_pdclim1850F | \n", - "3hr | \n", - "atmos_cmip | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "rlut | \n", - "0003010100-0003123123 | \n", - "1yr | \n", - "NaN | \n", - "gfdl.ncrc5-deploy-prod-openmp | \n", - "NaN | \n", - "ts | \n", - "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... | \n", - "
4 | \n", - "dev | \n", - "NaN | \n", - "am5 | \n", - "c96L65_am5f3b1r0_pdclim1850F | \n", - "3hr | \n", - "atmos_cmip | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "pr | \n", - "0004010100-0004123123 | \n", - "1yr | \n", - "NaN | \n", - "gfdl.ncrc5-deploy-prod-openmp | \n", - "NaN | \n", - "ts | \n", - "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... | \n", - "
... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "
6405 | \n", - "dev | \n", - "NaN | \n", - "am5 | \n", - "c96L65_am5f3b1r0_pdclim1850F | \n", - "monthly | \n", - "land_cmip | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "treeFracNdlDcd | \n", - "001001-001012 | \n", - "1yr | \n", - "NaN | \n", - "gfdl.ncrc5-deploy-prod-openmp | \n", - "NaN | \n", - "ts | \n", - "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... | \n", - "
6406 | \n", - "dev | \n", - "NaN | \n", - "am5 | \n", - "c96L65_am5f3b1r0_pdclim1850F | \n", - "monthly | \n", - "land_cmip | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "treeFracNdlEvg | \n", - "001001-001012 | \n", - "1yr | \n", - "NaN | \n", - "gfdl.ncrc5-deploy-prod-openmp | \n", - "NaN | \n", - "ts | \n", - "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... | \n", - "
6407 | \n", - "dev | \n", - "NaN | \n", - "am5 | \n", - "c96L65_am5f3b1r0_pdclim1850F | \n", - "monthly | \n", - "land_cmip | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "tsl | \n", - "001001-001012 | \n", - "1yr | \n", - "NaN | \n", - "gfdl.ncrc5-deploy-prod-openmp | \n", - "NaN | \n", - "ts | \n", - "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... | \n", - "
6408 | \n", - "dev | \n", - "NaN | \n", - "am5 | \n", - "c96L65_am5f3b1r0_pdclim1850F | \n", - "monthly | \n", - "land_cmip | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "vegFrac | \n", - "001001-001012 | \n", - "1yr | \n", - "NaN | \n", - "gfdl.ncrc5-deploy-prod-openmp | \n", - "NaN | \n", - "ts | \n", - "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... | \n", - "
6409 | \n", - "dev | \n", - "NaN | \n", - "am5 | \n", - "c96L65_am5f3b1r0_pdclim1850F | \n", - "monthly | \n", - "land_cmip | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "vegHeight | \n", - "001001-001012 | \n", - "1yr | \n", - "NaN | \n", - "gfdl.ncrc5-deploy-prod-openmp | \n", - "NaN | \n", - "ts | \n", - "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pd... | \n", - "
6410 rows × 17 columns
\n", - "<xarray.Dataset> Size: 757MB\n", - "Dimensions: (time: 3650, bnds: 2, lat: 180, lon: 288)\n", - "Coordinates:\n", - " average_DT (time) timedelta64[ns] 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " average_T1 (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " average_T2 (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " * bnds (bnds) float64 16B 1.0 2.0\n", - " * lat (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n", - " lat_bnds (lat, bnds) float64 3kB dask.array<chunksize=(180, 2), meta=np.ndarray>\n", - " * lon (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n", - " lon_bnds (lon, bnds) float64 5kB dask.array<chunksize=(288, 2), meta=np.ndarray>\n", - " * time (time) object 29kB 0002-01-01 12:00:00 ... 0011-12-31 12:00:00\n", - " time_bnds (time, bnds) object 58kB dask.array<chunksize=(5, 2), meta=np.ndarray>\n", - "Data variables:\n", - " mrso (time, lat, lon) float32 757MB dask.array<chunksize=(5, 180, 288), meta=np.ndarray>\n", - "Attributes: (12/18)\n", - " title: c96L65_am5f3b1r0_pdclim1850F\n", - " grid_type: regular\n", - " grid_tile: N/A\n", - " code_release_version: 2023.01\n", - " git_hash: unknown githash\n", - " external_variables: land_area\n", - " ... ...\n", - " intake_esm_attrs:variable_id: mrso\n", - " intake_esm_attrs:chunk_freq: 1yr\n", - " intake_esm_attrs:platform: gfdl.ncrc5-deploy-prod-openmp\n", - " intake_esm_attrs:cell_methods: ts\n", - " intake_esm_attrs:_data_format_: netcdf\n", - " intake_esm_dataset_key: am5.c96L65_am5f3b1r0_pdclim1850F.daily....
<xarray.DataArray 'mrso' (time: 3650, lat: 180, lon: 288)> Size: 757MB\n", - "dask.array<concatenate, shape=(3650, 180, 288), dtype=float32, chunksize=(5, 180, 288), chunktype=numpy.ndarray>\n", - "Coordinates:\n", - " average_DT (time) timedelta64[ns] 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " average_T1 (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " average_T2 (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " * lat (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n", - " * lon (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n", - " * time (time) object 29kB 0002-01-01 12:00:00 ... 0011-12-31 12:00:00\n", - "Attributes:\n", - " units: kg m-2\n", - " long_name: Total Soil Moisture Content\n", - " cell_methods: area: mean time: mean\n", - " ocean_fillvalue: 0.0\n", - " cell_measures: area: land_area\n", - " time_avg_info: average_T1,average_T2,average_DT\n", - " standard_name: soil_moisture_content\n", - " interp_method: conserve_order1
<xarray.Dataset> Size: 757MB\n", - "Dimensions: (time: 3650, bnds: 2, lat: 180, lon: 288)\n", - "Coordinates:\n", - " average_DT (time) timedelta64[ns] 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " average_T1 (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " average_T2 (time) object 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " * bnds (bnds) float64 16B 1.0 2.0\n", - " * lat (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n", - " lat_bnds (lat, bnds) float64 3kB dask.array<chunksize=(180, 2), meta=np.ndarray>\n", - " * lon (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n", - " lon_bnds (lon, bnds) float64 5kB dask.array<chunksize=(288, 2), meta=np.ndarray>\n", - " * time (time) object 29kB 0002-01-01 12:00:00 ... 0011-12-31 12:00:00\n", - " time_bnds (time, bnds) object 58kB dask.array<chunksize=(5, 2), meta=np.ndarray>\n", - "Data variables:\n", - " mrso (time, lat, lon) float32 757MB dask.array<chunksize=(5, 180, 288), meta=np.ndarray>\n", - "Attributes: (12/18)\n", - " title: c96L65_am5f3b1r0_pdclim1850F\n", - " grid_type: regular\n", - " grid_tile: N/A\n", - " code_release_version: 2023.01\n", - " git_hash: unknown githash\n", - " external_variables: land_area\n", - " ... ...\n", - " intake_esm_attrs:variable_id: mrso\n", - " intake_esm_attrs:chunk_freq: 1yr\n", - " intake_esm_attrs:platform: gfdl.ncrc5-deploy-prod-openmp\n", - " intake_esm_attrs:cell_methods: ts\n", - " intake_esm_attrs:_data_format_: netcdf\n", - " intake_esm_dataset_key: am5.c96L65_am5f3b1r0_pdclim1850F.daily....
<xarray.DataArray 'mrso' (time: 0, lat: 180, lon: 288)> Size: 0B\n", - "dask.array<getitem, shape=(0, 180, 288), dtype=float32, chunksize=(0, 180, 288), chunktype=numpy.ndarray>\n", - "Coordinates:\n", - " average_DT (time) float64 0B dask.array<chunksize=(0,), meta=np.ndarray>\n", - " average_T1 (time) float64 0B dask.array<chunksize=(0,), meta=np.ndarray>\n", - " average_T2 (time) float64 0B dask.array<chunksize=(0,), meta=np.ndarray>\n", - " * lat (lat) float64 1kB -89.5 -88.5 -87.5 -86.5 ... 87.5 88.5 89.5\n", - " * lon (lon) float64 2kB 0.625 1.875 3.125 4.375 ... 356.9 358.1 359.4\n", - " * time (time) float64 0B \n", - "Attributes:\n", - " units: kg m-2\n", - " long_name: Total Soil Moisture Content\n", - " cell_methods: area: mean time: mean\n", - " ocean_fillvalue: 0.0\n", - " cell_measures: area: land_area\n", - " time_avg_info: average_T1,average_T2,average_DT\n", - " standard_name: soil_moisture_content\n", - " interp_method: conserve_order1
<xarray.DataArray 'time' (time: 3650)> Size: 29kB\n", - "array([ 365.5, 366.5, 367.5, ..., 4012.5, 4013.5, 4014.5])\n", - "Coordinates:\n", - " average_DT (time) float64 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " average_T1 (time) float64 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " average_T2 (time) float64 29kB dask.array<chunksize=(5,), meta=np.ndarray>\n", - " * time (time) float64 29kB 365.5 366.5 367.5 ... 4.014e+03 4.014e+03\n", - "Attributes:\n", - " units: days since 0001-01-01 00:00:00\n", - " long_name: time\n", - " axis: T\n", - " calendar_type: NOLEAP\n", - " calendar: noleap\n", - " bounds: time_bnds\n", - " cell_methods: time: mean