diff --git a/CMIP6Plus_DRS.json b/CMIP6Plus_DRS.json new file mode 100644 index 00000000..d9bc3776 --- /dev/null +++ b/CMIP6Plus_DRS.json @@ -0,0 +1,19 @@ +{ + "DRS": { + "directory_path_example": "CMIP6Plus/CMIP/MOHC/HadGEM3-GC31-MM/historical/r1i1p1f3/Amon/tas/gn/v20191207/", + "directory_path_sub_experiment_example": "CMIP6Plus/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/s1960-r1i1p1f2/Amon/tas/gn/v20200417/", + "directory_path_template": "/////////", + "filename_example": "tas_Amon_HadGEM3-GC31-MM_historical_r1i1p1f3_gn_185001-186912.nc", + "filename_sub_experiment_example": "tas_Amon_HadGEM3-GC31-MM_dcppA-hindcast_s1960-r1i1p1f2_gn_196011-196012.nc", + "filename_template": "_____[_].nc" + }, + "Header": { + "CV_collection_modified": "2022-09-05", + "CV_collection_version": "6.3.0.0", + "author": "Matt Mizielinski ", + "checksum": "md5: ebda6eafcf0aba1ed108d6051ef27662", + "institution_id": "MOHC", + "previous_commit": "To be added", + "specs_doc": "v6.3.0 (link TBC)" + } +} \ No newline at end of file diff --git a/CMIP6Plus_activity_id.json b/CMIP6Plus_activity_id.json new file mode 100644 index 00000000..c28eb73c --- /dev/null +++ b/CMIP6Plus_activity_id.json @@ -0,0 +1,14 @@ +{ + "Header": { + "CV_collection_modified": "2022-09-05", + "CV_collection_version": "6.3.0.0", + "author": "Matt Mizielinski ", + "checksum": "md5: f2ffdb7d25c0e2d29beef55fe160a936", + "institution_id": "MOHC", + "previous_commit": "To be added", + "specs_doc": "v6.3.0 (link TBC)" + }, + "activity_id": { + "CMIP": "CMIP DECK: 1pctCO2, abrupt4xCO2, amip, esm-piControl, esm-historical, historical, and piControl experiments" + } +} \ No newline at end of file diff --git a/CMIP6Plus_experiment_id.json b/CMIP6Plus_experiment_id.json new file mode 100644 index 00000000..a9c4198f --- /dev/null +++ b/CMIP6Plus_experiment_id.json @@ -0,0 +1,424 @@ +{ + "Header": { + "CV_collection_modified": "Thu Sep 1 13:47:30 2022 -0700", + "CV_collection_version": "6.2.58.34", + "author": "Paul J. 
Durack ", + "checksum": "md5: 826788727cb8c57b28bbec9029273c1a", + "experiment_id_CV_modified": "Tue Dec 15 12:25:59 2020 -0800", + "experiment_id_CV_note": "Revise experiment_id historical parent experiments", + "institution_id": "PCMDI", + "previous_commit": "9ddb8352c2f51fc999549425ea7b4648da5ccd31", + "specs_doc": "v6.2.7 (10th September 2018; https://goo.gl/v1drZl)" + }, + "experiment_id": { + "1pctCO2": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM", + "BGC" + ], + "description": "DECK: 1pctCO2", + "end_year": "", + "experiment": "1 percent per year increase in CO2", + "experiment_id": "1pctCO2", + "min_number_yrs_per_sim": "150", + "parent_activity_id": [ + "CMIP" + ], + "parent_experiment_id": [ + "piControl" + ], + "required_model_components": [ + "AOGCM" + ], + "start_year": "", + "sub_experiment_id": [ + "none" + ], + "tier": "1" + }, + "abrupt-4xCO2": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM", + "BGC" + ], + "description": "DECK: abrupt-4xCO2", + "end_year": "", + "experiment": "abrupt quadrupling of CO2", + "experiment_id": "abrupt-4xCO2", + "min_number_yrs_per_sim": "150", + "parent_activity_id": [ + "CMIP" + ], + "parent_experiment_id": [ + "piControl" + ], + "required_model_components": [ + "AOGCM" + ], + "start_year": "", + "sub_experiment_id": [ + "none" + ], + "tier": "1" + }, + "amip": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM", + "BGC" + ], + "description": "DECK: AMIP", + "end_year": "2014", + "experiment": "AMIP", + "experiment_id": "amip", + "min_number_yrs_per_sim": "36", + "parent_activity_id": [ + "no parent" + ], + "parent_experiment_id": [ + "no parent" + ], + "required_model_components": [ + "AGCM" + ], + "start_year": "1979", + "sub_experiment_id": [ + "none" + ], + "tier": "1" + }, + "esm-hist": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", 
+ "CHEM" + ], + "description": "CMIP6 historical (CO2 emission-driven)", + "end_year": "2014", + "experiment": "all-forcing simulation of the recent past with atmospheric CO2 concentration calculated", + "experiment_id": "esm-hist", + "min_number_yrs_per_sim": "165", + "parent_activity_id": [ + "CMIP" + ], + "parent_experiment_id": [ + "esm-piControl" + ], + "required_model_components": [ + "AOGCM", + "BGC" + ], + "start_year": "1850", + "sub_experiment_id": [ + "none" + ], + "tier": "1" + }, + "esm-hist-ext": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM" + ], + "description": "Extension beyond 2014 of the CMIP6 historical (CO2 emission-driven)", + "end_year": "present", + "experiment": "post-2014 all-forcing simulation with atmospheric CO2 concentration calculated", + "experiment_id": "esm-hist-ext", + "min_number_yrs_per_sim": "1", + "parent_activity_id": [ + "CMIP" + ], + "parent_experiment_id": [ + "esm-hist" + ], + "required_model_components": [ + "AOGCM", + "BGC" + ], + "start_year": "2015", + "sub_experiment_id": [ + "none" + ], + "tier": "2" + }, + "esm-piControl": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM" + ], + "description": "DECK: control (emission-driven)", + "end_year": "", + "experiment": "pre-industrial control simulation with CO2 concentration calculated", + "experiment_id": "esm-piControl", + "min_number_yrs_per_sim": "500", + "parent_activity_id": [ + "CMIP" + ], + "parent_experiment_id": [ + "esm-piControl-spinup" + ], + "required_model_components": [ + "AOGCM", + "BGC" + ], + "start_year": "", + "sub_experiment_id": [ + "none" + ], + "tier": "1" + }, + "esm-piControl-spinup": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM" + ], + "description": "DECK: spin-up portion of the control (emission-driven)", + "end_year": "", + "experiment": "pre-industrial control simulation with CO2 concentration 
calculated (spin-up)", + "experiment_id": "esm-piControl-spinup", + "min_number_yrs_per_sim": "100", + "parent_activity_id": [ + "no parent" + ], + "parent_experiment_id": [ + "no parent" + ], + "required_model_components": [ + "AOGCM", + "BGC" + ], + "start_year": "", + "sub_experiment_id": [ + "none" + ], + "tier": "2" + }, + "historical": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM", + "BGC" + ], + "description": "CMIP6 historical", + "end_year": "2014", + "experiment": "all-forcing simulation of the recent past", + "experiment_id": "historical", + "min_number_yrs_per_sim": "165", + "parent_activity_id": [ + "CMIP", + "PMIP" + ], + "parent_experiment_id": [ + "piControl", + "past1000", + "past2k" + ], + "required_model_components": [ + "AOGCM" + ], + "start_year": "1850", + "sub_experiment_id": [ + "none" + ], + "tier": "1" + }, + "historical-cmip5": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM", + "BGC" + ], + "description": "CMIP5 historical experiment, using CMIP5-era [1850-2005] forcing", + "end_year": "2005", + "experiment": "all-forcing simulation of the recent past (CMIP5-era [1850-2005] forcing)", + "experiment_id": "historical-cmip5", + "min_number_yrs_per_sim": "156", + "parent_activity_id": [ + "CMIP" + ], + "parent_experiment_id": [ + "piControl-cmip5" + ], + "required_model_components": [ + "AOGCM" + ], + "start_year": "1850", + "sub_experiment_id": [ + "none" + ], + "tier": "2" + }, + "historical-ext": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM", + "BGC" + ], + "description": "Extension beyond 2014 of the CMIP6 historical", + "end_year": "present", + "experiment": "post-2014 all-forcing simulation", + "experiment_id": "historical-ext", + "min_number_yrs_per_sim": "1", + "parent_activity_id": [ + "CMIP" + ], + "parent_experiment_id": [ + "historical" + ], + "required_model_components": [ + "AOGCM" + 
], + "start_year": "2015", + "sub_experiment_id": [ + "none" + ], + "tier": "2" + }, + "piControl": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM", + "BGC" + ], + "description": "DECK: control", + "end_year": "", + "experiment": "pre-industrial control", + "experiment_id": "piControl", + "min_number_yrs_per_sim": "500", + "parent_activity_id": [ + "CMIP" + ], + "parent_experiment_id": [ + "piControl-spinup" + ], + "required_model_components": [ + "AOGCM" + ], + "start_year": "", + "sub_experiment_id": [ + "none" + ], + "tier": "1" + }, + "piControl-cmip5": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM", + "BGC" + ], + "description": "DECK: control (CMIP5-era pre-industrial forcing)", + "end_year": "", + "experiment": "pre-industrial control (CMIP5-era [1850-2005] forcing)", + "experiment_id": "piControl-cmip5", + "min_number_yrs_per_sim": "500", + "parent_activity_id": [ + "CMIP" + ], + "parent_experiment_id": [ + "piControl-spinup-cmip5" + ], + "required_model_components": [ + "AOGCM" + ], + "start_year": "", + "sub_experiment_id": [ + "none" + ], + "tier": "2" + }, + "piControl-spinup": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM", + "BGC" + ], + "description": "DECK: spin-up portion of the control", + "end_year": "", + "experiment": "pre-industrial control (spin-up)", + "experiment_id": "piControl-spinup", + "min_number_yrs_per_sim": "100", + "parent_activity_id": [ + "no parent" + ], + "parent_experiment_id": [ + "no parent" + ], + "required_model_components": [ + "AOGCM" + ], + "start_year": "", + "sub_experiment_id": [ + "none" + ], + "tier": "2" + }, + "piControl-spinup-cmip5": { + "activity_id": [ + "CMIP" + ], + "additional_allowed_model_components": [ + "AER", + "CHEM", + "BGC" + ], + "description": "DECK: spin-up portion of the control (CMIP5-era pre-industrial forcing)", + "end_year": "", + "experiment": 
"pre-industrial control (spin-up; CMIP5-era [1850-2005] forcing)", + "experiment_id": "piControl-spinup-cmip5", + "min_number_yrs_per_sim": "100", + "parent_activity_id": [ + "CMIP" + ], + "parent_experiment_id": [ + "no parent" + ], + "required_model_components": [ + "AOGCM" + ], + "start_year": "", + "sub_experiment_id": [ + "none" + ], + "tier": "2" + } + } +} \ No newline at end of file diff --git a/CMIP6Plus_further_info_url.json b/CMIP6Plus_further_info_url.json new file mode 100644 index 00000000..d7e6c4e8 --- /dev/null +++ b/CMIP6Plus_further_info_url.json @@ -0,0 +1,14 @@ +{ + "Header": { + "CV_collection_modified": "2022-09-05", + "CV_collection_version": "6.3.0.0", + "author": "Matt Mizielinski ", + "checksum": "md5: be5d1db66b455f5e8dd02c402dd0d967", + "institution_id": "MOHC", + "previous_commit": "To be added", + "specs_doc": "v6.3.0 (link TBC)" + }, + "further_info_url": [ + "https://furtherinfo.es-doc.org/.*" + ] +} \ No newline at end of file diff --git a/CMIP6Plus_license.json b/CMIP6Plus_license.json new file mode 100644 index 00000000..11a8ea2b --- /dev/null +++ b/CMIP6Plus_license.json @@ -0,0 +1,14 @@ +{ + "Header": { + "CV_collection_modified": "2022-09-05", + "CV_collection_version": "6.3.0.0", + "author": "Matt Mizielinski ", + "checksum": "md5: 796aef0080e78a08aaea5811074435a1", + "institution_id": "MOHC", + "previous_commit": "To be added", + "specs_doc": "v6.3.0 (link TBC)" + }, + "license": [ + "^CMIP6Plus model data produced by .* is licensed under a Creative Commons .* License (https://creativecommons\\.org/.*)\\. *Consult https://pcmdi\\.llnl\\.gov/CMIP6Plus/TermsOfUse for terms of use governing CMIP6Plus output, including citation requirements and proper acknowledgment\\. *Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file).*\\. 
*The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose\\. *All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law\\.$" + ] +} \ No newline at end of file diff --git a/CMIP6Plus_mip_era.json b/CMIP6Plus_mip_era.json new file mode 100644 index 00000000..fcc6f2d8 --- /dev/null +++ b/CMIP6Plus_mip_era.json @@ -0,0 +1,12 @@ +{ + "Header": { + "CV_collection_modified": "2022-09-05", + "CV_collection_version": "6.3.0.0", + "author": "Matt Mizielinski ", + "checksum": "md5: 766d790228d83749bd1052d3bf7a3b70", + "institution_id": "MOHC", + "previous_commit": "To be added", + "specs_doc": "v6.3.0 (link TBC)" + }, + "mip_era": "CMIP6Plus" +} \ No newline at end of file diff --git a/CMIP6Plus_required_global_attributes.json b/CMIP6Plus_required_global_attributes.json new file mode 100644 index 00000000..4b52a7be --- /dev/null +++ b/CMIP6Plus_required_global_attributes.json @@ -0,0 +1,43 @@ +{ + "Header": { + "CV_collection_modified": "2022-09-05", + "CV_collection_version": "6.3.0.0", + "author": "Matt Mizielinski ", + "checksum": "md5: 15eb2dfa237bb1bdbeb23f250c5acd8e", + "institution_id": "MOHC", + "previous_commit": "To be added", + "specs_doc": "v6.3.0 (link TBC)" + }, + "required_global_attributes": [ + "Conventions", + "activity_id", + "creation_date", + "data_specs_version", + "experiment", + "experiment_id", + "forcing_index", + "frequency", + "further_info_url", + "grid", + "grid_label", + "initialization_index", + "institution", + "institution_id", + "license", + "mip_era", + "nominal_resolution", + "physics_index", + "product", + "realization_index", + "realm", + "source", + "source_id", + "source_type", + "sub_experiment", + "sub_experiment_id", + "table_id", + "tracking_id", + "variable_id", + "variant_label" + ] +} \ No newline at end of 
file diff --git a/CMIP6Plus_source_id.json b/CMIP6Plus_source_id.json new file mode 100644 index 00000000..58701acc --- /dev/null +++ b/CMIP6Plus_source_id.json @@ -0,0 +1,73 @@ +{ + "Header": { + "CV_collection_modified": "Thu Sep 1 13:47:30 2022 -0700", + "CV_collection_version": "6.2.58.34", + "author": "Paul J. Durack ", + "checksum": "md5: f9e7db244f2969c0ea02db3755759f99", + "institution_id": "PCMDI", + "previous_commit": "9ddb8352c2f51fc999549425ea7b4648da5ccd31", + "source_id_CV_modified": "Wed Aug 31 12:51:15 2022 -0700", + "source_id_CV_note": "Revised 4 IPSL* source_id license histories; tweak EC-Earth3-HR cohort", + "specs_doc": "v6.2.7 (10th September 2018; https://goo.gl/v1drZl)" + }, + "source_id": { + "HadGEM3-GC31-LL": { + "activity_participation": [ + "CMIP" + ], + "cohort": [ + "Published" + ], + "institution_id": [ + "MOHC", + "NERC" + ], + "label": "HadGEM3-GC31-LL", + "label_extended": "HadGEM3-GC3.1-N96ORCA1", + "license_info": { + "exceptions_contact": "@metoffice.gov.uk <- cmip6.hadgem3", + "history": "2017-09-21: initially published under CC BY-SA 4.0; 2021-11-15: relaxed to CC BY 4.0", + "id": "CC BY 4.0", + "license": "Creative Commons Attribution 4.0 International License (CC BY 4.0; https://creativecommons.org/licenses/by/4.0/)", + "source_specific_info": "https://ukesm.ac.uk/licensing-of-met-office-nerc-and-niwa-cmip6-data/", + "url": "https://creativecommons.org/licenses/by/4.0/" + }, + "model_component": { + "aerosol": { + "description": "UKCA-GLOMAP-mode", + "native_nominal_resolution": "250 km" + }, + "atmos": { + "description": "MetUM-HadGEM3-GA7.1 (N96; 192 x 144 longitude/latitude; 85 levels; top level 85 km)", + "native_nominal_resolution": "250 km" + }, + "atmosChem": { + "description": "none", + "native_nominal_resolution": "none" + }, + "land": { + "description": "JULES-HadGEM3-GL7.1", + "native_nominal_resolution": "250 km" + }, + "landIce": { + "description": "none", + "native_nominal_resolution": "none" + }, + "ocean": { 
+ "description": "NEMO-HadGEM3-GO6.0 (eORCA1 tripolar primarily 1 deg with meridional refinement down to 1/3 degree in the tropics; 360 x 330 longitude/latitude; 75 levels; top grid cell 0-1 m)", + "native_nominal_resolution": "100 km" + }, + "ocnBgchem": { + "description": "none", + "native_nominal_resolution": "none" + }, + "seaIce": { + "description": "CICE-HadGEM3-GSI8 (eORCA1 tripolar primarily 1 deg; 360 x 330 longitude/latitude)", + "native_nominal_resolution": "100 km" + } + }, + "release_year": "2016", + "source_id": "HadGEM3-GC31-LL" + } + } +} \ No newline at end of file diff --git a/CMIP6Plus_sub_experiment_id.json b/CMIP6Plus_sub_experiment_id.json new file mode 100644 index 00000000..28fb2cf7 --- /dev/null +++ b/CMIP6Plus_sub_experiment_id.json @@ -0,0 +1,14 @@ +{ + "Header": { + "CV_collection_modified": "2022-09-05", + "CV_collection_version": "6.3.0.0", + "author": "Matt Mizielinski ", + "checksum": "md5: b841ece401fa5f0d5e6bfdbf6dc2e970", + "institution_id": "MOHC", + "previous_commit": "To be added", + "specs_doc": "v6.3.0 (link TBC)" + }, + "sub_experiment_id": { + "none": "none" + } +} \ No newline at end of file diff --git a/CMIP6Plus_tracking_id.json b/CMIP6Plus_tracking_id.json new file mode 100644 index 00000000..6dd18483 --- /dev/null +++ b/CMIP6Plus_tracking_id.json @@ -0,0 +1,14 @@ +{ + "Header": { + "CV_collection_modified": "2022-09-05", + "CV_collection_version": "6.3.0.0", + "author": "Matt Mizielinski ", + "checksum": "md5: f26f4d1c70643d47eced87583272f081", + "institution_id": "MOHC", + "previous_commit": "To be added", + "specs_doc": "v6.3.0 (link TBC)" + }, + "tracking_id": [ + "hdl:21.14100/.*" + ] +} \ No newline at end of file diff --git a/src/Deconstruction.ipynb b/src/Deconstruction.ipynb new file mode 100644 index 00000000..8e62f255 --- /dev/null +++ b/src/Deconstruction.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5e0008d3-0f30-4fa8-8ae7-d7f73c188fbd", + "metadata": {}, + "source": [ + "# 
Deconstruct CMIP6Plus_CV into per field controlled vocabularies" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bed247f5-8cca-417c-b930-58852e3f02e4", + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "import json\n", + "from copy import deepcopy\n", + "# Tools for calculating MD5 Checksums\n", + "from cv_checksums import calculate_checksum\n" + ] + }, + { + "cell_type": "markdown", + "id": "5eef2199-985c-4769-8182-bda7431858ba", + "metadata": {}, + "source": [ + "Load CV example file from mip-cmor-tables" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bec1b12c-cafa-4632-b46f-a69d5433295c", + "metadata": {}, + "outputs": [], + "source": [ + "project_cv_file = '../../mip-cmor-tables/Tables/CMIP6Plus_CV.json'\n", + "with open(project_cv_file) as fhandle:\n", + " project_cv_data = json.load(fhandle)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5d044af4-0a8f-4315-a31d-e0adb47cea69", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['DRS', 'activity_id', 'experiment_id', 'further_info_url', 'license', 'mip_era', 'required_global_attributes', 'source_id', 'sub_experiment_id', 'tracking_id', 'version_metadata'])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project_cv_data.keys()\n" + ] + }, + { + "cell_type": "markdown", + "id": "06bf643e-630f-4346-9576-e3773feed3fd", + "metadata": {}, + "source": [ + "Split out each field into a separate JSON file and add its own `version_metadata` entry" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4402341f-bf29-4aa1-8921-8fd352e593d0", + "metadata": {}, + "outputs": [], + "source": [ + "version_metadata = project_cv_data['version_metadata']\n", + "mip_era = project_cv_data['mip_era']\n", + "output_file_location = '..'\n", + "for field in project_cv_data:\n", + " if field == 'version_metadata':\n", + " continue\n", 
+ " output_file_name = os.path.join(output_file_location, \n", + " '{}_{}.json'.format(mip_era, field))\n", + " output_file_data = {field: deepcopy(project_cv_data[field])}\n", + " output_file_data['version_metadata'] = version_metadata\n", + " \n", + " calculate_checksum(output_file_data)\n", + " with open(output_file_name, 'w') as fhandle:\n", + " json.dump(output_file_data, fhandle, indent=2, sort_keys=True)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "47756929-1c6a-4c9a-bc08-d3bdea963050", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CMIP6Plus_DRS.json\t\t\t CMIP6Plus_source_id.json\n", + "CMIP6Plus_activity_id.json\t\t CMIP6Plus_sub_experiment_id.json\n", + "CMIP6Plus_experiment_id.json\t\t CMIP6Plus_tracking_id.json\n", + "CMIP6Plus_further_info_url.json\t\t LICENSE\n", + "CMIP6Plus_license.json\t\t\t README.md\n", + "CMIP6Plus_mip_era.json\t\t\t src\n", + "CMIP6Plus_required_global_attributes.json\n" + ] + } + ], + "source": [ + "!ls .." 
"""
Rebuild the combined ``<mip_era>_CV.json`` file from the per-field CV files.

This block holds the checksum helpers from ``src/cv_checksums.py`` followed
by the construction script from ``src/construct_cmip6plus_cv.py``.  The
helpers are defined here directly, so the script part no longer needs
``from cv_checksums import calculate_checksum, validate_checksum``.
"""
from copy import copy, deepcopy  # noqa: F401 -- ``copy`` was imported by the original cv_checksums module
import glob
import hashlib
import json
import os
import sys


# ---------------------------------------------------------------------------
# Tools for adding and validating checksums (src/cv_checksums.py)
# ---------------------------------------------------------------------------

def calculate_checksum(dictionary, overwrite=True, checksum_location='version_metadata'):
    """
    Calculate the checksum for ``dictionary`` and store it under
    ``dictionary[checksum_location]['checksum']``.

    The checksum is computed over the whole dictionary with any existing
    ``checksum`` entry removed, so the stored value never feeds into itself.

    Parameters
    ----------
    dictionary: dict
        The dictionary to set the checksum for (modified in place).
    overwrite: bool
        Overwrite the existing checksum (default True).
    checksum_location: str
        Key of the sub-dictionary that holds the checksum.

    Raises
    ------
    KeyError
        If ``checksum_location`` is not a key of ``dictionary``.
    RuntimeError
        If the ``checksum`` key already exists and ``overwrite`` is
        False.
    """
    metadata = dictionary[checksum_location]
    if 'checksum' in metadata:
        if not overwrite:
            raise RuntimeError('Checksum already exists.')
        # The old value must be gone before hashing the dictionary.
        del metadata['checksum']
    metadata['checksum'] = _checksum(dictionary)


def validate_checksum(dictionary, checksum_location='version_metadata'):
    """
    Validate the checksum in the ``dictionary``.

    ``dictionary`` itself is left untouched; the checksum is recomputed on
    a deep copy with the ``checksum`` entry removed.

    Parameters
    ----------
    dictionary: dict
        The dictionary containing the ``checksum`` to validate.
    checksum_location: str
        Key of the sub-dictionary that holds the checksum.

    Raises
    ------
    KeyError
        If the ``checksum`` key does not exist.
    RuntimeError
        If the ``checksum`` value is invalid.
    """
    if 'checksum' not in dictionary[checksum_location]:
        raise KeyError('No checksum to validate')
    recorded = dictionary[checksum_location]['checksum']
    dictionary_copy = deepcopy(dictionary)
    del dictionary_copy[checksum_location]['checksum']
    calculated = _checksum(dictionary_copy)
    if recorded != calculated:
        msg = ('Expected checksum   "{}"\n'
               'Calculated checksum "{}"').format(recorded, calculated)
        raise RuntimeError(msg)


def _checksum(obj):
    """Return ``'md5: <hex digest>'`` of ``obj`` serialised as key-sorted JSON."""
    obj_str = json.dumps(obj, sort_keys=True)
    checksum_hex = hashlib.md5(obj_str.encode('utf8')).hexdigest()
    return 'md5: {}'.format(checksum_hex)


# ---------------------------------------------------------------------------
# CV construction (src/construct_cmip6plus_cv.py)
# ---------------------------------------------------------------------------

def get_mip_era(cv_directory):
    """
    Read the MIP era from the single ``*_mip_era.json`` file in
    ``cv_directory`` after validating that file's checksum.

    Raises
    ------
    RuntimeError
        If there is not exactly one ``*_mip_era.json`` file, or if its
        checksum does not validate.
    """
    filenames = sorted(glob.glob(os.path.join(cv_directory, '*_mip_era.json')))
    if len(filenames) != 1:
        # Covers both "none found" and "several found".
        raise RuntimeError(
            'Expected exactly one mip era file in "{}", found {}: {}'.format(
                cv_directory, len(filenames), repr(filenames)))

    with open(filenames[0]) as fhandle:
        mip_era_data = json.load(fhandle)

    validate_checksum(mip_era_data)

    return mip_era_data['mip_era']


def main(argv=None):
    """
    Combine the per-field ``<mip_era>_*.json`` files found in a directory
    into ``<mip_era>_CV.json`` in that same directory.

    Parameters
    ----------
    argv: list of str, optional
        Command line arguments (without the program name); the first one
        is the directory to process.  Defaults to ``sys.argv[1:]``.

    Raises
    ------
    RuntimeError
        If no directory is given, the directory does not exist, fewer than
        two matching JSON files are found, or none of them is a per-field
        file.
    """
    # lazy, should replace with argparse
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        raise RuntimeError('Usage: construct_cmip6plus_cv.py <directory containing CV files>')
    location = args[0]
    if not os.path.exists(location):
        raise RuntimeError('Location "{}" not found'.format(location))
    # get MIP era
    mip_era = get_mip_era(location)
    # look for CV files; sorted because glob returns them in arbitrary order
    # and the last per-field file supplies the version metadata below
    json_files_in_location = sorted(
        glob.glob(os.path.join(location, '{}*.json'.format(mip_era))))

    if not len(json_files_in_location) > 1:
        raise RuntimeError(
            'Could not find more than one JSON file in location "{}"'.format(location))

    output_dictionary = {}
    checksums = {}
    version_metadata = None
    for filename in json_files_in_location:
        with open(filename) as fhandle:
            file_data = json.load(fhandle)
        basename = os.path.basename(filename)

        if filename.endswith('_CV.json'):
            # A previously built CV file: remember its checksum only.
            checksums['previous {}'.format(basename)] = file_data['version_metadata']['checksum']
            continue

        # Record the checksum of the file as stored, before any stripping.
        checksums[basename] = file_data['version_metadata']['checksum']
        version_metadata = file_data['version_metadata']

        if 'experiment_id' in file_data:
            strip_experiment_id_info(file_data)
        elif 'source_id' in file_data:
            strip_source_id_info(file_data)

        for entry in file_data:
            if entry != 'version_metadata':
                output_dictionary[entry] = file_data[entry]

    if version_metadata is None:
        raise RuntimeError(
            'No per-field CV files found in location "{}"'.format(location))

    # add version_metadata from the last per-field file -- may need to be
    # updated depending on processes.  Copied so the loaded data is not
    # modified when the file checksums are attached.
    output_dictionary['version_metadata'] = deepcopy(version_metadata)
    output_dictionary['version_metadata']['file_checksums'] = checksums
    calculate_checksum(output_dictionary)

    output_file_name = os.path.join(location, '{}_CV.json'.format(mip_era))
    with open(output_file_name, 'w') as fhandle:
        json.dump(output_dictionary, fhandle, indent=2, sort_keys=True)


# Per-experiment fields that are kept in the combined CV file.
EXPERIMENT_ID_FIELDS_FOR_CV_FILE = [
    'activity_id', 'additional_allowed_model_components', 'experiment', 'experiment_id',
    'parent_activity_id', 'parent_experiment_id', 'required_model_components', 'sub_experiment_id',
]


def strip_experiment_id_info(experiment_id_file_data):
    """
    Remove, in place, every field of every experiment that is not listed
    in ``EXPERIMENT_ID_FIELDS_FOR_CV_FILE``.
    """
    for info in experiment_id_file_data['experiment_id'].values():
        # Collect first: a dict cannot be resized while iterating over it.
        fields_to_delete = [i for i in info if i not in EXPERIMENT_ID_FIELDS_FOR_CV_FILE]
        for field in fields_to_delete:
            del info[field]


# Per-source fields that are kept in the combined CV file ("source" is
# generated by strip_source_id_info).
SOURCE_ID_FIELDS_FOR_CV_FILE = [
    "activity_participation", "cohort", "institution_id", "license_info", "source", "source_id"
]


def strip_source_id_info(source_id_file_data):
    """
    Replace, in place, each source entry with a reduced copy holding only
    the fields in ``SOURCE_ID_FIELDS_FOR_CV_FILE``.

    The ``source`` field is built from the entry's ``label``,
    ``release_year`` and the ``description`` of each model component.
    """
    source_id_info = source_id_file_data['source_id']
    for source_id in list(source_id_info):
        info = source_id_info[source_id]
        component_string = '\n'.join(
            '{}: {}'.format(k, v['description']) for k, v in info['model_component'].items())
        # had problems with strange source strings here, so take a copy of the
        # dictionary, modify that and then overwrite
        new_info = deepcopy(info)
        new_info['source'] = '{} ({}): \n{}'.format(
            info['label'], info['release_year'], component_string)
        fields_to_delete = [i for i in new_info if i not in SOURCE_ID_FIELDS_FOR_CV_FILE]
        for field in fields_to_delete:
            del new_info[field]
        source_id_info[source_id] = new_info


if __name__ == '__main__':
    main()