From 6d8c496f46066a43bf7f90c5304dc08923e7b6ad Mon Sep 17 00:00:00 2001 From: Marco Braga Date: Wed, 18 Oct 2023 18:53:59 -0300 Subject: [PATCH] describe the steps to provision ARM64 clusters --- .../OCI/installing-quickly-external-arm64.md | 169 ++++++++++++++++++ .../guides/OCI/installing-quickly-external.md | 84 +++++---- .../vars/oci/profiles/ha/node-bootstrap.yaml | 8 +- .../vars/oci/profiles/ha/node-compute.yaml | 20 +-- .../oci/profiles/ha/node-controlplane.yaml | 65 +++++-- 5 files changed, 284 insertions(+), 62 deletions(-) create mode 100644 docs/guides/OCI/installing-quickly-external-arm64.md diff --git a/docs/guides/OCI/installing-quickly-external-arm64.md b/docs/guides/OCI/installing-quickly-external-arm64.md new file mode 100644 index 0000000..bf0bf00 --- /dev/null +++ b/docs/guides/OCI/installing-quickly-external-arm64.md @@ -0,0 +1,169 @@ +## Install an OCP cluster with ARM64 Arch on Oracle Cloud Infrastructure (OCI) with CCM + +Install an OCP cluster in OCI with Platform External as an option and OCI Cloud Controller Manager. 
+ +## Prerequisites + +- okd-installer Collection with [OCI dependencies installed](./oci-prerequisites.md): +- Compartment used to launch the cluster, created and exported to the variable `${OCI_COMPARTMENT_ID}` +- Compartment where the DNS zone is placed, exported to the variable `${OCI_COMPARTMENT_ID_DNS}` +- Compartment used to store the RHCOS image, exported to the variable `${OCI_COMPARTMENT_ID_IMAGE}` + +Example: + +```bash +cat <<EOF > ~/.oci/env +# Compartment where the cluster will be installed +OCI_COMPARTMENT_ID="" + +# Compartment where the DNS Zone is created (base domain) +OCI_COMPARTMENT_ID_DNS="" + +# Compartment where the OS Image will be created +OCI_COMPARTMENT_ID_IMAGE="" +EOF +source ~/.oci/env +``` + +## Setup with Platform External type and CCM + +Create the vars file for okd-installer collection: + +```bash +# MCO patch without revendor (w/o disabling FG) +CLUSTER_NAME=oci-e414rc2arm1usash1 +VARS_FILE=./vars-oci-ha_${CLUSTER_NAME}.yaml + +cat <<EOF > ${VARS_FILE} +provider: oci +cluster_name: ${CLUSTER_NAME} +config_cluster_region: us-ashburn-1 + +cluster_profile: ha +destroy_bootstrap: no + +#config_base_domain: splat-oci.devcluster.openshift.com +config_base_domain: us-ashburn-1.splat-oci.devcluster.openshift.com + +config_ssh_key: "$(cat ~/.ssh/openshift-dev.pub)" +config_pull_secret_file: "${HOME}/.openshift/pull-secret-latest.json" + +config_cluster_version: 4.14.0-rc.2 +version: 4.14.0-rc.2 + +config_platform: external +config_platform_spec: '{"platformName":"oci"}' + +oci_ccm_namespace: oci-cloud-controller-manager +oci_compartment_id: ${OCI_COMPARTMENT_ID} +oci_compartment_id_dns: ${OCI_COMPARTMENT_ID_DNS} +oci_compartment_id_image: ${OCI_COMPARTMENT_ID_IMAGE} + +# Available manifest patches (runs after 'create manifest' stage) +config_patches: +- rm-capi-machines +- mc_varlibetcd +- mc-kubelet-providerid +- deploy-oci-ccm +#- deploy-oci-csi + +# MachineConfig to set the Kubelet environment. 
Will use this script to discover the ProviderID +cfg_patch_kubelet_providerid_script: | + PROVIDERID=\$(curl -H "Authorization: Bearer Oracle" -sL http://169.254.169.254/opc/v2/instance/ | jq -r .id); + +# spread nodes between "AZs" +oci_availability_domains: +- gzqB:US-ASHBURN-AD-1 +- gzqB:US-ASHBURN-AD-2 +- gzqB:US-ASHBURN-AD-3 + +oci_fault_domains: +- FAULT-DOMAIN-1 +- FAULT-DOMAIN-2 +- FAULT-DOMAIN-3 + +# OCI config for ARM64 +config_default_architecture: arm64 +compute_shape: "VM.Standard.A1.Flex" +controlplane_shape: "VM.Standard.A1.Flex" +bootstrap_instance: "VM.Standard.A1.Flex" + +# Define the OS Image mirror +os_mirror: yes +os_mirror_from: stream_artifacts +os_mirror_stream: + architecture: aarch64 + artifact: openstack + format: qcow2.gz + +os_mirror_to_provider: oci +os_mirror_to_oci: + compartment_id: ${OCI_COMPARTMENT_ID_IMAGE} + bucket: rhcos-images + image_type: QCOW2 + # not supported yet, must be added for arm64 + # https://oci-ansible-collection.readthedocs.io/en/latest/collections/oracle/oci/oci_compute_image_shape_compatibility_entry_module.html#ansible-collections-oracle-oci-oci-compute-image-shape-compatibility-entry-module + compatibility_shapes: + - name: VM.Standard.A1.Flex + memory_constraints: + min_in_gbs: 4 + max_in_gbs: 128 + ocpu_constraints: + min: 2 + max: 32 +EOF +``` + +## Install the cluster + +```bash +ansible-playbook mtulio.okd_installer.create_all \ + -e cert_max_retries=30 \ + -e cert_wait_interval_sec=60 \ + -e @$VARS_FILE +``` + +### Approve certificates + +Export `KUBECONFIG`: + +```bash +export KUBECONFIG=$HOME/.ansible/okd-installer/clusters/${CLUSTER_NAME}/auth/kubeconfig +``` + +Check and Approve the certificates: +```bash +oc get csr \ + -o go-template='{{range .items}}{{if not .status}}{{.metadata.name}}{{"\n"}}{{end}}{{end}}' \ + | xargs oc adm certificate approve +``` + +Check if the nodes joined to the cluster: + +```bash +oc get nodes +``` + +## Testing + +Setup the test environment (internal registry, 
labeling and taint worker node, etc): + +```bash +test_node=$(oc get nodes -l node-role.kubernetes.io/worker='' -o jsonpath='{.items[0].metadata.name}') +oc label node $test_node node-role.kubernetes.io/tests="" +oc adm taint node $test_node node-role.kubernetes.io/tests="":NoSchedule +``` + +Run the tests: + +```bash +./opct run -w &&\ + ./opct retrieve &&\ + ./opct report *.tar.gz --save-to /tmp/results --server-skip +``` + +## Destroy the cluster + +```bash +ansible-playbook mtulio.okd_installer.destroy_cluster -e @$VARS_FILE +``` \ No newline at end of file diff --git a/docs/guides/OCI/installing-quickly-external.md b/docs/guides/OCI/installing-quickly-external.md index 9d3b873..bdfa230 100644 --- a/docs/guides/OCI/installing-quickly-external.md +++ b/docs/guides/OCI/installing-quickly-external.md @@ -5,13 +5,11 @@ Install an OCP cluster in OCI with Platform External as an option and OCI Cloud ## Prerequisites - okd-installer Collection with [OCI dependencies installed](./oci-prerequisites.md): -- Compartments used to create the cluster created and exported to variable `${}` -- DNS Zone place the DNS zone and exported to variable `${}` -- Compartment used to store the RHCOS image exported to variable `${}` +- Compartment used to launch the cluster, created and exported to the variable `${OCI_COMPARTMENT_ID}` +- Compartment where the DNS zone is placed, exported to the variable `${OCI_COMPARTMENT_ID_DNS}` +- Compartment used to store the RHCOS image, exported to the variable `${OCI_COMPARTMENT_ID_IMAGE}` -## Setup with Platform External type and CCM - -Create the vars file for okd-installer collection: +Example: ```bash cat <<EOF > ~/.oci/env @@ -25,9 +23,15 @@ OCI_COMPARTMENT_ID_DNS="" OCI_COMPARTMENT_ID_IMAGE="" EOF source ~/.oci/env +``` +## Setup with Platform External type and CCM + +Create the vars file for okd-installer collection: + +```bash # MCO patch without revendor (w/o disabling FG) -CLUSTER_NAME=oci-e414rc0 +CLUSTER_NAME=oci-e414rc2 
VARS_FILE=./vars-oci-ha_${CLUSTER_NAME}.yaml cat <<EOF > ${VARS_FILE} @@ -43,41 +47,24 @@ cluster_profile: ha destroy_bootstrap: no config_base_domain: splat-oci.devcluster.openshift.com + config_ssh_key: "$(cat ~/.ssh/openshift-dev.pub)" config_pull_secret_file: "${HOME}/.openshift/pull-secret-latest.json" -config_cluster_version: 4.14.0-rc.0 -version: 4.14.0-rc.0 - -# Define the OS Image mirror -os_mirror: yes -os_mirror_from: stream_artifacts -os_mirror_stream: - architecture: x86_64 - artifact: openstack - format: qcow2.gz - -os_mirror_to_provider: oci -os_mirror_to_oci: - compartment_id: ${OCI_COMPARTMENT_ID_IMAGE} - bucket: rhcos-images - image_type: QCOW2 - -EOF - - -# Platform External setup only -cat <<EOF >> ${VARS_FILE} +config_cluster_version: 4.14.0-rc.2 +version: 4.14.0-rc.2 +# Platform External setup config_platform: external config_platform_spec: '{"platformName":"oci"}' # Available manifest paches (runs after 'create manifest' stage) config_patches: - rm-capi-machines +- mc_varlibetcd - mc-kubelet-providerid - deploy-oci-ccm -- deploy-oci-csi +#- deploy-oci-csi # MachineConfig to set the Kubelet environment. 
Will use this script to discover the ProviderID cfg_patch_kubelet_providerid_script: | @@ -85,9 +72,33 @@ cfg_patch_kubelet_providerid_script: | oci_ccm_namespace: oci-cloud-controller-manager +# Define the OS Image mirror +os_mirror: yes +os_mirror_from: stream_artifacts +os_mirror_stream: + architecture: x86_64 + artifact: openstack + format: qcow2.gz + +os_mirror_to_provider: oci +os_mirror_to_oci: + compartment_id: ${OCI_COMPARTMENT_ID_IMAGE} + bucket: rhcos-images + image_type: QCOW2 + +# Experimental: increase the boot volume performance +# controlplane_source_details: +# source_type: image +# boot_volume_size_in_gbs: 1200 +# boot_volume_vpus_per_gb: 120 + +# Mount control plane as a second volume +# cfg_patch_mc_varlibetcd: +# device_path: /dev/sdb EOF ``` + ## Install the cluster ```bash @@ -99,12 +110,25 @@ ansible-playbook mtulio.okd_installer.create_all \ ### Approve certificates +Export `KUBECONFIG`: + +```bash +export KUBECONFIG=$HOME/.ansible/okd-installer/clusters/${CLUSTER_NAME}/auth/kubeconfig +``` + +Check and Approve the certificates: ```bash oc get csr \ -o go-template='{{range .items}}{{if not .status}}{{.metadata.name}}{{"\n"}}{{end}}{{end}}' \ | xargs oc adm certificate approve ``` +Check if the nodes joined to the cluster: + +```bash +oc get nodes +``` + ## Testing Setup the test environment (internal registry, labeling and taint worker node, etc): diff --git a/playbooks/vars/oci/profiles/ha/node-bootstrap.yaml b/playbooks/vars/oci/profiles/ha/node-bootstrap.yaml index 2841235..df7ba28 100644 --- a/playbooks/vars/oci/profiles/ha/node-bootstrap.yaml +++ b/playbooks/vars/oci/profiles/ha/node-bootstrap.yaml @@ -5,7 +5,7 @@ _cluster_prefix: "{{ cluster_state.infra_id }}" bootstrap_bucket: "{{ _cluster_prefix }}-infra" # Vars used on Machine/Compute Stack -_instance_type: "{{ bootstrap_instance | d('m6i.xlarge') }}" +_instance_type: "{{ bootstrap_instance | d('VM.Standard.E4.Flex') }}" _instance_profile: "{{ 
cluster_state.compute.iam_profile_bootstrap }}" # _image_id: "{{ custom_image_id | d(cluster_state.compute.image_id) }}" _image_id: "{{ custom_image_id }}" @@ -16,6 +16,8 @@ _machine_suffix: '' ## User Data template userdata_config_source: "{{ bootstrap_bucket_signed_url }}" +default_availability_domain: "gzqB:US-ASHBURN-AD-1" + ## Common vars used in the Stack vars # _common: # prefix: "{{ _cluster_prefix }}-bootstrap" @@ -72,10 +74,10 @@ compute_resources: region: "{{ config_cluster_region }}" #freeform_tags: {'Department': 'Finance'} #defined_tags: {'Operations': {'CostCenter': 'US'}} - availability_domain: "gzqB:US-SANJOSE-1-AD-1" + availability_domain: "{{ default_availability_domain }}" # platform_config: # type: AMD_VM - shape: "VM.Standard.E4.Flex" + shape: "{{ _instance_type }}" shape_config: ocpus: 4 memory_in_gbs: 16 diff --git a/playbooks/vars/oci/profiles/ha/node-compute.yaml b/playbooks/vars/oci/profiles/ha/node-compute.yaml index 5a4ddc1..6087ab1 100644 --- a/playbooks/vars/oci/profiles/ha/node-compute.yaml +++ b/playbooks/vars/oci/profiles/ha/node-compute.yaml @@ -10,10 +10,10 @@ _shape_config_default: # Uncomment if you want to run the nodes in the same FD #node_compute_single_fault_domain: FAULT-DOMAIN-1 -_compute_fault_domains: # it will be used by index: worker-1 uses index 0... 
- - FAULT-DOMAIN-1 - - FAULT-DOMAIN-2 - - FAULT-DOMAIN-3 +default_availability_domain: "gzqB:US-ASHBURN-AD-1" +default_fault_domain: FAULT-DOMAIN-1 +_compute_availability_domain: "{{ oci_availability_domains }}" +_compute_fault_domains: "{{ oci_fault_domains }}" _shape: "{{ compute_shape | d('VM.Standard.E4.Flex') }}" _shape_config: "{{ compute_shape_config | d(_shape_config_default) }}" @@ -54,8 +54,8 @@ compute_resources: region: "{{ config_cluster_region }}" #freeform_tags: {'Department': 'Finance'} #defined_tags: {'Operations': {'CostCenter': 'US'}} - availability_domain: "gzqB:US-SANJOSE-1-AD-1" - fault_domain: "{{ _compute_fault_domains[0] | d('FAULT-DOMAIN-1') }}" + availability_domain: "{{ _compute_availability_domain[0] | d(default_availability_domain) }}" + fault_domain: "{{ _compute_fault_domains[0] | d(default_fault_domain) }}" # platform_config: # type: AMD_VM @@ -105,8 +105,8 @@ compute_resources: region: "{{ config_cluster_region }}" #freeform_tags: {'Department': 'Finance'} #defined_tags: {'Operations': {'CostCenter': 'US'}} - availability_domain: "gzqB:US-SANJOSE-1-AD-1" - fault_domain: "{{ _compute_fault_domains[1] | d('FAULT-DOMAIN-2') }}" + availability_domain: "{{ _compute_availability_domain[1] | d(default_availability_domain) }}" + fault_domain: "{{ _compute_fault_domains[1] | d(default_fault_domain) }}" # platform_config: # type: AMD_VM @@ -156,8 +156,8 @@ compute_resources: region: "{{ config_cluster_region }}" #freeform_tags: {'Department': 'Finance'} #defined_tags: {'Operations': {'CostCenter': 'US'}} - availability_domain: "gzqB:US-SANJOSE-1-AD-1" - fault_domain: "{{ _compute_fault_domains[2] | d('FAULT-DOMAIN-3') }}" + availability_domain: "{{ _compute_availability_domain[2] | d(default_availability_domain) }}" + fault_domain: "{{ _compute_fault_domains[2] | d(default_fault_domain) }}" # platform_config: # type: AMD_VM diff --git a/playbooks/vars/oci/profiles/ha/node-controlplane.yaml 
b/playbooks/vars/oci/profiles/ha/node-controlplane.yaml index c9668bd..afbe836 100644 --- a/playbooks/vars/oci/profiles/ha/node-controlplane.yaml +++ b/playbooks/vars/oci/profiles/ha/node-controlplane.yaml @@ -4,9 +4,9 @@ # Defaults used in thie file node_controlplane_userdata_path: "{{ config_install_dir }}/master.ign" -_platform_config: - type: AMD_VM -_shape: "VM.Standard.E4.Flex" +# _platform_config: +# type: AMD_VM +_shape: "{{ controlplane_shape | d('VM.Standard.E4.Flex') }}" _shape_config: ocpus: 4 memory_in_gbs: 16 @@ -15,20 +15,20 @@ _shape_config: # Uncomment if you want to run the nodes in the same FD #node_controlplane_single_fault_domain: "FAULT-DOMAIN-1" -_controlplane_fault_domains: # it will be used by index: worker-1 uses index 0... - - FAULT-DOMAIN-1 - - FAULT-DOMAIN-2 - - FAULT-DOMAIN-3 +default_availability_domain: "gzqB:US-ASHBURN-AD-1" +default_fault_domain: FAULT-DOMAIN-1 +_controlplane_availability_domain: "{{ oci_availability_domains }}" +_controlplane_fault_domains: "{{ oci_fault_domains }}" _agent_config: are_all_plugins_disabled: true _source_details: source_type: image - boot_volume_size_in_gbs: 120 # VPU/GB # https://docs.oracle.com/en-us/iaas/Content/Block/Concepts/blockvolumeperformance.htm - boot_volume_vpus_per_gb: 90 + boot_volume_size_in_gbs: 512 + boot_volume_vpus_per_gb: 60 # Callbacks used to register the instances _callbacks: @@ -67,14 +67,14 @@ compute_resources: region: "{{ config_cluster_region }}" #freeform_tags: {'Department': 'Finance'} #defined_tags: {'Operations': {'CostCenter': 'US'}} - availability_domain: "gzqB:US-SANJOSE-1-AD-1" - fault_domain: "{{ _controlplane_fault_domains[0] | d('FAULT-DOMAIN-1') }}" + availability_domain: "{{ _controlplane_availability_domain[0] | d(default_availability_domain) }}" + fault_domain: "{{ _controlplane_fault_domains[0] | d(default_fault_domain) }}" # platform_config: "{{ _platform_config }}" shape: "{{ _shape }}" shape_config: "{{ _shape_config }}" agent_config: "{{ _agent_config 
}}" - source_details: "{{ _source_details }}" + source_details: "{{ controlplane_source_details | d(_source_details) }}" create_vnic_details: display_name: "{{ cluster_state.infra_id }}-master-01-vnic0" @@ -84,10 +84,37 @@ compute_resources: metadata: user_data: "{{ lookup('file', node_controlplane_userdata_path) | b64encode }}" + # Extra volumes + # https://oci-ansible-collection.readthedocs.io/en/latest/collections/oracle/oci/oci_blockstorage_volume_module.html#ansible-collections-oracle-oci-oci-blockstorage-volume-module + # oracle.oci.oci_compute_volume_attachment + # volume_attachment_spec: + # device: /dev/sdb + # display_name: master-01-etcd-attc + # #instance_id + # is_read_only: no + # is_shareable: no + # type: service_determined + # #volume_id + # # oracle.oci.oci_blockstorage_volume + # blockstorage_volume_spec: + # # required + # #compartment_id: "ocid1.compartment.oc1..xxxxxxEXAMPLExxxxxx" + # # optional + # availability_domain: "{{ _controlplane_availability_domain[0] | d(default_availability_domain) }}" + # # source_details: + # # # required + # # type: blockVolumeReplica + # # id: "ocid1.resource.oc1..xxxxxxEXAMPLExxxxxx" + # display_name: master-01-etcd + # vpus_per_gb: 60 + # size_in_gbs: 60 + # is_auto_tune_enabled: true + + + ## attachments https://oci-ansible-collection.readthedocs.io/en/latest/collections/oracle/oci/oci_compute_volume_attachment_module.html#ansible-collections-oracle-oci-oci-compute-volume-attachment-module # Register the instance using callbacks callbacks: "{{ _callbacks }}" - # # Node role: controlplane # Node: master-02 @@ -113,14 +140,14 @@ compute_resources: region: "{{ config_cluster_region }}" #freeform_tags: {'Department': 'Finance'} #defined_tags: {'Operations': {'CostCenter': 'US'}} - availability_domain: "gzqB:US-SANJOSE-1-AD-1" - fault_domain: "{{ _controlplane_fault_domains[1] | d('FAULT-DOMAIN-2') }}" + availability_domain: "{{ _controlplane_availability_domain[1] | d(default_availability_domain) }}" + fault_domain: "{{ 
_controlplane_fault_domains[1] | d(default_fault_domain) }}" # platform_config: "{{ _platform_config }}" shape: "{{ _shape }}" shape_config: "{{ _shape_config }}" agent_config: "{{ _agent_config }}" - source_details: "{{ _source_details }}" + source_details: "{{ controlplane_source_details | d(_source_details) }}" create_vnic_details: display_name: "{{ cluster_state.infra_id }}-master-02-vnic0" @@ -158,14 +185,14 @@ compute_resources: region: "{{ config_cluster_region }}" #freeform_tags: {'Department': 'Finance'} #defined_tags: {'Operations': {'CostCenter': 'US'}} - availability_domain: "gzqB:US-SANJOSE-1-AD-1" - fault_domain: "{{ _controlplane_fault_domains[2] | d('FAULT-DOMAIN-3') }}" + availability_domain: "{{ _controlplane_availability_domain[2] | d(default_availability_domain) }}" + fault_domain: "{{ _controlplane_fault_domains[2] | d(default_fault_domain) }}" # platform_config: "{{ _platform_config }}" shape: "{{ _shape }}" shape_config: "{{ _shape_config }}" agent_config: "{{ _agent_config }}" - source_details: "{{ _source_details }}" + source_details: "{{ controlplane_source_details | d(_source_details) }}" create_vnic_details: display_name: "{{ cluster_state.infra_id }}-master-03-vnic0"