Skip to content

Commit

Permalink
Merge pull request #1028 from yejingxin/master
Browse files Browse the repository at this point in the history
Add terraform output and maintenance interval feature
  • Loading branch information
yejingxin authored Aug 25, 2023
2 parents e85cb0d + dddf7cb commit dee3dbb
Show file tree
Hide file tree
Showing 10 changed files with 96 additions and 21 deletions.
2 changes: 2 additions & 0 deletions tools/kubernetes/terraform/examples/v4/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ variable "project_id" {}
# Inputs forwarded unchanged to the "tpu-gke" module call below.
variable "resource_name_prefix" {}
variable "region" {}
variable "tpu_node_pools" {}

# Maintenance interval forwarded to the "tpu-gke" module call below.
# A default is supplied so the example plans without an interactive prompt
# when terraform.tfvars does not set this value.
variable "maintenance_interval" {
  type    = string
  default = "AS_NEEDED"
}


module "tpu-gke" {
Expand All @@ -10,4 +11,5 @@ module "tpu-gke" {
resource_name_prefix = var.resource_name_prefix
region = var.region
tpu_node_pools = var.tpu_node_pools
maintenance_interval = var.maintenance_interval
}
24 changes: 24 additions & 0 deletions tools/kubernetes/terraform/examples/v4/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# --- Values echoed back from this example's own input variables ---

output "region" {
  description = "GCloud Region"
  value       = var.region
}

output "project_id" {
  description = "GCloud Project ID"
  value       = var.project_id
}

# --- Values re-exported from the "tpu-gke" module ---

output "kubernetes_cluster_name" {
  description = "GKE Cluster Name"
  value       = module.tpu-gke.kubernetes_cluster_name
}

output "kubernetes_cluster_host" {
  description = "GKE Cluster Host"
  value       = module.tpu-gke.kubernetes_cluster_host
}

output "nodepool_tpu_topology" {
  description = "GKE TPU topology"
  value       = module.tpu-gke.nodepool_tpu_topology
}
16 changes: 3 additions & 13 deletions tools/kubernetes/terraform/examples/v4/terraform.tfvars
Original file line number Diff line number Diff line change
@@ -1,19 +1,9 @@
project_id = "tpu-prod-env-multipod"
resource_name_prefix = "yejingxin"
project_id = "project-id"
resource_name_prefix = "tpu-test"
region = "us-central2"
tpu_node_pools = [{
zone = "us-central2-b"
node_count = 4
machine_type = "ct4p-hightpu-4t"
topology = "2x2x4"
}, {
zone = "us-central2-b"
node_count = 4
machine_type = "ct4p-hightpu-4t"
topology = "2x2x4"
}, {
zone = "us-central2-b"
node_count = 2
machine_type = "ct4p-hightpu-4t"
topology = "2x2x2"
}]
}]
2 changes: 2 additions & 0 deletions tools/kubernetes/terraform/examples/v5e/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ variable "project_id" {}
# Inputs forwarded unchanged to the "tpu-gke" module call below.
variable "resource_name_prefix" {}
variable "region" {}
variable "tpu_node_pools" {}

# Maintenance interval forwarded to the "tpu-gke" module call below.
# The default only applies when terraform.tfvars omits the value, so the
# example still plans without an interactive prompt in that case.
variable "maintenance_interval" {
  type    = string
  default = "AS_NEEDED"
}


module "tpu-gke" {
Expand All @@ -10,4 +11,5 @@ module "tpu-gke" {
resource_name_prefix = var.resource_name_prefix
region = var.region
tpu_node_pools = var.tpu_node_pools
maintenance_interval = var.maintenance_interval
}
24 changes: 24 additions & 0 deletions tools/kubernetes/terraform/examples/v5e/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# --- Values echoed back from this example's own input variables ---

output "region" {
  description = "GCloud Region"
  value       = var.region
}

output "project_id" {
  description = "GCloud Project ID"
  value       = var.project_id
}

# --- Values re-exported from the "tpu-gke" module ---

output "kubernetes_cluster_name" {
  description = "GKE Cluster Name"
  value       = module.tpu-gke.kubernetes_cluster_name
}

output "kubernetes_cluster_host" {
  description = "GKE Cluster Host"
  value       = module.tpu-gke.kubernetes_cluster_host
}

output "nodepool_tpu_topology" {
  description = "GKE TPU topology"
  value       = module.tpu-gke.nodepool_tpu_topology
}
9 changes: 5 additions & 4 deletions tools/kubernetes/terraform/examples/v5e/terraform.tfvars
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
project_id = "tpu-burn-in-prod-env-multipod"
project_id = "project-id"
resource_name_prefix = "tpu-v5lite-test"
region = "us-east5"
tpu_node_pools = [{
Expand All @@ -8,7 +8,8 @@ tpu_node_pools = [{
topology = "2x2"
}, {
zone = "us-east5-b"
node_count = 4
node_count = 1
machine_type = "ct5lp-hightpu-4t"
topology = "4x4"
}]
topology = "2x2"
}]
maintenance_interval = "PERIODIC"
4 changes: 3 additions & 1 deletion tools/kubernetes/terraform/module/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ resource "google_container_node_pool" "multihost_tpu" {
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
]

host_maintenance_policy {
maintenance_interval = var.maintenance_interval
}
labels = {
env = var.project_id
}
Expand Down
24 changes: 24 additions & 0 deletions tools/kubernetes/terraform/module/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# --- Values echoed back from the module's input variables ---

output "region" {
  description = "GCloud Region"
  value       = var.region
}

output "project_id" {
  description = "GCloud Project ID"
  value       = var.project_id
}

# --- Attributes of the google_container_cluster.tpu_cluster resource ---

output "kubernetes_cluster_name" {
  description = "GKE Cluster Name"
  value       = google_container_cluster.tpu_cluster.name
}

output "kubernetes_cluster_host" {
  description = "GKE Cluster Host"
  value       = google_container_cluster.tpu_cluster.endpoint
}

# Reads tpu_topology from the first placement_policy entry of every
# multihost_tpu node pool and passes the collected values through flatten().
output "nodepool_tpu_topology" {
  description = "GKE TPU topology"
  value = flatten(
    google_container_node_pool.multihost_tpu[*].placement_policy[0].tpu_topology
  )
}
7 changes: 4 additions & 3 deletions tools/kubernetes/terraform/module/terraform.tfvars
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
project_id = "tpu-prod-env-multipod"
resource_name_prefix = "yejingxin"
project_id = "project-id"
resource_name_prefix = "tpu-test"
region = "us-central2"
tpu_node_pools = [{
zone = "us-central2-b"
Expand All @@ -16,4 +16,5 @@ tpu_node_pools = [{
node_count = 2
machine_type = "ct4p-hightpu-4t"
topology = "2x2x2"
}]
}]
maintenance_interval = "AS_NEEDED"
5 changes: 5 additions & 0 deletions tools/kubernetes/terraform/module/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,9 @@ variable "tpu_node_pools" {
machine_type = string,
topology = string,
}))
}

# Maintenance interval used for the TPU node pools; this value is assigned to
# host_maintenance_policy.maintenance_interval on the multihost_tpu node pool.
# The tfvars files in this repository use "AS_NEEDED" and "PERIODIC".
variable "maintenance_interval" {
  # Explicit type so a non-string value is rejected when variables are
  # evaluated rather than when the provider call is made.
  type        = string
  default     = "AS_NEEDED"
  description = "maintenance interval for TPU machines."
}

0 comments on commit dee3dbb

Please sign in to comment.