Skip to content

Commit

Permalink
Update cluster as regional and update ip_cidr_range, enable vpc-native traffic routing and the example for v5e.
Browse files (browse the repository at this point in the history)

PiperOrigin-RevId: 566429751
  • Loading branch information
tensorflower-gardener authored and danjan1234 committed Mar 26, 2024
1 parent 921f11f commit 9a8e8c4
Show file tree
Hide file tree
Showing 10 changed files with 122 additions and 25 deletions.
2 changes: 1 addition & 1 deletion tools/kubernetes/terraform/examples/v4/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ module "tpu-gke" {
region = var.region
tpu_node_pools = var.tpu_node_pools
maintenance_interval = var.maintenance_interval
}
}
2 changes: 1 addition & 1 deletion tools/kubernetes/terraform/examples/v4/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ output "kubernetes_cluster_host" {
output "nodepool_tpu_topology" {
value = module.tpu-gke.nodepool_tpu_topology
description = "GKE TPU topology"
}
}
2 changes: 1 addition & 1 deletion tools/kubernetes/terraform/examples/v4/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ tpu_node_pools = [{
node_count = 2
machine_type = "ct4p-hightpu-4t"
topology = "2x2x2"
}]
}]
2 changes: 1 addition & 1 deletion tools/kubernetes/terraform/examples/v5e/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ module "tpu-gke" {
region = var.region
tpu_node_pools = var.tpu_node_pools
maintenance_interval = var.maintenance_interval
}
}
6 changes: 3 additions & 3 deletions tools/kubernetes/terraform/examples/v5e/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ output "kubernetes_cluster_host" {
description = "GKE Cluster Host"
}

output "nodepool_tpu_topology" {
value = module.tpu-gke.nodepool_tpu_topology
description = "GKE TPU topology"
# Re-exports the `placement_policy_names` output of the `tpu-gke` module so it
# is visible from this example's root configuration.
output "placement_policy_names" {
value = module.tpu-gke.placement_policy_names
description = "GKE TPU Placement Policy Names"
}
100 changes: 93 additions & 7 deletions tools/kubernetes/terraform/examples/v5e/terraform.tfvars
Original file line number Diff line number Diff line change
@@ -1,15 +1,101 @@
project_id = "project-id"
resource_name_prefix = "tpu-v5lite-test"
project_id = "project_id"
resource_name_prefix = "tpu-v5e-test"
region = "us-east5"
tpu_node_pools = [{
zone = "us-east5-b"
node_count = 1
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "2x2"
topology = "16x16"
policy = "sb-compact-4a"
}, {
zone = "us-east5-b"
node_count = 1
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "2x2"
topology = "16x16"
policy = "sb-compact-4a"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4a"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4a"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4b"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4b"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4b"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4b"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4c"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4c"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4c"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4c"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4d"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4d"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4d"
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-4d"
}]
maintenance_interval = "PERIODIC"
maintenance_interval = "PERIODIC"
15 changes: 11 additions & 4 deletions tools/kubernetes/terraform/module/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ resource "google_compute_subnetwork" "subnet" {
name = "${var.resource_name_prefix}-subnet"
region = var.region
network = google_compute_network.vpc.name
ip_cidr_range = "10.10.0.0/24"
ip_cidr_range = "10.10.0.0/19"
}

resource "google_container_cluster" "tpu_cluster" {
Expand All @@ -48,6 +48,12 @@ resource "google_container_cluster" "tpu_cluster" {
# node pool and immediately delete it.
remove_default_node_pool = true
initial_node_count = 1
networking_mode = "VPC_NATIVE"
ip_allocation_policy {
cluster_ipv4_cidr_block = "/14"
services_ipv4_cidr_block = "/20"
}
default_max_pods_per_node = 50

release_channel {
channel = "UNSPECIFIED"
Expand Down Expand Up @@ -84,6 +90,7 @@ resource "google_container_node_pool" "multihost_tpu" {
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/cloud-platform",
]
host_maintenance_policy {
maintenance_interval = var.maintenance_interval
Expand All @@ -98,15 +105,15 @@ resource "google_container_node_pool" "multihost_tpu" {
enabled = true
}

image_type = "COS_CONTAINERD"
image_type = "COS_CONTAINERD"
machine_type = var.tpu_node_pools[count.index].machine_type
tags = ["gke-node"]
metadata = {
disable-legacy-endpoints = "true"
}
}
placement_policy {
type = "COMPACT"
tpu_topology = var.tpu_node_pools[count.index].topology
type = "COMPACT"
policy_name = var.tpu_node_pools[count.index].policy
}
}
10 changes: 6 additions & 4 deletions tools/kubernetes/terraform/module/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ output "kubernetes_cluster_host" {
description = "GKE Cluster Host"
}

output "nodepool_tpu_topology" {
value = flatten(google_container_node_pool.multihost_tpu[*].placement_policy[0].tpu_topology)
description = "GKE TPU topology"
}
# Collects the `policy_name` of the first `placement_policy` block of every
# `multihost_tpu` node pool (via the `[*]` splat) and flattens the result into
# a single list.
output "placement_policy_names" {
value = flatten([
google_container_node_pool.multihost_tpu[*].placement_policy[0].policy_name
])
description = "GKE TPU Placement Policy Names"
}
3 changes: 2 additions & 1 deletion tools/kubernetes/terraform/module/terraform.tfvars
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
project_id = "project-id"
resource_name_prefix = "tpu-test"
region = "us-central2"
location = "us-central2-b"
tpu_node_pools = [{
zone = "us-central2-b"
node_count = 4
Expand All @@ -17,4 +18,4 @@ tpu_node_pools = [{
machine_type = "ct4p-hightpu-4t"
topology = "2x2x2"
}]
maintenance_interval = "AS_NEEDED"
maintenance_interval = "AS_NEEDED"
5 changes: 3 additions & 2 deletions tools/kubernetes/terraform/module/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,11 @@ variable "tpu_node_pools" {
node_count = number,
machine_type = string,
topology = string,
policy = string,
}))
}

variable "maintenance_interval" {
default = "AS_NEEDED"
default = "AS_NEEDED"
description = "maintenance interval for TPU machines."
}
}

0 comments on commit 9a8e8c4

Please sign in to comment.