Skip to content

Commit

Permalink
Merge pull request #1042 from tensorflow/revert-1041-kangminx/pd-ssd-01
Browse files (browse the repository at this point in the history)
Revert "Override boot-disk type and size for v5e node"
  • Loading branch information
yejingxin authored Sep 8, 2023
2 parents 9791eed + 0d2e612 commit a1bec7a
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 20 deletions.
10 changes: 1 addition & 9 deletions tools/kubernetes/terraform/examples/v5e/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -7,31 +7,23 @@ tpu_node_pools = [{
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-1"
disk_type = "pd-balanced"
disk_size_gb = 50
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-1"
disk_type = "pd-balanced"
disk_size_gb = 50
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-1"
disk_type = "pd-balanced"
disk_size_gb = 50
}, {
zone = "us-east5-b"
node_count = 64
machine_type = "ct5lp-hightpu-4t"
topology = "16x16"
policy = "sb-compact-1"
disk_type = "pd-balanced"
disk_size_gb = 50
}]
}]
maintenance_interval = "PERIODIC"
14 changes: 6 additions & 8 deletions tools/kubernetes/terraform/module/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ resource "google_container_cluster" "tpu_cluster" {
release_channel {
channel = "UNSPECIFIED"
}

network = google_compute_network.vpc.name
subnetwork = google_compute_subnetwork.subnet.name
logging_service = "logging.googleapis.com/kubernetes"
Expand All @@ -81,7 +81,7 @@ resource "google_container_node_pool" "multihost_tpu" {
cluster = google_container_cluster.tpu_cluster.name

initial_node_count = var.tpu_node_pools[count.index].node_count

management {
auto_upgrade = false
}
Expand All @@ -104,18 +104,16 @@ resource "google_container_node_pool" "multihost_tpu" {
gcfs_config {
enabled = true
}

image_type = "COS_CONTAINERD"
image_type = "COS_CONTAINERD"
machine_type = var.tpu_node_pools[count.index].machine_type
disk_type = var.tpu_node_pools[count.index].disk_type
disk_size_gb = var.tpu_node_pools[count.index].disk_size_gb
tags = ["gke-node"]
metadata = {
disable-legacy-endpoints = "true"
}
}
placement_policy {
type = "COMPACT"
policy_name = var.tpu_node_pools[count.index].policy
type = "COMPACT"
policy_name = var.tpu_node_pools[count.index].policy
}
}
4 changes: 1 addition & 3 deletions tools/kubernetes/terraform/module/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,10 @@ variable "tpu_node_pools" {
machine_type = string,
topology = string,
policy = string,
disk_type = string,
disk_size_gb = number,
}))
}

variable "maintenance_interval" {
default = "AS_NEEDED"
default = "AS_NEEDED"
description = "maintenance interval for TPU machines."
}

0 comments on commit a1bec7a

Please sign in to comment.