From 71b26382652694603f3b47a6494a206a8faa64b2 Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Mon, 15 Sep 2025 18:20:01 +0000 Subject: [PATCH 1/3] copy of gke standard cluster --- .../README.md | 41 +++++++ .../main.tf | 103 ++++++++++++++++++ .../outputs.tf | 57 ++++++++++ .../variables.tf | 59 ++++++++++ .../versions.tf | 30 +++++ 5 files changed, 290 insertions(+) create mode 100644 examples/gke_inference_gateway_standard_cluster/README.md create mode 100644 examples/gke_inference_gateway_standard_cluster/main.tf create mode 100644 examples/gke_inference_gateway_standard_cluster/outputs.tf create mode 100644 examples/gke_inference_gateway_standard_cluster/variables.tf create mode 100644 examples/gke_inference_gateway_standard_cluster/versions.tf diff --git a/examples/gke_inference_gateway_standard_cluster/README.md b/examples/gke_inference_gateway_standard_cluster/README.md new file mode 100644 index 0000000000..926a01b23b --- /dev/null +++ b/examples/gke_inference_gateway_standard_cluster/README.md @@ -0,0 +1,41 @@ +# GKE Standard Cluster and Node Pool + +This example creates a GKE private cluster and Node Pool with beta features. +For a full example see [simple_regional_private_beta](../simple_regional_private_beta/README.md) example. + + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| cluster\_name\_suffix | A suffix to append to the default cluster name | `string` | `""` | no | +| dns\_cache | Boolean to enable / disable NodeLocal DNSCache | `bool` | `false` | no | +| gce\_pd\_csi\_driver | (Beta) Whether this cluster should enable the Google Compute Engine Persistent Disk Container Storage Interface (CSI) Driver. 
| `bool` | `false` | no | +| ip\_range\_pods | The secondary ip range to use for pods | `any` | n/a | yes | +| ip\_range\_services | The secondary ip range to use for services | `any` | n/a | yes | +| network | The VPC network to host the cluster in | `any` | n/a | yes | +| project\_id | The project ID to host the cluster in | `any` | n/a | yes | +| region | The region to host the cluster in | `any` | n/a | yes | +| service\_account | Service account to associate to the nodes in the cluster | `any` | n/a | yes | +| subnetwork | The subnetwork to host the cluster in | `any` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| addons\_config | The configuration for addons supported by GKE Autopilot. | +| ca\_certificate | The cluster ca certificate (base64 encoded) | +| cluster\_name | Cluster name | +| endpoint | The cluster endpoint | +| location | Cluster location | +| master\_version | The master Kubernetes version | +| node\_locations | Cluster node locations | +| project\_id | The project ID the cluster is in | + + + +To provision this example, run the following from within this directory: +- `terraform init` to get the plugins +- `terraform plan` to see the infrastructure plan +- `terraform apply` to apply the infrastructure build +- `terraform destroy` to destroy the built infrastructure diff --git a/examples/gke_inference_gateway_standard_cluster/main.tf b/examples/gke_inference_gateway_standard_cluster/main.tf new file mode 100644 index 0000000000..d3927472e6 --- /dev/null +++ b/examples/gke_inference_gateway_standard_cluster/main.tf @@ -0,0 +1,103 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + cluster_type = "gke-standard" + default_workload_pool = "${var.project_id}.svc.id.goog" +} + +data "google_client_config" "default" {} + +provider "kubernetes" { + host = "https://${module.gke.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(module.gke.ca_certificate) +} + +data "google_compute_subnetwork" "subnetwork" { + name = var.subnetwork + project = var.project_id + region = var.region +} + +module "gke" { + source = "terraform-google-modules/kubernetes-engine/google//modules/gke-standard-cluster" + version = "~> 38.0" + + project_id = var.project_id + name = "${local.cluster_type}-cluster${var.cluster_name_suffix}" + location = var.region + network = var.network + subnetwork = var.subnetwork + + ip_allocation_policy = { + cluster_secondary_range_name = var.ip_range_pods + services_secondary_range_name = var.ip_range_services + } + + private_cluster_config = { + enable_private_endpoint = true + enable_private_nodes = true + master_ipv4_cidr_block = "172.16.0.0/28" + master_global_access_config = { + enabled = true + } + } + + deletion_protection = false + remove_default_node_pool = true + initial_node_count = 1 + + workload_identity_config = { + workload_pool = local.default_workload_pool + } + + master_authorized_networks_config = { + cidr_blocks = [{ + cidr_block = data.google_compute_subnetwork.subnetwork.ip_cidr_range + display_name = "VPC" + }] + } + + addons_config = { + dns_cache_config = { + enabled = var.dns_cache + } + + 
gce_persistent_disk_csi_driver_config = { + enabled = var.gce_pd_csi_driver + } + } +} + +module "node_pool" { + source = "terraform-google-modules/kubernetes-engine/google//modules/gke-node-pool" + version = "~> 38.0" + + project_id = var.project_id + location = var.region + cluster = module.gke.cluster_name + node_config = { + disk_size_gb = 100 + disk_type = "pd-standard" + image_type = "COS_CONTAINERD" + machine_type = "e2-medium" + service_account = var.service_account + workload_metadata_config = { + mode = "GKE_METADATA" + } + } +} diff --git a/examples/gke_inference_gateway_standard_cluster/outputs.tf b/examples/gke_inference_gateway_standard_cluster/outputs.tf new file mode 100644 index 0000000000..7f97dde8a9 --- /dev/null +++ b/examples/gke_inference_gateway_standard_cluster/outputs.tf @@ -0,0 +1,57 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+output "endpoint" {
+  sensitive   = true
+  description = "The cluster endpoint"
+  value       = module.gke.endpoint
+}
+
+output "ca_certificate" {
+  sensitive   = true
+  description = "The cluster ca certificate (base64 encoded)"
+  value       = module.gke.ca_certificate
+}
+
+output "project_id" {
+  description = "The project ID the cluster is in"
+  value       = var.project_id
+}
+
+output "location" {
+  description = "Cluster location"
+  value       = module.gke.location
+}
+
+output "node_locations" {
+  description = "Cluster node locations"
+  value       = module.gke.node_locations
+}
+
+output "addons_config" {
+  description = "The configuration for addons enabled on the cluster."
+  value       = module.gke.addons_config
+}
+
+output "cluster_name" {
+  description = "Cluster name"
+  value       = module.gke.cluster_name
+}
+
+output "master_version" {
+  description = "The master Kubernetes version"
+  value       = module.gke.master_version
+}
diff --git a/examples/gke_inference_gateway_standard_cluster/variables.tf b/examples/gke_inference_gateway_standard_cluster/variables.tf
new file mode 100644
index 0000000000..c7f8ebcf64
--- /dev/null
+++ b/examples/gke_inference_gateway_standard_cluster/variables.tf
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +variable "project_id" { + description = "The project ID to host the cluster in" +} + +variable "cluster_name_suffix" { + description = "A suffix to append to the default cluster name" + default = "" +} + +variable "region" { + description = "The region to host the cluster in" +} + +variable "network" { + description = "The VPC network to host the cluster in" +} + +variable "subnetwork" { + description = "The subnetwork to host the cluster in" +} + +variable "ip_range_pods" { + description = "The secondary ip range to use for pods" +} + +variable "ip_range_services" { + description = "The secondary ip range to use for services" +} + +variable "service_account" { + description = "Service account to associate to the nodes in the cluster" +} + +variable "dns_cache" { + description = "Boolean to enable / disable NodeLocal DNSCache " + default = false +} + +variable "gce_pd_csi_driver" { + type = bool + description = "(Beta) Whether this cluster should enable the Google Compute Engine Persistent Disk Container Storage Interface (CSI) Driver." + default = false +} diff --git a/examples/gke_inference_gateway_standard_cluster/versions.tf b/examples/gke_inference_gateway_standard_cluster/versions.tf new file mode 100644 index 0000000000..220cbfdb31 --- /dev/null +++ b/examples/gke_inference_gateway_standard_cluster/versions.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +terraform { + required_version = ">= 1.3" + required_providers { + google = { + source = "hashicorp/google" + } + google-beta = { + source = "hashicorp/google-beta" + } + kubernetes = { + source = "hashicorp/kubernetes" + } + } +} From 8ebbc560153f00d0649cfb896e552524df360d03 Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Tue, 16 Sep 2025 19:05:32 +0000 Subject: [PATCH 2/3] update to ga release --- .../README.md | 121 +++-- .../main.tf | 473 +++++++++++++++++- .../variables.tf | 45 +- 3 files changed, 564 insertions(+), 75 deletions(-) diff --git a/examples/gke_inference_gateway_standard_cluster/README.md b/examples/gke_inference_gateway_standard_cluster/README.md index 926a01b23b..777ff237a4 100644 --- a/examples/gke_inference_gateway_standard_cluster/README.md +++ b/examples/gke_inference_gateway_standard_cluster/README.md @@ -1,41 +1,80 @@ -# GKE Standard Cluster and Node Pool - -This example creates a GKE private cluster and Node Pool with beta features. -For a full example see [simple_regional_private_beta](../simple_regional_private_beta/README.md) example. - - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| cluster\_name\_suffix | A suffix to append to the default cluster name | `string` | `""` | no | -| dns\_cache | Boolean to enable / disable NodeLocal DNSCache | `bool` | `false` | no | -| gce\_pd\_csi\_driver | (Beta) Whether this cluster should enable the Google Compute Engine Persistent Disk Container Storage Interface (CSI) Driver. 
| `bool` | `false` | no | -| ip\_range\_pods | The secondary ip range to use for pods | `any` | n/a | yes | -| ip\_range\_services | The secondary ip range to use for services | `any` | n/a | yes | -| network | The VPC network to host the cluster in | `any` | n/a | yes | -| project\_id | The project ID to host the cluster in | `any` | n/a | yes | -| region | The region to host the cluster in | `any` | n/a | yes | -| service\_account | Service account to associate to the nodes in the cluster | `any` | n/a | yes | -| subnetwork | The subnetwork to host the cluster in | `any` | n/a | yes | - -## Outputs - -| Name | Description | -|------|-------------| -| addons\_config | The configuration for addons supported by GKE Autopilot. | -| ca\_certificate | The cluster ca certificate (base64 encoded) | -| cluster\_name | Cluster name | -| endpoint | The cluster endpoint | -| location | Cluster location | -| master\_version | The master Kubernetes version | -| node\_locations | Cluster node locations | -| project\_id | The project ID the cluster is in | - - - -To provision this example, run the following from within this directory: -- `terraform init` to get the plugins -- `terraform plan` to see the infrastructure plan -- `terraform apply` to apply the infrastructure build -- `terraform destroy` to destroy the built infrastructure +# GKE Inference Gateway Example + +This example provisions a GKE Standard cluster and a node pool with H100 GPUs, suitable for deploying and serving Large Language Models (LLMs) using the GKE Inference Gateway. + +The cluster is configured with: +- GKE Gateway API enabled. +- Managed Prometheus for monitoring. +- DCGM for GPU monitoring. +- A dedicated node pool with NVIDIA H100 80GB GPUs. + +This Terraform script automates the deployment of all necessary Kubernetes resources, including: +- Authorization for metrics scraping. +- A vLLM model server for a Llama3.1 model. +- GKE Inference Gateway CRDs. 
+- GKE Inference Gateway resources (`InferencePool`, `InferenceObjective`, `Gateway`, `HTTPRoute`).
+
+## Usage
+
+1. **Enable APIs**
+
+   ```bash
+   gcloud services enable container.googleapis.com
+   ```
+
+2. **Set up your environment**
+
+   Terraform reads variables from `TF_VAR_`-prefixed environment variables; values for the remaining variables in `variables.tf` (for example `network` and `subnetwork`) can be provided in a `terraform.tfvars` file.
+
+   ```bash
+   export TF_VAR_project_id="your-project-id"
+   export TF_VAR_region="us-central1"
+   export TF_VAR_cluster_name_suffix="-inference"
+   export TF_VAR_hf_token="your-hugging-face-token"
+   ```
+
+3. **Run Terraform**
+
+   The `terraform apply` command will provision the GKE cluster and deploy all the necessary Kubernetes resources.
+
+   ```bash
+   terraform init
+   terraform apply
+   ```
+
+4. **Configure kubectl**
+
+   After the apply is complete, configure `kubectl` to communicate with your new cluster.
+
+   ```bash
+   gcloud container clusters get-credentials $(terraform output -raw cluster_name) --region $(terraform output -raw location)
+   ```
+
+5. **Send an inference request**
+
+   Get the Gateway IP address:
+   ```bash
+   IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+   PORT=80
+   ```
+
+   Send a request:
+   ```bash
+   curl -i -X POST http://${IP}:${PORT}/v1/completions \
+   -H "Content-Type: application/json" \
+   -d
+   {
+     "model": "food-review",
+     "prompt": "What is a good recipe for a chicken curry?",
+     "max_tokens": 100,
+     "temperature": "0.7"
+   }
+   ```
+
+## Cleanup
+
+Running `terraform destroy` will deprovision the GKE cluster and all associated Kubernetes resources.
+ +```bash +terraform destroy +``` \ No newline at end of file diff --git a/examples/gke_inference_gateway_standard_cluster/main.tf b/examples/gke_inference_gateway_standard_cluster/main.tf index d3927472e6..baba283c16 100644 --- a/examples/gke_inference_gateway_standard_cluster/main.tf +++ b/examples/gke_inference_gateway_standard_cluster/main.tf @@ -27,6 +27,14 @@ provider "kubernetes" { cluster_ca_certificate = base64decode(module.gke.ca_certificate) } +provider "helm" { + kubernetes { + host = "https://${module.gke.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(module.gke.ca_certificate) + } +} + data "google_compute_subnetwork" "subnetwork" { name = var.subnetwork project = var.project_id @@ -35,44 +43,40 @@ data "google_compute_subnetwork" "subnetwork" { module "gke" { source = "terraform-google-modules/kubernetes-engine/google//modules/gke-standard-cluster" - version = "~> 38.0" + version = "~> 39.0" project_id = var.project_id name = "${local.cluster_type}-cluster${var.cluster_name_suffix}" location = var.region network = var.network subnetwork = var.subnetwork + release_channel = "RAPID" + gateway_api_config = { + channel = "CHANNEL_STANDARD" + } + monitoring_config = { + enable_managed_prometheus = true + enabled_components = ["SYSTEM_COMPONENTS", "DCGM"] + } + logging_service = "logging.googleapis.com/kubernetes" + ip_allocation_policy = { cluster_secondary_range_name = var.ip_range_pods services_secondary_range_name = var.ip_range_services } - private_cluster_config = { - enable_private_endpoint = true - enable_private_nodes = true - master_ipv4_cidr_block = "172.16.0.0/28" - master_global_access_config = { - enabled = true - } - } - deletion_protection = false - remove_default_node_pool = true - initial_node_count = 1 + remove_default_node_pool = false workload_identity_config = { workload_pool = local.default_workload_pool } - master_authorized_networks_config = { - cidr_blocks = [{ - 
cidr_block = data.google_compute_subnetwork.subnetwork.ip_cidr_range - display_name = "VPC" - }] - } - addons_config = { + http_load_balancing = { + enabled = true + } dns_cache_config = { enabled = var.dns_cache } @@ -81,23 +85,446 @@ module "gke" { enabled = var.gce_pd_csi_driver } } + enable_shielded_nodes = true } module "node_pool" { source = "terraform-google-modules/kubernetes-engine/google//modules/gke-node-pool" - version = "~> 38.0" + version = "~> 39.0" project_id = var.project_id - location = var.region + location = var.zone cluster = module.gke.cluster_name + name = "gpupool" + node_count = 1 + node_config = { - disk_size_gb = 100 + disk_size_gb = 200 disk_type = "pd-standard" image_type = "COS_CONTAINERD" - machine_type = "e2-medium" + machine_type = "a3-highgpu-2g" service_account = var.service_account + guest_accelerator = { + type = "nvidia-h100-80gb" + count = 2 + } + gpu_driver_installation_config = { + gpu_driver_version = "LATEST" + } workload_metadata_config = { mode = "GKE_METADATA" } } } + +resource "kubernetes_secret" "hf_secret" { + metadata { + name = "hf-token" + } + data = { + token = var.hf_token + } + type = "Opaque" +} + +resource "kubernetes_config_map" "vllm_adapters" { + metadata { + name = "vllm-llama3.1-8b-adapters" + } + data = { + "configmap.yaml" = <<-EOT + vLLMLoRAConfig: + name: vllm-llama3.1-8b-instruct + port: 8000 + defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + ensureExist: + models: + - id: food-review + source: Kawon/llama3.1-food-finetune_v14_r8 + - id: cad-fabricator + source: redcathode/fabricator + EOT + } +} + +resource "kubernetes_deployment" "vllm" { + metadata { + name = "vllm-llama3.1-8b-instruct" + } + spec { + replicas = 3 + selector { + match_labels = { + app = "vllm-llama3.1-8b-instruct" + } + } + template { + metadata { + labels = { + app = "vllm-llama3.1-8b-instruct" + } + } + spec { + termination_grace_period_seconds = 130 + enable_service_links = false + container { + name = "vllm" + image = 
"vllm/vllm-openai:latest" + image_pull_policy = "Always" + command = ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args = [ + "--model", "meta-llama/Llama-3.1-8B-Instruct", + "--tensor-parallel-size", "1", + "--port", "8000", + "--enable-lora", + "--max-loras", "2", + "--max-cpu-loras", "12" + ] + port { + container_port = 8000 + name = "http" + protocol = "TCP" + } + env { + name = "VLLM_USE_V1" + value = "1" + } + env { + name = "PORT" + value = "8000" + } + env { + name = "HUGGING_FACE_HUB_TOKEN" + value_from { + secret_key_ref { + name = kubernetes_secret.hf_secret.metadata[0].name + key = "token" + } + } + } + env { + name = "VLLM_ALLOW_RUNTIME_LORA_UPDATING" + value = "true" + } + lifecycle { + pre_stop { + exec { + command = ["/bin/sh", "-c", "sleep 30"] + } + } + } + resources { + limits = { + "nvidia.com/gpu" = 1 + } + requests = { + "nvidia.com/gpu" = 1 + } + } + liveness_probe { + http_get { + path = "/health" + port = "http" + scheme = "HTTP" + } + period_seconds = 1 + success_threshold = 1 + failure_threshold = 5 + timeout_seconds = 1 + } + readiness_probe { + http_get { + path = "/health" + port = "http" + scheme = "HTTP" + } + period_seconds = 1 + success_threshold = 1 + failure_threshold = 1 + timeout_seconds = 1 + } + startup_probe { + http_get { + path = "/health" + port = "http" + scheme = "HTTP" + } + failure_threshold = 600 + initial_delay_seconds = 2 + period_seconds = 1 + } + volume_mount { + mount_path = "/data" + name = "data" + } + volume_mount { + mount_path = "/dev/shm" + name = "shm" + } + volume_mount { + mount_path = "/adapters" + name = "adapters" + } + } + container { + name = "lora-adapter-syncer" + image = "us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main" + image_pull_policy = "Always" + env { + name = "DYNAMIC_LORA_ROLLOUT_CONFIG" + value = "/config/configmap.yaml" + } + volume_mount { + name = "config-volume" + mount_path = "/config" + } + } + volume { + name = "data" + 
empty_dir {} + } + volume { + name = "shm" + empty_dir { + medium = "Memory" + } + } + volume { + name = "adapters" + empty_dir {} + } + volume { + name = "config-volume" + config_map { + name = kubernetes_config_map.vllm_adapters.metadata[0].name + } + } + node_selector = { + "cloud.google.com/gke-accelerator" = "nvidia-h100-80gb" + } + } + } + } +} + +resource "null_resource" "apply_crds" { + provisioner "local-exec" { + command = "kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.0.0/manifests.yaml" + } + depends_on = [module.gke] +} + +resource "kubernetes_cluster_role" "metrics_reader" { + metadata { + name = "inference-gateway-metrics-reader" + } + rule { + non_resource_urls = ["/metrics"] + verbs = ["get"] + } +} + +resource "kubernetes_service_account" "metrics_reader" { + metadata { + name = "inference-gateway-sa-metrics-reader" + namespace = "default" + } +} + +resource "kubernetes_cluster_role_binding" "metrics_reader" { + metadata { + name = "inference-gateway-sa-metrics-reader-role-binding" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role.metrics_reader.metadata[0].name + } + subject { + kind = "ServiceAccount" + name = kubernetes_service_account.metrics_reader.metadata[0].name + namespace = "default" + } +} + +resource "kubernetes_secret" "metrics_reader_token" { + metadata { + name = "inference-gateway-sa-metrics-reader-secret" + namespace = "default" + annotations = { + "kubernetes.io/service-account.name" = kubernetes_service_account.metrics_reader.metadata[0].name + } + } + type = "kubernetes.io/service-account-token" +} + +resource "kubernetes_cluster_role" "secret_reader" { + metadata { + name = "inference-gateway-sa-metrics-reader-secret-read" + } + rule { + api_groups = [""] + resources = ["secrets"] + resource_names = [kubernetes_secret.metrics_reader_token.metadata[0].name] + verbs = ["get", "list", "watch"] + } +} + 
+resource "kubernetes_cluster_role_binding" "gmp_secret_reader" { + metadata { + name = "gmp-system:collector:inference-gateway-sa-metrics-reader-secret-read" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role.secret_reader.metadata[0].name + } + subject { + kind = "ServiceAccount" + name = "collector" + namespace = "gmp-system" + } +} + +resource "helm_release" "inference_pool" { + name = "vllm-llama3.1-8b-instruct" + repository = "oci://registry.k8s.io/gateway-api-inference-extension/charts" + chart = "inferencepool" + version = "v1.0.0" + + set { + name = "inferencePool.modelServers.matchLabels.app" + value = "vllm-llama3.1-8b-instruct" + } + set { + name = "provider.name" + value = "gke" + } + set { + name = "healthCheckPolicy.create" + value = "false" + } + depends_on = [kubernetes_deployment.vllm, null_resource.apply_crds] +} + +resource "kubernetes_manifest" "food_review_model" { + manifest = { + "apiVersion" = "inference.networking.k8s.io/v1alpha1" + "kind" = "InferenceObjective" + "metadata" = { + "name" = "food-review" + } + "spec" = { + "priority" = 10 + "poolRef" = { + "name" = "vllm-llama3.1-8b-instruct" + "kind" = "InferencePool" + } + } + } + depends_on = [helm_release.inference_pool] +} + +resource "kubernetes_manifest" "base_model" { + manifest = { + "apiVersion" = "inference.networking.k8s.io/v1alpha1" + "kind" = "InferenceObjective" + "metadata" = { + "name" = "llama3-base-model" + } + "spec" = { + "priority" = 20 + "poolRef" = { + "name" = "vllm-llama3.1-8b-instruct" + "kind" = "InferencePool" + } + } + } + depends_on = [helm_release.inference_pool] +} + +resource "kubernetes_manifest" "health_check_policy" { + manifest = { + "apiVersion" = "networking.gke.io/v1" + "kind" = "HealthCheckPolicy" + "metadata" = { + "name" = "health-check-policy" + "namespace" = "default" + } + "spec" = { + "targetRef" = { + "group" = "inference.networking.k8s.io" + "kind" = "InferencePool" + "name" = 
"vllm-llama3.1-8b-instruct" + } + "default" = { + "config" = { + "type" = "HTTP" + "httpHealthCheck" = { + "requestPath" = "/health" + "port" = 8000 + } + } + } + } + } + depends_on = [helm_release.inference_pool] +} + +resource "kubernetes_manifest" "gateway" { + manifest = { + "apiVersion" = "gateway.networking.k8s.io/v1" + "kind" = "Gateway" + "metadata" = { + "name" = "inference-gateway" + } + "spec" = { + "gatewayClassName" = "gke-l7-regional-external-managed" + "listeners" = [ + { + "protocol" = "HTTP" + "port" = 80 + "name" = "http" + } + ] + } + } + depends_on = [helm_release.inference_pool] +} + +resource "kubernetes_manifest" "http_route" { + manifest = { + "apiVersion" = "gateway.networking.k8s.io/v1" + "kind" = "HTTPRoute" + "metadata" = { + "name" = "my-route" + } + "spec" = { + "parentRefs" = [ + { + "name" = "inference-gateway" + } + ] + "rules" = [ + { + "matches" = [ + { + "path" = { + "type" = "PathPrefix" + "value" = "/" + } + } + ] + "backendRefs" = [ + { + "name" = "vllm-llama3.1-8b-instruct" + "group" = "inference.networking.k8s.io" + "kind" = "InferencePool" + } + ] + } + ] + } + } + depends_on = [kubernetes_manifest.gateway] +} \ No newline at end of file diff --git a/examples/gke_inference_gateway_standard_cluster/variables.tf b/examples/gke_inference_gateway_standard_cluster/variables.tf index c7f8ebcf64..d0b99efafb 100644 --- a/examples/gke_inference_gateway_standard_cluster/variables.tf +++ b/examples/gke_inference_gateway_standard_cluster/variables.tf @@ -16,44 +16,67 @@ variable "project_id" { description = "The project ID to host the cluster in" -} - -variable "cluster_name_suffix" { - description = "A suffix to append to the default cluster name" - default = "" + type = string } variable "region" { description = "The region to host the cluster in" + type = string + default = "us-central1" +} + +variable "zone" { + description = "The zone to host the cluster in" + type = string + default = "us-central1-a" } variable "network" { 
description = "The VPC network to host the cluster in" + type = string } variable "subnetwork" { description = "The subnetwork to host the cluster in" + type = string } variable "ip_range_pods" { - description = "The secondary ip range to use for pods" + description = "The secondary ip range for pods" + type = string } variable "ip_range_services" { - description = "The secondary ip range to use for services" + description = "The secondary ip range for services" + type = string +} + +variable "cluster_name_suffix" { + description = "A suffix to append to the cluster name" + type = string + default = "" } variable "service_account" { - description = "Service account to associate to the nodes in the cluster" + description = "Service account to attach to the node pool." + type = string + default = null } variable "dns_cache" { - description = "Boolean to enable / disable NodeLocal DNSCache " + description = "Enable DNS cache for the cluster" + type = bool default = false } variable "gce_pd_csi_driver" { + description = "Enable GCE Persistent Disk CSI driver" type = bool - description = "(Beta) Whether this cluster should enable the Google Compute Engine Persistent Disk Container Storage Interface (CSI) Driver." 
- default = false + default = true } + +variable "hf_token" { + description = "Hugging Face token" + type = string + sensitive = true +} \ No newline at end of file From dc362edd1d2cbd8b31eb5fa5bc9e7d8cbf60e786 Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Tue, 16 Sep 2025 19:09:13 +0000 Subject: [PATCH 3/3] fix curl command in readme --- examples/gke_inference_gateway_standard_cluster/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/gke_inference_gateway_standard_cluster/README.md b/examples/gke_inference_gateway_standard_cluster/README.md index 777ff237a4..40763e5377 100644 --- a/examples/gke_inference_gateway_standard_cluster/README.md +++ b/examples/gke_inference_gateway_standard_cluster/README.md @@ -62,13 +62,12 @@ This Terraform script automates the deployment of all necessary Kubernetes resou ```bash curl -i -X POST http://${IP}:${PORT}/v1/completions \ -H "Content-Type: application/json" \ - -d - { + -d '{ "model": "food-review", "prompt": "What is a good recipe for a chicken curry?", "max_tokens": 100, "temperature": "0.7" - } + }' ``` ## Cleanup