From 71b26382652694603f3b47a6494a206a8faa64b2 Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Mon, 15 Sep 2025 18:20:01 +0000 Subject: [PATCH 1/3] copy of gke standard cluster --- .../README.md | 41 +++++++ .../main.tf | 103 ++++++++++++++++++ .../outputs.tf | 57 ++++++++++ .../variables.tf | 59 ++++++++++ .../versions.tf | 30 +++++ 5 files changed, 290 insertions(+) create mode 100644 examples/gke_inference_gateway_standard_cluster/README.md create mode 100644 examples/gke_inference_gateway_standard_cluster/main.tf create mode 100644 examples/gke_inference_gateway_standard_cluster/outputs.tf create mode 100644 examples/gke_inference_gateway_standard_cluster/variables.tf create mode 100644 examples/gke_inference_gateway_standard_cluster/versions.tf diff --git a/examples/gke_inference_gateway_standard_cluster/README.md b/examples/gke_inference_gateway_standard_cluster/README.md new file mode 100644 index 0000000000..926a01b23b --- /dev/null +++ b/examples/gke_inference_gateway_standard_cluster/README.md @@ -0,0 +1,41 @@ +# GKE Standard Cluster and Node Pool + +This example creates a GKE private cluster and Node Pool with beta features. +For a full example see [simple_regional_private_beta](../simple_regional_private_beta/README.md) example. + + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| cluster\_name\_suffix | A suffix to append to the default cluster name | `string` | `""` | no | +| dns\_cache | Boolean to enable / disable NodeLocal DNSCache | `bool` | `false` | no | +| gce\_pd\_csi\_driver | (Beta) Whether this cluster should enable the Google Compute Engine Persistent Disk Container Storage Interface (CSI) Driver. 
| `bool` | `false` | no | +| ip\_range\_pods | The secondary ip range to use for pods | `any` | n/a | yes | +| ip\_range\_services | The secondary ip range to use for services | `any` | n/a | yes | +| network | The VPC network to host the cluster in | `any` | n/a | yes | +| project\_id | The project ID to host the cluster in | `any` | n/a | yes | +| region | The region to host the cluster in | `any` | n/a | yes | +| service\_account | Service account to associate to the nodes in the cluster | `any` | n/a | yes | +| subnetwork | The subnetwork to host the cluster in | `any` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| addons\_config | The configuration for addons supported by GKE Autopilot. | +| ca\_certificate | The cluster ca certificate (base64 encoded) | +| cluster\_name | Cluster name | +| endpoint | The cluster endpoint | +| location | Cluster location | +| master\_version | The master Kubernetes version | +| node\_locations | Cluster node locations | +| project\_id | The project ID the cluster is in | + + + +To provision this example, run the following from within this directory: +- `terraform init` to get the plugins +- `terraform plan` to see the infrastructure plan +- `terraform apply` to apply the infrastructure build +- `terraform destroy` to destroy the built infrastructure diff --git a/examples/gke_inference_gateway_standard_cluster/main.tf b/examples/gke_inference_gateway_standard_cluster/main.tf new file mode 100644 index 0000000000..d3927472e6 --- /dev/null +++ b/examples/gke_inference_gateway_standard_cluster/main.tf @@ -0,0 +1,103 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + cluster_type = "gke-standard" + default_workload_pool = "${var.project_id}.svc.id.goog" +} + +data "google_client_config" "default" {} + +provider "kubernetes" { + host = "https://${module.gke.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(module.gke.ca_certificate) +} + +data "google_compute_subnetwork" "subnetwork" { + name = var.subnetwork + project = var.project_id + region = var.region +} + +module "gke" { + source = "terraform-google-modules/kubernetes-engine/google//modules/gke-standard-cluster" + version = "~> 38.0" + + project_id = var.project_id + name = "${local.cluster_type}-cluster${var.cluster_name_suffix}" + location = var.region + network = var.network + subnetwork = var.subnetwork + + ip_allocation_policy = { + cluster_secondary_range_name = var.ip_range_pods + services_secondary_range_name = var.ip_range_services + } + + private_cluster_config = { + enable_private_endpoint = true + enable_private_nodes = true + master_ipv4_cidr_block = "172.16.0.0/28" + master_global_access_config = { + enabled = true + } + } + + deletion_protection = false + remove_default_node_pool = true + initial_node_count = 1 + + workload_identity_config = { + workload_pool = local.default_workload_pool + } + + master_authorized_networks_config = { + cidr_blocks = [{ + cidr_block = data.google_compute_subnetwork.subnetwork.ip_cidr_range + display_name = "VPC" + }] + } + + addons_config = { + dns_cache_config = { + enabled = var.dns_cache + } + + 
gce_persistent_disk_csi_driver_config = { + enabled = var.gce_pd_csi_driver + } + } +} + +module "node_pool" { + source = "terraform-google-modules/kubernetes-engine/google//modules/gke-node-pool" + version = "~> 38.0" + + project_id = var.project_id + location = var.region + cluster = module.gke.cluster_name + node_config = { + disk_size_gb = 100 + disk_type = "pd-standard" + image_type = "COS_CONTAINERD" + machine_type = "e2-medium" + service_account = var.service_account + workload_metadata_config = { + mode = "GKE_METADATA" + } + } +} diff --git a/examples/gke_inference_gateway_standard_cluster/outputs.tf b/examples/gke_inference_gateway_standard_cluster/outputs.tf new file mode 100644 index 0000000000..7f97dde8a9 --- /dev/null +++ b/examples/gke_inference_gateway_standard_cluster/outputs.tf @@ -0,0 +1,57 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+output "endpoint" {
+  sensitive   = true
+  description = "The cluster endpoint"
+  value       = module.gke.endpoint
+}
+
+output "ca_certificate" {
+  sensitive   = true
+  description = "The cluster ca certificate (base64 encoded)"
+  value       = module.gke.ca_certificate
+}
+
+output "project_id" {
+  description = "The project ID the cluster is in"
+  value       = var.project_id
+}
+
+output "location" {
+  description = "Cluster location"
+  value       = module.gke.location
+}
+
+output "node_locations" {
+  description = "Cluster node locations"
+  value       = module.gke.node_locations
+}
+
+output "addons_config" {
+  description = "The configuration for addons enabled on the cluster."
+  value       = module.gke.addons_config
+}
+
+output "cluster_name" {
+  description = "Cluster name"
+  value       = module.gke.cluster_name
+}
+
+output "master_version" {
+  description = "The master Kubernetes version"
+  value       = module.gke.master_version
+}
diff --git a/examples/gke_inference_gateway_standard_cluster/variables.tf b/examples/gke_inference_gateway_standard_cluster/variables.tf
new file mode 100644
index 0000000000..c7f8ebcf64
--- /dev/null
+++ b/examples/gke_inference_gateway_standard_cluster/variables.tf
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +variable "project_id" { + description = "The project ID to host the cluster in" +} + +variable "cluster_name_suffix" { + description = "A suffix to append to the default cluster name" + default = "" +} + +variable "region" { + description = "The region to host the cluster in" +} + +variable "network" { + description = "The VPC network to host the cluster in" +} + +variable "subnetwork" { + description = "The subnetwork to host the cluster in" +} + +variable "ip_range_pods" { + description = "The secondary ip range to use for pods" +} + +variable "ip_range_services" { + description = "The secondary ip range to use for services" +} + +variable "service_account" { + description = "Service account to associate to the nodes in the cluster" +} + +variable "dns_cache" { + description = "Boolean to enable / disable NodeLocal DNSCache " + default = false +} + +variable "gce_pd_csi_driver" { + type = bool + description = "(Beta) Whether this cluster should enable the Google Compute Engine Persistent Disk Container Storage Interface (CSI) Driver." + default = false +} diff --git a/examples/gke_inference_gateway_standard_cluster/versions.tf b/examples/gke_inference_gateway_standard_cluster/versions.tf new file mode 100644 index 0000000000..220cbfdb31 --- /dev/null +++ b/examples/gke_inference_gateway_standard_cluster/versions.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +terraform { + required_version = ">= 1.3" + required_providers { + google = { + source = "hashicorp/google" + } + google-beta = { + source = "hashicorp/google-beta" + } + kubernetes = { + source = "hashicorp/kubernetes" + } + } +} From 8ebbc560153f00d0649cfb896e552524df360d03 Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Tue, 16 Sep 2025 19:05:32 +0000 Subject: [PATCH 2/3] update to ga release --- .../README.md | 121 +++-- .../main.tf | 473 +++++++++++++++++- .../variables.tf | 45 +- 3 files changed, 564 insertions(+), 75 deletions(-) diff --git a/examples/gke_inference_gateway_standard_cluster/README.md b/examples/gke_inference_gateway_standard_cluster/README.md index 926a01b23b..777ff237a4 100644 --- a/examples/gke_inference_gateway_standard_cluster/README.md +++ b/examples/gke_inference_gateway_standard_cluster/README.md @@ -1,41 +1,80 @@ -# GKE Standard Cluster and Node Pool - -This example creates a GKE private cluster and Node Pool with beta features. -For a full example see [simple_regional_private_beta](../simple_regional_private_beta/README.md) example. - - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| cluster\_name\_suffix | A suffix to append to the default cluster name | `string` | `""` | no | -| dns\_cache | Boolean to enable / disable NodeLocal DNSCache | `bool` | `false` | no | -| gce\_pd\_csi\_driver | (Beta) Whether this cluster should enable the Google Compute Engine Persistent Disk Container Storage Interface (CSI) Driver. 
| `bool` | `false` | no | -| ip\_range\_pods | The secondary ip range to use for pods | `any` | n/a | yes | -| ip\_range\_services | The secondary ip range to use for services | `any` | n/a | yes | -| network | The VPC network to host the cluster in | `any` | n/a | yes | -| project\_id | The project ID to host the cluster in | `any` | n/a | yes | -| region | The region to host the cluster in | `any` | n/a | yes | -| service\_account | Service account to associate to the nodes in the cluster | `any` | n/a | yes | -| subnetwork | The subnetwork to host the cluster in | `any` | n/a | yes | - -## Outputs - -| Name | Description | -|------|-------------| -| addons\_config | The configuration for addons supported by GKE Autopilot. | -| ca\_certificate | The cluster ca certificate (base64 encoded) | -| cluster\_name | Cluster name | -| endpoint | The cluster endpoint | -| location | Cluster location | -| master\_version | The master Kubernetes version | -| node\_locations | Cluster node locations | -| project\_id | The project ID the cluster is in | - - - -To provision this example, run the following from within this directory: -- `terraform init` to get the plugins -- `terraform plan` to see the infrastructure plan -- `terraform apply` to apply the infrastructure build -- `terraform destroy` to destroy the built infrastructure +# GKE Inference Gateway Example + +This example provisions a GKE Standard cluster and a node pool with H100 GPUs, suitable for deploying and serving Large Language Models (LLMs) using the GKE Inference Gateway. + +The cluster is configured with: +- GKE Gateway API enabled. +- Managed Prometheus for monitoring. +- DCGM for GPU monitoring. +- A dedicated node pool with NVIDIA H100 80GB GPUs. + +This Terraform script automates the deployment of all necessary Kubernetes resources, including: +- Authorization for metrics scraping. +- A vLLM model server for a Llama3.1 model. +- GKE Inference Gateway CRDs. 
+- GKE Inference Gateway resources (`InferencePool`, `InferenceObjective`, `Gateway`, `HTTPRoute`).
+
+## Usage
+
+1. **Enable APIs**
+
+   ```bash
+   gcloud services enable container.googleapis.com
+   ```
+
+2. **Set up your environment**
+
+   Terraform reads variables from `TF_VAR_`-prefixed environment variables; values for the remaining variables in `variables.tf` (for example `network` and `subnetwork`) can be provided in a `terraform.tfvars` file.
+
+   ```bash
+   export TF_VAR_project_id="your-project-id"
+   export TF_VAR_region="us-central1"
+   export TF_VAR_cluster_name_suffix="-inference"
+   export TF_VAR_hf_token="your-hugging-face-token"
+   ```
+
+3. **Run Terraform**
+
+   The `terraform apply` command will provision the GKE cluster and deploy all the necessary Kubernetes resources.
+
+   ```bash
+   terraform init
+   terraform apply
+   ```
+
+4. **Configure kubectl**
+
+   After the apply is complete, configure `kubectl` to communicate with your new cluster.
+
+   ```bash
+   gcloud container clusters get-credentials $(terraform output -raw cluster_name) --region $(terraform output -raw location)
+   ```
+
+5. **Send an inference request**
+
+   Get the Gateway IP address:
+   ```bash
+   IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+   PORT=80
+   ```
+
+   Send a request:
+   ```bash
+   curl -i -X POST http://${IP}:${PORT}/v1/completions \
+   -H "Content-Type: application/json" \
+   -d
+   {
+     "model": "food-review",
+     "prompt": "What is a good recipe for a chicken curry?",
+     "max_tokens": 100,
+     "temperature": "0.7"
+   }
+   ```
+
+## Cleanup
+
+Running `terraform destroy` will deprovision the GKE cluster and all associated Kubernetes resources.
+ +```bash +terraform destroy +``` \ No newline at end of file diff --git a/examples/gke_inference_gateway_standard_cluster/main.tf b/examples/gke_inference_gateway_standard_cluster/main.tf index d3927472e6..baba283c16 100644 --- a/examples/gke_inference_gateway_standard_cluster/main.tf +++ b/examples/gke_inference_gateway_standard_cluster/main.tf @@ -27,6 +27,14 @@ provider "kubernetes" { cluster_ca_certificate = base64decode(module.gke.ca_certificate) } +provider "helm" { + kubernetes { + host = "https://${module.gke.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(module.gke.ca_certificate) + } +} + data "google_compute_subnetwork" "subnetwork" { name = var.subnetwork project = var.project_id @@ -35,44 +43,40 @@ data "google_compute_subnetwork" "subnetwork" { module "gke" { source = "terraform-google-modules/kubernetes-engine/google//modules/gke-standard-cluster" - version = "~> 38.0" + version = "~> 39.0" project_id = var.project_id name = "${local.cluster_type}-cluster${var.cluster_name_suffix}" location = var.region network = var.network subnetwork = var.subnetwork + release_channel = "RAPID" + gateway_api_config = { + channel = "CHANNEL_STANDARD" + } + monitoring_config = { + enable_managed_prometheus = true + enabled_components = ["SYSTEM_COMPONENTS", "DCGM"] + } + logging_service = "logging.googleapis.com/kubernetes" + ip_allocation_policy = { cluster_secondary_range_name = var.ip_range_pods services_secondary_range_name = var.ip_range_services } - private_cluster_config = { - enable_private_endpoint = true - enable_private_nodes = true - master_ipv4_cidr_block = "172.16.0.0/28" - master_global_access_config = { - enabled = true - } - } - deletion_protection = false - remove_default_node_pool = true - initial_node_count = 1 + remove_default_node_pool = false workload_identity_config = { workload_pool = local.default_workload_pool } - master_authorized_networks_config = { - cidr_blocks = [{ - 
cidr_block = data.google_compute_subnetwork.subnetwork.ip_cidr_range - display_name = "VPC" - }] - } - addons_config = { + http_load_balancing = { + enabled = true + } dns_cache_config = { enabled = var.dns_cache } @@ -81,23 +85,446 @@ module "gke" { enabled = var.gce_pd_csi_driver } } + enable_shielded_nodes = true } module "node_pool" { source = "terraform-google-modules/kubernetes-engine/google//modules/gke-node-pool" - version = "~> 38.0" + version = "~> 39.0" project_id = var.project_id - location = var.region + location = var.zone cluster = module.gke.cluster_name + name = "gpupool" + node_count = 1 + node_config = { - disk_size_gb = 100 + disk_size_gb = 200 disk_type = "pd-standard" image_type = "COS_CONTAINERD" - machine_type = "e2-medium" + machine_type = "a3-highgpu-2g" service_account = var.service_account + guest_accelerator = { + type = "nvidia-h100-80gb" + count = 2 + } + gpu_driver_installation_config = { + gpu_driver_version = "LATEST" + } workload_metadata_config = { mode = "GKE_METADATA" } } } + +resource "kubernetes_secret" "hf_secret" { + metadata { + name = "hf-token" + } + data = { + token = var.hf_token + } + type = "Opaque" +} + +resource "kubernetes_config_map" "vllm_adapters" { + metadata { + name = "vllm-llama3.1-8b-adapters" + } + data = { + "configmap.yaml" = <<-EOT + vLLMLoRAConfig: + name: vllm-llama3.1-8b-instruct + port: 8000 + defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + ensureExist: + models: + - id: food-review + source: Kawon/llama3.1-food-finetune_v14_r8 + - id: cad-fabricator + source: redcathode/fabricator + EOT + } +} + +resource "kubernetes_deployment" "vllm" { + metadata { + name = "vllm-llama3.1-8b-instruct" + } + spec { + replicas = 3 + selector { + match_labels = { + app = "vllm-llama3.1-8b-instruct" + } + } + template { + metadata { + labels = { + app = "vllm-llama3.1-8b-instruct" + } + } + spec { + termination_grace_period_seconds = 130 + enable_service_links = false + container { + name = "vllm" + image = 
"vllm/vllm-openai:latest" + image_pull_policy = "Always" + command = ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args = [ + "--model", "meta-llama/Llama-3.1-8B-Instruct", + "--tensor-parallel-size", "1", + "--port", "8000", + "--enable-lora", + "--max-loras", "2", + "--max-cpu-loras", "12" + ] + port { + container_port = 8000 + name = "http" + protocol = "TCP" + } + env { + name = "VLLM_USE_V1" + value = "1" + } + env { + name = "PORT" + value = "8000" + } + env { + name = "HUGGING_FACE_HUB_TOKEN" + value_from { + secret_key_ref { + name = kubernetes_secret.hf_secret.metadata[0].name + key = "token" + } + } + } + env { + name = "VLLM_ALLOW_RUNTIME_LORA_UPDATING" + value = "true" + } + lifecycle { + pre_stop { + exec { + command = ["/bin/sh", "-c", "sleep 30"] + } + } + } + resources { + limits = { + "nvidia.com/gpu" = 1 + } + requests = { + "nvidia.com/gpu" = 1 + } + } + liveness_probe { + http_get { + path = "/health" + port = "http" + scheme = "HTTP" + } + period_seconds = 1 + success_threshold = 1 + failure_threshold = 5 + timeout_seconds = 1 + } + readiness_probe { + http_get { + path = "/health" + port = "http" + scheme = "HTTP" + } + period_seconds = 1 + success_threshold = 1 + failure_threshold = 1 + timeout_seconds = 1 + } + startup_probe { + http_get { + path = "/health" + port = "http" + scheme = "HTTP" + } + failure_threshold = 600 + initial_delay_seconds = 2 + period_seconds = 1 + } + volume_mount { + mount_path = "/data" + name = "data" + } + volume_mount { + mount_path = "/dev/shm" + name = "shm" + } + volume_mount { + mount_path = "/adapters" + name = "adapters" + } + } + container { + name = "lora-adapter-syncer" + image = "us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main" + image_pull_policy = "Always" + env { + name = "DYNAMIC_LORA_ROLLOUT_CONFIG" + value = "/config/configmap.yaml" + } + volume_mount { + name = "config-volume" + mount_path = "/config" + } + } + volume { + name = "data" + 
empty_dir {} + } + volume { + name = "shm" + empty_dir { + medium = "Memory" + } + } + volume { + name = "adapters" + empty_dir {} + } + volume { + name = "config-volume" + config_map { + name = kubernetes_config_map.vllm_adapters.metadata[0].name + } + } + node_selector = { + "cloud.google.com/gke-accelerator" = "nvidia-h100-80gb" + } + } + } + } +} + +resource "null_resource" "apply_crds" { + provisioner "local-exec" { + command = "kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.0.0/manifests.yaml" + } + depends_on = [module.gke] +} + +resource "kubernetes_cluster_role" "metrics_reader" { + metadata { + name = "inference-gateway-metrics-reader" + } + rule { + non_resource_urls = ["/metrics"] + verbs = ["get"] + } +} + +resource "kubernetes_service_account" "metrics_reader" { + metadata { + name = "inference-gateway-sa-metrics-reader" + namespace = "default" + } +} + +resource "kubernetes_cluster_role_binding" "metrics_reader" { + metadata { + name = "inference-gateway-sa-metrics-reader-role-binding" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role.metrics_reader.metadata[0].name + } + subject { + kind = "ServiceAccount" + name = kubernetes_service_account.metrics_reader.metadata[0].name + namespace = "default" + } +} + +resource "kubernetes_secret" "metrics_reader_token" { + metadata { + name = "inference-gateway-sa-metrics-reader-secret" + namespace = "default" + annotations = { + "kubernetes.io/service-account.name" = kubernetes_service_account.metrics_reader.metadata[0].name + } + } + type = "kubernetes.io/service-account-token" +} + +resource "kubernetes_cluster_role" "secret_reader" { + metadata { + name = "inference-gateway-sa-metrics-reader-secret-read" + } + rule { + api_groups = [""] + resources = ["secrets"] + resource_names = [kubernetes_secret.metrics_reader_token.metadata[0].name] + verbs = ["get", "list", "watch"] + } +} + 
+resource "kubernetes_cluster_role_binding" "gmp_secret_reader" { + metadata { + name = "gmp-system:collector:inference-gateway-sa-metrics-reader-secret-read" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role.secret_reader.metadata[0].name + } + subject { + kind = "ServiceAccount" + name = "collector" + namespace = "gmp-system" + } +} + +resource "helm_release" "inference_pool" { + name = "vllm-llama3.1-8b-instruct" + repository = "oci://registry.k8s.io/gateway-api-inference-extension/charts" + chart = "inferencepool" + version = "v1.0.0" + + set { + name = "inferencePool.modelServers.matchLabels.app" + value = "vllm-llama3.1-8b-instruct" + } + set { + name = "provider.name" + value = "gke" + } + set { + name = "healthCheckPolicy.create" + value = "false" + } + depends_on = [kubernetes_deployment.vllm, null_resource.apply_crds] +} + +resource "kubernetes_manifest" "food_review_model" { + manifest = { + "apiVersion" = "inference.networking.k8s.io/v1alpha1" + "kind" = "InferenceObjective" + "metadata" = { + "name" = "food-review" + } + "spec" = { + "priority" = 10 + "poolRef" = { + "name" = "vllm-llama3.1-8b-instruct" + "kind" = "InferencePool" + } + } + } + depends_on = [helm_release.inference_pool] +} + +resource "kubernetes_manifest" "base_model" { + manifest = { + "apiVersion" = "inference.networking.k8s.io/v1alpha1" + "kind" = "InferenceObjective" + "metadata" = { + "name" = "llama3-base-model" + } + "spec" = { + "priority" = 20 + "poolRef" = { + "name" = "vllm-llama3.1-8b-instruct" + "kind" = "InferencePool" + } + } + } + depends_on = [helm_release.inference_pool] +} + +resource "kubernetes_manifest" "health_check_policy" { + manifest = { + "apiVersion" = "networking.gke.io/v1" + "kind" = "HealthCheckPolicy" + "metadata" = { + "name" = "health-check-policy" + "namespace" = "default" + } + "spec" = { + "targetRef" = { + "group" = "inference.networking.k8s.io" + "kind" = "InferencePool" + "name" = 
"vllm-llama3.1-8b-instruct" + } + "default" = { + "config" = { + "type" = "HTTP" + "httpHealthCheck" = { + "requestPath" = "/health" + "port" = 8000 + } + } + } + } + } + depends_on = [helm_release.inference_pool] +} + +resource "kubernetes_manifest" "gateway" { + manifest = { + "apiVersion" = "gateway.networking.k8s.io/v1" + "kind" = "Gateway" + "metadata" = { + "name" = "inference-gateway" + } + "spec" = { + "gatewayClassName" = "gke-l7-regional-external-managed" + "listeners" = [ + { + "protocol" = "HTTP" + "port" = 80 + "name" = "http" + } + ] + } + } + depends_on = [helm_release.inference_pool] +} + +resource "kubernetes_manifest" "http_route" { + manifest = { + "apiVersion" = "gateway.networking.k8s.io/v1" + "kind" = "HTTPRoute" + "metadata" = { + "name" = "my-route" + } + "spec" = { + "parentRefs" = [ + { + "name" = "inference-gateway" + } + ] + "rules" = [ + { + "matches" = [ + { + "path" = { + "type" = "PathPrefix" + "value" = "/" + } + } + ] + "backendRefs" = [ + { + "name" = "vllm-llama3.1-8b-instruct" + "group" = "inference.networking.k8s.io" + "kind" = "InferencePool" + } + ] + } + ] + } + } + depends_on = [kubernetes_manifest.gateway] +} \ No newline at end of file diff --git a/examples/gke_inference_gateway_standard_cluster/variables.tf b/examples/gke_inference_gateway_standard_cluster/variables.tf index c7f8ebcf64..d0b99efafb 100644 --- a/examples/gke_inference_gateway_standard_cluster/variables.tf +++ b/examples/gke_inference_gateway_standard_cluster/variables.tf @@ -16,44 +16,67 @@ variable "project_id" { description = "The project ID to host the cluster in" -} - -variable "cluster_name_suffix" { - description = "A suffix to append to the default cluster name" - default = "" + type = string } variable "region" { description = "The region to host the cluster in" + type = string + default = "us-central1" +} + +variable "zone" { + description = "The zone to host the cluster in" + type = string + default = "us-central1-a" } variable "network" { 
description = "The VPC network to host the cluster in" + type = string } variable "subnetwork" { description = "The subnetwork to host the cluster in" + type = string } variable "ip_range_pods" { - description = "The secondary ip range to use for pods" + description = "The secondary ip range for pods" + type = string } variable "ip_range_services" { - description = "The secondary ip range to use for services" + description = "The secondary ip range for services" + type = string +} + +variable "cluster_name_suffix" { + description = "A suffix to append to the cluster name" + type = string + default = "" } variable "service_account" { - description = "Service account to associate to the nodes in the cluster" + description = "Service account to attach to the node pool." + type = string + default = null } variable "dns_cache" { - description = "Boolean to enable / disable NodeLocal DNSCache " + description = "Enable DNS cache for the cluster" + type = bool default = false } variable "gce_pd_csi_driver" { + description = "Enable GCE Persistent Disk CSI driver" type = bool - description = "(Beta) Whether this cluster should enable the Google Compute Engine Persistent Disk Container Storage Interface (CSI) Driver." 
- default = false + default = true } + +variable "hf_token" { + description = "Hugging Face token" + type = string + sensitive = true +} \ No newline at end of file From dc362edd1d2cbd8b31eb5fa5bc9e7d8cbf60e786 Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Tue, 16 Sep 2025 19:09:13 +0000 Subject: [PATCH 3/3] fix curl command in readme --- examples/gke_inference_gateway_standard_cluster/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/gke_inference_gateway_standard_cluster/README.md b/examples/gke_inference_gateway_standard_cluster/README.md index 777ff237a4..40763e5377 100644 --- a/examples/gke_inference_gateway_standard_cluster/README.md +++ b/examples/gke_inference_gateway_standard_cluster/README.md @@ -62,13 +62,12 @@ This Terraform script automates the deployment of all necessary Kubernetes resou ```bash curl -i -X POST http://${IP}:${PORT}/v1/completions \ -H "Content-Type: application/json" \ - -d - { + -d '{ "model": "food-review", "prompt": "What is a good recipe for a chicken curry?", "max_tokens": 100, "temperature": "0.7" - } + }' ``` ## Cleanup