From 94a594ae33e653115efaa02f54bb076f32c07f11 Mon Sep 17 00:00:00 2001
From: amanpruthi
Date: Tue, 4 Jun 2024 13:29:10 +0530
Subject: [PATCH] feat: added support for stackdriver and otel metrics

---
Notes (text between the "---" line and the first "diff --git" is ignored by
`git am` and is not part of the commit message):

  * var.enable_stackdriver (root, default false) switches on three things:
    the stackdriver/otel Helm values, GKE Workload Identity on the cluster,
    and a dedicated Google service account (GSA) with its IAM bindings.
  * The GSA account id comes from var.account_id (root:
    var.workload_account_id) instead of a hard-coded "stackdriver".
  * roles/iam.workloadIdentityUser is bound on the stackdriver GSA only
    (google_service_account_iam_member), not project-wide.
  * The new submodule variables carry defaults so existing callers of
    modules/app_gke and modules/service_accounts keep working unchanged.
  * Context lines use `terraform fmt` whitespace; if a hunk does not apply
    cleanly, retry with `git apply --ignore-whitespace`. The post-image blob
    ids on the "index" lines were not regenerated.

 main.tf                               | 78 ++++++++++++++++++++++-----
 modules/app_gke/main.tf               | 15 +++++-
 modules/app_gke/variables.tf          |  6 +++
 modules/service_accounts/main.tf      | 28 ++++++++++
 modules/service_accounts/outputs.tf   |  5 ++
 modules/service_accounts/variables.tf | 18 +++++++
 variables.tf                          | 18 +++++++
 7 files changed, 155 insertions(+), 13 deletions(-)

diff --git a/main.tf b/main.tf
index d26147a..62e3e17 100644
--- a/main.tf
+++ b/main.tf
@@ -29,10 +29,13 @@ locals {
 }
 
 module "service_accounts" {
-  source      = "./modules/service_accounts"
-  namespace   = var.namespace
-  bucket_name = var.bucket_name
-  depends_on  = [module.project_factory_project_services]
+  source               = "./modules/service_accounts"
+  namespace            = var.namespace
+  bucket_name          = var.bucket_name
+  account_id           = var.workload_account_id
+  service_account_name = var.service_account_name
+  enable_stackdriver   = var.enable_stackdriver
+  depends_on           = [module.project_factory_project_services]
 }
 
 module "kms" {
@@ -77,14 +80,15 @@ locals {
 }
 
 module "app_gke" {
-  source          = "./modules/app_gke"
-  namespace       = var.namespace
-  machine_type    = coalesce(try(local.deployment_size[var.size].node_instance, null), var.gke_machine_type)
-  node_count      = coalesce(try(local.deployment_size[var.size].node_count, null), var.gke_node_count)
-  network         = local.network
-  subnetwork      = local.subnetwork
-  service_account = module.service_accounts.service_account
-  depends_on      = [module.project_factory_project_services]
+  source                   = "./modules/app_gke"
+  namespace                = var.namespace
+  machine_type             = coalesce(try(local.deployment_size[var.size].node_instance, null), var.gke_machine_type)
+  node_count               = coalesce(try(local.deployment_size[var.size].node_count, null), var.gke_node_count)
+  network                  = local.network
+  subnetwork               = local.subnetwork
+  service_account          = module.service_accounts.service_account
+  create_workload_identity = var.enable_stackdriver
+  depends_on               = [module.project_factory_project_services]
 }
 
 module "app_lb" {
@@ -186,6 +190,8 @@ locals {
   } : {}
 }
 
+data "google_client_config" "current" {}
+
 module "wandb" {
   source  = "wandb/wandb/helm"
   version = "1.2.0"
@@ -241,6 +247,54 @@ module "wandb" {
           "ingress.gcp.kubernetes.io/pre-shared-cert" = module.app_lb.certificate
         }
       }
+      # otel RDS and Redis metrics rely on the stackdriver subchart; requires operator-wandb chart 0.13.8 or newer.
+      stackdriver = var.enable_stackdriver ? {
+        install = true
+        stackdriver = {
+          projectId = data.google_client_config.current.project
+        }
+        serviceAccount = { annotations = { "iam.gke.io/gcp-service-account" = module.service_accounts.monitoring_role } }
+      } : {
+        install        = false
+        stackdriver    = {}
+        serviceAccount = {}
+      }
+
+      otel = {
+        daemonset = var.enable_stackdriver ? {
+          config = {
+            receivers = {
+              prometheus = {
+                config = {
+                  scrape_configs = [
+                    { job_name = "stackdriver"
+                      scheme       = "http"
+                      metrics_path = "/metrics"
+                      dns_sd_configs = [
+                        { names = ["stackdriver"]
+                          type = "A"
+                          port = 9255
+                        }
+                      ]
+                    }
+                  ]
+                }
+              }
+            }
+            service = {
+              pipelines = {
+                metrics = {
+                  receivers = ["hostmetrics", "k8s_cluster", "kubeletstats", "prometheus"]
+                }
+              }
+            }
+          }
+        } : { config = {
+          receivers = {}
+          service   = {}
+          }
+        }
+      }
 
       redis = { install = false }
       mysql = { install = false }
diff --git a/modules/app_gke/main.tf b/modules/app_gke/main.tf
index e57cbea..5027a22 100644
--- a/modules/app_gke/main.tf
+++ b/modules/app_gke/main.tf
@@ -1,3 +1,9 @@
+data "google_client_config" "current" {}
+
+locals {
+  project_id = data.google_client_config.current.project
+}
+
 resource "google_container_cluster" "default" {
   name = "${var.namespace}-cluster"
 
@@ -11,7 +17,14 @@ resource "google_container_cluster" "default" {
     evaluation_mode = "PROJECT_SINGLETON_POLICY_ENFORCE"
   }
 
-  
+  # Conditionally enable workload identity
+  dynamic "workload_identity_config" {
+    for_each = var.create_workload_identity == true ? [1] : []
+    content {
+      workload_pool = "${local.project_id}.svc.id.goog"
+    }
+  }
+
   ip_allocation_policy {
     cluster_ipv4_cidr_block  = "/14"
     services_ipv4_cidr_block = "/19"
diff --git a/modules/app_gke/variables.tf b/modules/app_gke/variables.tf
index a9ec740..fa502bb 100644
--- a/modules/app_gke/variables.tf
+++ b/modules/app_gke/variables.tf
@@ -43,4 +43,10 @@ variable "parquet_wandb_env" {
 
 variable "node_count" {
   type = number
+}
+
+variable "create_workload_identity" {
+  description = "Whether to enable GKE Workload Identity on the cluster."
+  type        = bool
+  default     = false
 }
\ No newline at end of file
diff --git a/modules/service_accounts/main.tf b/modules/service_accounts/main.tf
index 724e7d7..ca85630 100644
--- a/modules/service_accounts/main.tf
+++ b/modules/service_accounts/main.tf
@@ -60,3 +60,31 @@ resource "google_project_iam_member" "secretmanager_admin" {
   member = local.sa_member
   role   = "roles/secretmanager.admin"
 }
+
+# Google service account (GSA) referenced by the stackdriver chart's
+# "iam.gke.io/gcp-service-account" annotation. Created only when
+# var.enable_stackdriver is true.
+resource "google_service_account" "workload-identity-user-sa" {
+  count        = var.enable_stackdriver == true ? 1 : 0
+  account_id   = var.account_id
+  display_name = "Service Account For Workload Identity"
+}
+
+# Grants the GSA roles/monitoring.viewer on the project.
+resource "google_project_iam_member" "monitoring-role" {
+  count   = var.enable_stackdriver == true ? 1 : 0
+  project = local.project_id
+  role    = "roles/monitoring.viewer"
+  member  = "serviceAccount:${google_service_account.workload-identity-user-sa[count.index].email}"
+}
+
+# Lets the Kubernetes service account default/<service_account_name> act as
+# the GSA above. The binding is placed on that GSA only: granting
+# roles/iam.workloadIdentityUser at project level would allow the Kubernetes
+# service account to impersonate every service account in the project.
+resource "google_service_account_iam_member" "workload_identity-role" {
+  count              = var.enable_stackdriver == true ? 1 : 0
+  service_account_id = google_service_account.workload-identity-user-sa[count.index].name
+  role               = "roles/iam.workloadIdentityUser"
+  member             = "serviceAccount:${local.project_id}.svc.id.goog[default/${var.service_account_name}]"
+}
diff --git a/modules/service_accounts/outputs.tf b/modules/service_accounts/outputs.tf
index 0ed66fa..ba84de5 100644
--- a/modules/service_accounts/outputs.tf
+++ b/modules/service_accounts/outputs.tf
@@ -2,4 +2,9 @@ output "service_account" {
   value = google_service_account.main
 
   description = "The service account."
+}
+
+output "monitoring_role" {
+  description = "Email of the stackdriver Google service account, or null when enable_stackdriver is false."
+  value       = var.enable_stackdriver == true ? google_service_account.workload-identity-user-sa[0].email : null
 }
\ No newline at end of file
diff --git a/modules/service_accounts/variables.tf b/modules/service_accounts/variables.tf
index e4d4bb8..6cc7675 100644
--- a/modules/service_accounts/variables.tf
+++ b/modules/service_accounts/variables.tf
@@ -7,4 +7,22 @@ variable "bucket_name" {
   type        = string
   description = "Existing bucket the service account will access"
   default     = ""
+}
+
+variable "account_id" {
+  description = "Account ID of the Google service account created for stackdriver when enable_stackdriver is true."
+  type        = string
+  default     = "stackdriver"
+}
+
+variable "service_account_name" {
+  description = "Name of the Kubernetes service account (namespace default) allowed to act as the stackdriver Google service account."
+  type        = string
+  default     = "stackdriver"
+}
+
+variable "enable_stackdriver" {
+  description = "Whether to create the stackdriver Google service account and its IAM bindings."
+  type        = bool
+  default     = false
 }
\ No newline at end of file
diff --git a/variables.tf b/variables.tf
index a2cbff8..57aa658 100644
--- a/variables.tf
+++ b/variables.tf
@@ -253,3 +253,21 @@ variable "parquet_wandb_env" {
   description = "Extra environment variables for W&B"
   default     = {}
 }
+
+variable "enable_stackdriver" {
+  description = "Enable the stackdriver subchart and otel scrape config, GKE Workload Identity, and the supporting Google service account."
+  type        = bool
+  default     = false
+}
+
+variable "workload_account_id" {
+  description = "Account ID of the Google service account created for stackdriver when enable_stackdriver is true."
+  type        = string
+  default     = "stackdriver"
+}
+
+variable "service_account_name" {
+  description = "Name of the Kubernetes service account bound to the stackdriver Google service account through Workload Identity."
+  type        = string
+  default     = "stackdriver"
+}