From 9cbe121b11baa0d6bb90036baeea36f82de7995c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Trkan?= Date: Sun, 5 Oct 2025 01:23:22 +0200 Subject: [PATCH] fix(infrastructure): prometheus --- tofu/main.tf | 35 ++-- tofu/modules/cloudflare/cluster-tunnel.yaml | 4 +- tofu/modules/cloudflare/main.tf | 8 +- .../maxscale/charts/maxscale-helm/Chart.yaml | 2 +- .../maxscale-helm/templates/config.yaml | 24 ++- .../templates/phpmyadmin-deployment.yaml | 2 +- tofu/modules/maxscale/main.tf | 2 +- tofu/modules/prometheus/grafana-ui.yaml | 14 ++ tofu/modules/prometheus/main.tf | 66 ++++++ tofu/modules/prometheus/values.yaml | 189 ++++++++++++++++++ tofu/modules/prometheus/variables.tf | 5 + tofu/modules/rabbitmq/main.tf | 6 +- 12 files changed, 332 insertions(+), 25 deletions(-) create mode 100644 tofu/modules/prometheus/grafana-ui.yaml create mode 100644 tofu/modules/prometheus/main.tf create mode 100644 tofu/modules/prometheus/values.yaml create mode 100644 tofu/modules/prometheus/variables.tf diff --git a/tofu/main.tf b/tofu/main.tf index e701d35..6919b8b 100644 --- a/tofu/main.tf +++ b/tofu/main.tf @@ -53,7 +53,8 @@ module "loadbalancer" { } module "cert-manager" { - source = "${path.module}/modules/cert-manager" + source = "${path.module}/modules/cert-manager" + depends_on = [module.loadbalancer] } module "cloudflare" { @@ -67,10 +68,16 @@ module "cloudflare" { cloudflare_account_id = var.cloudflare_account_id } +module "monitoring" { + source = "${path.module}/modules/prometheus" + depends_on = [module.cloudflare] + cloudflare_domain = var.cloudflare_domain +} + module "database" { source = "${path.module}/modules/maxscale" - depends_on = [module.storage, module.loadbalancer, module.cloudflare] + depends_on = [module.storage, module.loadbalancer, module.monitoring] mariadb_password = var.mariadb_password mariadb_root_password = var.mariadb_root_password @@ -87,23 +94,23 @@ module "database" { cloudflare_domain = var.cloudflare_domain } -module "argocd" { - source = "${path.module}/modules/argocd"
- depends_on = [module.storage, module.loadbalancer, module.cloudflare] +#module "argocd" { +# source = "${path.module}/modules/argocd" +# depends_on = [module.storage, module.loadbalancer, module.cloudflare] - argocd_admin_password = var.argocd_admin_password - cloudflare_domain = var.cloudflare_domain -} +# argocd_admin_password = var.argocd_admin_password +# cloudflare_domain = var.cloudflare_domain +#} -module "redis" { - source = "${path.module}/modules/redis" - depends_on = [module.storage] - cloudflare_base_domain = var.cloudflare_domain -} +#module "redis" { +# source = "${path.module}/modules/redis" +# depends_on = [module.storage] +# cloudflare_base_domain = var.cloudflare_domain +#} module "rabbitmq" { source = "${path.module}/modules/rabbitmq" - depends_on = [module.storage] + depends_on = [module.database] base_domain = var.cloudflare_domain rabbitmq-password = var.rabbitmq-password } diff --git a/tofu/modules/cloudflare/cluster-tunnel.yaml b/tofu/modules/cloudflare/cluster-tunnel.yaml index dbae0fa..7a89362 100644 --- a/tofu/modules/cloudflare/cluster-tunnel.yaml +++ b/tofu/modules/cloudflare/cluster-tunnel.yaml @@ -1,10 +1,10 @@ apiVersion: networking.cfargotunnel.com/v1alpha2 kind: ClusterTunnel metadata: - name: cluster-tunnel # The ClusterTunnel Custom Resource Name + name: cluster-tunnel spec: newTunnel: - name: ${cloudflare_tunnel_name} # Name of your new tunnel on Cloudflare + name: ${cloudflare_tunnel_name} cloudflare: email: ${cloudflare_email} domain: ${cloudflare_domain} diff --git a/tofu/modules/cloudflare/main.tf b/tofu/modules/cloudflare/main.tf index a58890a..6d7dfa2 100644 --- a/tofu/modules/cloudflare/main.tf +++ b/tofu/modules/cloudflare/main.tf @@ -41,10 +41,10 @@ resource "kubectl_manifest" "cloudflare-api-token" { resource "kubectl_manifest" "cloudflare-tunnel" { yaml_body = templatefile("${path.module}/cluster-tunnel.yaml", { cloudflare_tunnel_name = var.cloudflare_tunnel_name - cloudflare_email = var.cloudflare_email - 
cloudflare_domain = var.cloudflare_domain - cloudflare_account_id = var.cloudflare_account_id + cloudflare_email = var.cloudflare_email + cloudflare_domain = var.cloudflare_domain + cloudflare_account_id = var.cloudflare_account_id }) depends_on = [kustomization_resource.cloudflare] -} \ No newline at end of file +} diff --git a/tofu/modules/maxscale/charts/maxscale-helm/Chart.yaml b/tofu/modules/maxscale/charts/maxscale-helm/Chart.yaml index f23ea6c..15afe88 100644 --- a/tofu/modules/maxscale/charts/maxscale-helm/Chart.yaml +++ b/tofu/modules/maxscale/charts/maxscale-helm/Chart.yaml @@ -1,4 +1,4 @@ apiVersion: v2 name: maxscale-helm -version: 1.0.2 +version: 1.0.7 description: Helm chart for MaxScale related Kubernetes manifests diff --git a/tofu/modules/maxscale/charts/maxscale-helm/templates/config.yaml b/tofu/modules/maxscale/charts/maxscale-helm/templates/config.yaml index f6e060e..f7f8d9f 100644 --- a/tofu/modules/maxscale/charts/maxscale-helm/templates/config.yaml +++ b/tofu/modules/maxscale/charts/maxscale-helm/templates/config.yaml @@ -54,6 +54,12 @@ spec: metrics: enabled: true + serviceMonitor: + enabled: true + interval: 30s + scrapeTimeout: 10s + prometheusRelease: kube-prometheus-stack + jobLabel: mariadb-monitoring tls: enabled: true @@ -106,7 +112,17 @@ spec: key: dsn affinity: - antiAffinityEnabled: true + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - mariadb-repl + topologyKey: kubernetes.io/hostname tolerations: - key: "k8s.mariadb.com/ha" @@ -149,6 +165,12 @@ spec: metrics: enabled: true + serviceMonitor: + enabled: true + interval: 30s + scrapeTimeout: 10s + prometheusRelease: kube-prometheus-stack + jobLabel: mariadb-monitoring tls: enabled: true diff --git a/tofu/modules/maxscale/charts/maxscale-helm/templates/phpmyadmin-deployment.yaml 
b/tofu/modules/maxscale/charts/maxscale-helm/templates/phpmyadmin-deployment.yaml index b66ee27..4a0156c 100644 --- a/tofu/modules/maxscale/charts/maxscale-helm/templates/phpmyadmin-deployment.yaml +++ b/tofu/modules/maxscale/charts/maxscale-helm/templates/phpmyadmin-deployment.yaml @@ -33,7 +33,7 @@ spec: value: "3306" - name: PHPMYADMIN_ALLOW_NO_PASSWORD value: "false" - image: "docker.io/bitnami/phpmyadmin:5.2.2" + image: "bitnamilegacy/phpmyadmin:5.2.2" imagePullPolicy: IfNotPresent livenessProbe: failureThreshold: 3 diff --git a/tofu/modules/maxscale/main.tf b/tofu/modules/maxscale/main.tf index 6356195..2b44909 100644 --- a/tofu/modules/maxscale/main.tf +++ b/tofu/modules/maxscale/main.tf @@ -58,7 +58,7 @@ resource "helm_release" "mariadb-operator" { resource "helm_release" "maxscale_helm" { name = "maxscale-helm" chart = "${path.module}/charts/maxscale-helm" - version = "1.0.2" + version = "1.0.7" depends_on = [ helm_release.mariadb-operator-crds, kubectl_manifest.secrets ] timeout = 3600 diff --git a/tofu/modules/prometheus/grafana-ui.yaml b/tofu/modules/prometheus/grafana-ui.yaml new file mode 100644 index 0000000..8ccefb4 --- /dev/null +++ b/tofu/modules/prometheus/grafana-ui.yaml @@ -0,0 +1,14 @@ +apiVersion: networking.cfargotunnel.com/v1alpha1 +kind: TunnelBinding +metadata: + name: grafana-tunnel-binding + namespace: monitoring +subjects: + - name: grafana + spec: + target: http://kube-prometheus-stack-grafana.monitoring.svc.cluster.local + fqdn: grafana.${base_domain} + noTlsVerify: true +tunnelRef: + kind: ClusterTunnel + name: cluster-tunnel \ No newline at end of file diff --git a/tofu/modules/prometheus/main.tf b/tofu/modules/prometheus/main.tf new file mode 100644 index 0000000..e943cfa --- /dev/null +++ b/tofu/modules/prometheus/main.tf @@ -0,0 +1,66 @@ +terraform { + required_providers { + kubectl = { + source = "gavinbunney/kubectl" + version = "1.19.0" + } + helm = { + source = "hashicorp/helm" + version = "3.0.2" + } + kubernetes = { + 
source = "hashicorp/kubernetes" + version = "2.38.0" + } + kustomization = { + source = "kbst/kustomization" + version = "0.9.6" + } + time = { + source = "hashicorp/time" + version = "0.13.1" + } + } +} + +# Create namespace for monitoring +resource "kubernetes_namespace" "monitoring" { + metadata { + name = "monitoring" + labels = { + "pod-security.kubernetes.io/enforce" = "privileged" + } + } +} + +# Deploy kube-prometheus-stack +resource "helm_release" "kube_prometheus_stack" { + name = "kube-prometheus-stack" + repository = "https://prometheus-community.github.io/helm-charts" + chart = "kube-prometheus-stack" + namespace = kubernetes_namespace.monitoring.metadata[0].name + version = "67.2.1" # Check for latest version + + # Wait for CRDs to be created + wait = true + timeout = 600 + force_update = false + recreate_pods = false + + # Reference the values file + values = [ + file("${path.module}/values.yaml") + ] + + depends_on = [ + kubernetes_namespace.monitoring + ] +} + +resource "kubectl_manifest" "grafana-tunnel-bind" { + depends_on = [helm_release.kube_prometheus_stack] + # Renders the Grafana TunnelBinding (grafana-ui.yaml) with the module's base domain + yaml_body = templatefile("${path.module}/grafana-ui.yaml", { + base_domain = var.cloudflare_domain + }) +} diff --git a/tofu/modules/prometheus/values.yaml b/tofu/modules/prometheus/values.yaml new file mode 100644 index 0000000..a75f89b --- /dev/null +++ b/tofu/modules/prometheus/values.yaml @@ -0,0 +1,189 @@ +# Prometheus configuration +prometheus: + prometheusSpec: + retention: 30d + retentionSize: "45GB" + + # Storage configuration + storageSpec: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + # storageClassName: "your-storage-class" # Uncomment and specify if needed + + # Resource limits + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: 2000m + memory: 4Gi + + # Scrape interval + scrapeInterval: 30s + evaluationInterval: 30s + + # Service configuration + service: + type: ClusterIP + port: 9090 + + #
Ingress (disabled by default) + ingress: + enabled: false + # ingressClassName: nginx + # hosts: + # - prometheus.example.com + # tls: + # - secretName: prometheus-tls + # hosts: + # - prometheus.example.com + +# Grafana configuration +grafana: + enabled: true + + # Admin credentials + adminPassword: "" # No fixed credential: Grafana is published via the tunnel. NOTE(review): empty is expected to make the Grafana sub-chart generate a random password stored in its Secret - confirm for the pinned chart version + + # Persistence + persistence: + enabled: true + size: 10Gi + # storageClassName: "your-storage-class" # Uncomment and specify if needed + + # Resource limits + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + # Service configuration + service: + type: ClusterIP + port: 80 + + # Ingress (disabled by default) + ingress: + enabled: false + # ingressClassName: nginx + # hosts: + # - grafana.example.com + # tls: + # - secretName: grafana-tls + # hosts: + # - grafana.example.com + + # Default dashboards + defaultDashboardsEnabled: true + defaultDashboardsTimezone: Europe/Prague + +# Alertmanager configuration +alertmanager: + enabled: true + + alertmanagerSpec: + # Storage configuration + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + # storageClassName: "your-storage-class" # Uncomment and specify if needed + + # Resource limits + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + + # Service configuration + service: + type: ClusterIP + port: 9093 + + # Ingress (disabled by default) + ingress: + enabled: false + # ingressClassName: nginx + # hosts: + # - alertmanager.example.com + # tls: + # - secretName: alertmanager-tls + # hosts: + # - alertmanager.example.com + + # Alertmanager configuration + config: + global: + resolve_timeout: 5m + + route: + group_by: [ 'alertname', 'cluster', 'service' ] + group_wait: 10s + group_interval: 10s + repeat_interval: 12h + receiver: 'null' + routes: + - match: + alertname: Watchdog + receiver: 'null' + + receivers: + - name: 'null' + # Add your
receivers here (email, slack, pagerduty, etc.) + # - name: 'slack' + # slack_configs: + # - api_url: 'YOUR_SLACK_WEBHOOK_URL' + # channel: '#alerts' + # title: '{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}' + # text: '{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}' + +# Node Exporter +nodeExporter: + enabled: true + +# Kube State Metrics +kubeStateMetrics: + enabled: true + +# Prometheus Operator +prometheusOperator: + enabled: true + + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + +# Service Monitors +# Automatically discover and monitor services with appropriate labels +prometheus-node-exporter: + prometheus: + monitor: + enabled: true + +# Additional ServiceMonitors can be defined here +# additionalServiceMonitors: [] + +# Global settings +global: + rbac: + create: true \ No newline at end of file diff --git a/tofu/modules/prometheus/variables.tf b/tofu/modules/prometheus/variables.tf new file mode 100644 index 0000000..a40343b --- /dev/null +++ b/tofu/modules/prometheus/variables.tf @@ -0,0 +1,5 @@ +variable "cloudflare_domain" { + type = string + description = "Base cloudflare domain, e.g. example.com" + nullable = false +} diff --git a/tofu/modules/rabbitmq/main.tf b/tofu/modules/rabbitmq/main.tf index 36a39e2..5276f42 100644 --- a/tofu/modules/rabbitmq/main.tf +++ b/tofu/modules/rabbitmq/main.tf @@ -65,7 +65,11 @@ resource "helm_release" "rabbitmq" { { name = "podAntiAffinityPreset" value = "soft" - } + }, + { + name = "image.repository" + value = "bitnamilegacy/rabbitmq" + }, ] }