fix(infrastructure): prometheus

This commit is contained in:
2025-10-05 01:23:22 +02:00
parent 8edaaee117
commit 9cbe121b11
12 changed files with 332 additions and 25 deletions

View File

@@ -53,7 +53,8 @@ module "loadbalancer" {
} }
module "cert-manager" { module "cert-manager" {
source = "${path.module}/modules/cert-manager" source = "${path.module}/modules/cert-manager"
depends_on = [module.loadbalancer]
} }
module "cloudflare" { module "cloudflare" {
@@ -67,10 +68,16 @@ module "cloudflare" {
cloudflare_account_id = var.cloudflare_account_id cloudflare_account_id = var.cloudflare_account_id
} }
# Monitoring stack, loaded from the local modules/prometheus directory.
module "monitoring" {
  source = "${path.module}/modules/prometheus"

  # Domain forwarded to the module's cloudflare_domain input.
  cloudflare_domain = var.cloudflare_domain

  # Explicit ordering: this module is applied only after module.cloudflare.
  # NOTE(review): presumably needed because the module creates a TunnelBinding
  # that relies on resources installed by the cloudflare module — confirm.
  depends_on = [module.cloudflare]
}
module "database" { module "database" {
source = "${path.module}/modules/maxscale" source = "${path.module}/modules/maxscale"
depends_on = [module.storage, module.loadbalancer, module.cloudflare] depends_on = [module.monitoring]
mariadb_password = var.mariadb_password mariadb_password = var.mariadb_password
mariadb_root_password = var.mariadb_root_password mariadb_root_password = var.mariadb_root_password
@@ -87,23 +94,23 @@ module "database" {
cloudflare_domain = var.cloudflare_domain cloudflare_domain = var.cloudflare_domain
} }
module "argocd" { #module "argocd" {
source = "${path.module}/modules/argocd" # source = "${path.module}/modules/argocd"
depends_on = [module.storage, module.loadbalancer, module.cloudflare] # depends_on = [module.storage, module.loadbalancer, module.cloudflare]
argocd_admin_password = var.argocd_admin_password # argocd_admin_password = var.argocd_admin_password
cloudflare_domain = var.cloudflare_domain # cloudflare_domain = var.cloudflare_domain
} #}
module "redis" { #module "redis" {
source = "${path.module}/modules/redis" # source = "${path.module}/modules/redis"
depends_on = [module.storage] # depends_on = [module.storage]
cloudflare_base_domain = var.cloudflare_domain # cloudflare_base_domain = var.cloudflare_domain
} #}
module "rabbitmq" { module "rabbitmq" {
source = "${path.module}/modules/rabbitmq" source = "${path.module}/modules/rabbitmq"
depends_on = [module.storage] depends_on = [module.database]
base_domain = var.cloudflare_domain base_domain = var.cloudflare_domain
rabbitmq-password = var.rabbitmq-password rabbitmq-password = var.rabbitmq-password
} }

View File

@@ -1,10 +1,10 @@
apiVersion: networking.cfargotunnel.com/v1alpha2 apiVersion: networking.cfargotunnel.com/v1alpha2
kind: ClusterTunnel kind: ClusterTunnel
metadata: metadata:
name: cluster-tunnel # The ClusterTunnel Custom Resource Name name: cluster-tunnel
spec: spec:
newTunnel: newTunnel:
name: ${cloudflare_tunnel_name} # Name of your new tunnel on Cloudflare name: ${cloudflare_tunnel_name}
cloudflare: cloudflare:
email: ${cloudflare_email} email: ${cloudflare_email}
domain: ${cloudflare_domain} domain: ${cloudflare_domain}

View File

@@ -41,9 +41,9 @@ resource "kubectl_manifest" "cloudflare-api-token" {
resource "kubectl_manifest" "cloudflare-tunnel" { resource "kubectl_manifest" "cloudflare-tunnel" {
yaml_body = templatefile("${path.module}/cluster-tunnel.yaml", { yaml_body = templatefile("${path.module}/cluster-tunnel.yaml", {
cloudflare_tunnel_name = var.cloudflare_tunnel_name cloudflare_tunnel_name = var.cloudflare_tunnel_name
cloudflare_email = var.cloudflare_email cloudflare_email = var.cloudflare_email
cloudflare_domain = var.cloudflare_domain cloudflare_domain = var.cloudflare_domain
cloudflare_account_id = var.cloudflare_account_id cloudflare_account_id = var.cloudflare_account_id
}) })
depends_on = [kustomization_resource.cloudflare] depends_on = [kustomization_resource.cloudflare]

View File

@@ -1,4 +1,4 @@
apiVersion: v2 apiVersion: v2
name: maxscale-helm name: maxscale-helm
version: 1.0.2 version: 1.0.7
description: Helm chart for MaxScale related Kubernetes manifests description: Helm chart for MaxScale related Kubernetes manifests

View File

@@ -54,6 +54,12 @@ spec:
metrics: metrics:
enabled: true enabled: true
serviceMonitor:
enabled: true
interval: 30s
scrapeTimeout: 10s
prometheusRelease: kube-prometheus-stack
jobLabel: mariadb-monitoring
tls: tls:
enabled: true enabled: true
@@ -106,7 +112,17 @@ spec:
key: dsn key: dsn
affinity: affinity:
antiAffinityEnabled: true podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app.kubernetes.io/name
operator: In
values:
- mariadb-repl
topologyKey: kubernetes.io/hostname
tolerations: tolerations:
- key: "k8s.mariadb.com/ha" - key: "k8s.mariadb.com/ha"
@@ -149,6 +165,12 @@ spec:
metrics: metrics:
enabled: true enabled: true
serviceMonitor:
enabled: true
interval: 30s
scrapeTimeout: 10s
prometheusRelease: kube-prometheus-stack
jobLabel: mariadb-monitoring
tls: tls:
enabled: true enabled: true

View File

@@ -33,7 +33,7 @@ spec:
value: "3306" value: "3306"
- name: PHPMYADMIN_ALLOW_NO_PASSWORD - name: PHPMYADMIN_ALLOW_NO_PASSWORD
value: "false" value: "false"
image: "docker.io/bitnami/phpmyadmin:5.2.2" image: "bitnamilegacy/phpmyadmin:5.2.2"
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
livenessProbe: livenessProbe:
failureThreshold: 3 failureThreshold: 3

View File

@@ -58,7 +58,7 @@ resource "helm_release" "mariadb-operator" {
resource "helm_release" "maxscale_helm" { resource "helm_release" "maxscale_helm" {
name = "maxscale-helm" name = "maxscale-helm"
chart = "${path.module}/charts/maxscale-helm" chart = "${path.module}/charts/maxscale-helm"
version = "1.0.2" version = "1.0.7"
depends_on = [ helm_release.mariadb-operator-crds, kubectl_manifest.secrets ] depends_on = [ helm_release.mariadb-operator-crds, kubectl_manifest.secrets ]
timeout = 3600 timeout = 3600

View File

@@ -0,0 +1,14 @@
# TunnelBinding that exposes Grafana through the shared ClusterTunnel.
# This file is rendered with Terraform's templatefile(); base_domain is the
# only template variable it expects.
apiVersion: networking.cfargotunnel.com/v1alpha1
kind: TunnelBinding
metadata:
  name: grafana-tunnel-binding
  namespace: monitoring
subjects:
  - name: grafana
    spec:
      # Presumably the Service created by the kube-prometheus-stack release in
      # the monitoring namespace — verify the name against the deployed chart.
      target: http://kube-prometheus-stack-grafana.monitoring.svc.cluster.local
      fqdn: grafana.${base_domain}
      noTlsVerify: true
tunnelRef:
  kind: ClusterTunnel
  name: cluster-tunnel

View File

@@ -0,0 +1,66 @@
# Provider requirements for this module. Every provider is pinned to one exact
# version, so upgrades are always an explicit change to this block.
terraform {
  required_providers {
    helm          = { source = "hashicorp/helm", version = "3.0.2" }
    kubectl       = { source = "gavinbunney/kubectl", version = "1.19.0" }
    kubernetes    = { source = "hashicorp/kubernetes", version = "2.38.0" }
    kustomization = { source = "kbst/kustomization", version = "0.9.6" }
    time          = { source = "hashicorp/time", version = "0.13.1" }
  }
}
# Namespace that holds the monitoring stack.
resource "kubernetes_namespace" "monitoring" {
  metadata {
    name = "monitoring"

    # Pod Security Admission label: enforce the "privileged" level here.
    # NOTE(review): presumably required by host-level exporters in the stack — confirm.
    labels = {
      "pod-security.kubernetes.io/enforce" = "privileged"
    }
  }
}
# Optional override for the Grafana admin password.
# When left at null, the value from values.yaml is used unchanged, so existing
# callers of this module are not affected.
variable "grafana_admin_password" {
  description = "Grafana admin password; overrides grafana.adminPassword from values.yaml when set."
  type        = string
  default     = null
  sensitive   = true
}

# Deploy kube-prometheus-stack into the monitoring namespace.
resource "helm_release" "kube_prometheus_stack" {
  name       = "kube-prometheus-stack"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = "67.2.1" # Check for latest version

  # Referencing the namespace resource already orders this release after it,
  # so no separate depends_on is needed.
  namespace = kubernetes_namespace.monitoring.metadata[0].name

  # Block until the release's resources are ready (up to `timeout` seconds).
  wait    = true
  timeout = 600

  force_update  = false
  recreate_pods = false

  # Base chart values live next to this file.
  values = [
    file("${path.module}/values.yaml")
  ]

  # Inject the admin password from the sensitive variable when provided,
  # keeping it out of the committed values file.
  set_sensitive = var.grafana_admin_password == null ? [] : [
    {
      name  = "grafana.adminPassword"
      value = var.grafana_admin_password
    },
  ]
}
# Applies the manifest rendered from grafana-ui.yaml once the Helm release exists.
# NOTE(review): the resource label says "argocd" although the template is the
# Grafana binding — looks copy-pasted. Renaming changes the resource address in
# state, so it is intentionally left untouched here.
resource "kubectl_manifest" "argocd-tunnel-bind" {
  yaml_body = templatefile("${path.module}/grafana-ui.yaml", {
    base_domain = var.cloudflare_domain
  })

  depends_on = [helm_release.kube_prometheus_stack]
}

View File

@@ -0,0 +1,189 @@
# Prometheus configuration
prometheus:
  prometheusSpec:
    # Keep data for 30 days or until it reaches 45GB, whichever comes first.
    # The size cap sits below the 50Gi volume requested underneath.
    retention: 30d
    retentionSize: "45GB"

    # Storage configuration
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 50Gi
          # storageClassName: "your-storage-class" # Uncomment and specify if needed

    # Resource limits
    resources:
      requests:
        cpu: 500m
        memory: 2Gi
      limits:
        cpu: 2000m
        memory: 4Gi

    # Scrape interval
    scrapeInterval: 30s
    evaluationInterval: 30s

  # Service configuration
  service:
    type: ClusterIP
    port: 9090

  # Ingress (disabled by default)
  ingress:
    enabled: false
    # ingressClassName: nginx
    # hosts:
    #   - prometheus.example.com
    # tls:
    #   - secretName: prometheus-tls
    #     hosts:
    #       - prometheus.example.com
# Grafana configuration
grafana:
  enabled: true

  # Admin credentials
  # NOTE(review): this is a well-known default committed in plain text, and the
  # accompanying TunnelBinding publishes Grafana on a public hostname. Supply
  # the real password from a secret source before exposing this instance.
  adminPassword: "admin" # CHANGE THIS IN PRODUCTION!

  # Persistence
  persistence:
    enabled: true
    size: 10Gi
    # storageClassName: "your-storage-class" # Uncomment and specify if needed

  # Resource limits
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

  # Service configuration
  service:
    type: ClusterIP
    port: 80

  # Ingress (disabled by default)
  ingress:
    enabled: false
    # ingressClassName: nginx
    # hosts:
    #   - grafana.example.com
    # tls:
    #   - secretName: grafana-tls
    #     hosts:
    #       - grafana.example.com

  # Default dashboards
  defaultDashboardsEnabled: true
  defaultDashboardsTimezone: Europe/Prague
# Alertmanager configuration
alertmanager:
  enabled: true

  alertmanagerSpec:
    # Storage configuration
    storage:
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 10Gi
          # storageClassName: "your-storage-class" # Uncomment and specify if needed

    # Resource limits
    resources:
      requests:
        cpu: 100m
        memory: 128Mi
      limits:
        cpu: 500m
        memory: 256Mi

  # Service configuration
  service:
    type: ClusterIP
    port: 9093

  # Ingress (disabled by default)
  ingress:
    enabled: false
    # ingressClassName: nginx
    # hosts:
    #   - alertmanager.example.com
    # tls:
    #   - secretName: alertmanager-tls
    #     hosts:
    #       - alertmanager.example.com

  # Alertmanager configuration
  config:
    global:
      resolve_timeout: 5m
    route:
      group_by: [ 'alertname', 'cluster', 'service' ]
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      # Every alert ends up at the 'null' receiver, i.e. nothing is delivered
      # until a real receiver is added below and referenced here.
      receiver: 'null'
      routes:
        # `matchers` replaces the deprecated `match` field; same selection.
        - matchers:
            - alertname = "Watchdog"
          receiver: 'null'
    receivers:
      - name: 'null'
      # Add your receivers here (email, slack, pagerduty, etc.)
      # - name: 'slack'
      #   slack_configs:
      #     - api_url: 'YOUR_SLACK_WEBHOOK_URL'
      #       channel: '#alerts'
      #       title: '{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}'
      #       text: '{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}'
# Node Exporter
nodeExporter:
  enabled: true
# Kube State Metrics
kubeStateMetrics:
  enabled: true
# Prometheus Operator
prometheusOperator:
  enabled: true

  # Resource requests and limits for the operator itself.
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 500m
      memory: 256Mi
# Service Monitors
# Automatically discover and monitor services with appropriate labels
prometheus-node-exporter:
  prometheus:
    monitor:
      enabled: true

# Additional ServiceMonitors can be defined here
# additionalServiceMonitors: []
# Global settings
global:
  rbac:
    create: true

View File

@@ -0,0 +1,5 @@
# Base domain used to build public hostnames in this module
# (the Grafana binding template renders grafana.<domain> from it).
# The text below used to sit in `default`, which would have made the sentence
# itself the domain whenever a caller omitted the value; it is a description.
variable "cloudflare_domain" {
  description = "Base cloudflare domain, e.g. example.com"
  type        = string
  nullable    = false
}

View File

@@ -65,7 +65,11 @@ resource "helm_release" "rabbitmq" {
{ {
name = "podAntiAffinityPreset" name = "podAntiAffinityPreset"
value = "soft" value = "soft"
} },
{
name = "image.repository"
value = "bitnamilegacy/rabbitmq"
},
] ]
} }