fix(infrastructure): prometheus

This commit is contained in:
2025-10-05 01:23:22 +02:00
parent 8edaaee117
commit 9cbe121b11
12 changed files with 332 additions and 25 deletions

View File

@@ -0,0 +1,14 @@
# Publishes the in-cluster Grafana service through the cluster-wide
# Cloudflare tunnel. This file is rendered as a template before it is applied;
# the base_domain placeholder in `fqdn` is filled in at render time.
apiVersion: networking.cfargotunnel.com/v1alpha1
kind: TunnelBinding
metadata:
  name: grafana-tunnel-binding
  namespace: monitoring
subjects:
  - name: grafana
    spec:
      # In-cluster address of the Grafana service.
      target: http://kube-prometheus-stack-grafana.monitoring.svc.cluster.local
      # Quoted so the rendered hostname is always read as a string.
      fqdn: "grafana.${base_domain}"
      # NOTE(review): the target is plain HTTP, so this likely has no
      # effect - confirm whether it is still needed.
      noTlsVerify: true
tunnelRef:
  kind: ClusterTunnel
  name: cluster-tunnel

View File

@@ -0,0 +1,66 @@
# Provider requirements for this module. Every provider is pinned to an exact
# version, so upgrades are always an explicit change.
# NOTE(review): kustomization and time are not referenced in the part of the
# module visible here - confirm they are used elsewhere before removing them.
terraform {
  required_providers {
    helm = {
      source  = "hashicorp/helm"
      version = "3.0.2"
    }

    kubectl = {
      source  = "gavinbunney/kubectl"
      version = "1.19.0"
    }

    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "2.38.0"
    }

    kustomization = {
      source  = "kbst/kustomization"
      version = "0.9.6"
    }

    time = {
      source  = "hashicorp/time"
      version = "0.13.1"
    }
  }
}
# Namespace that hosts the monitoring stack.
resource "kubernetes_namespace" "monitoring" {
  metadata {
    name = "monitoring"

    # Pod Security Admission: enforce the "privileged" level for this namespace.
    # NOTE(review): presumably needed for node-exporter's host access - confirm.
    labels = {
      "pod-security.kubernetes.io/enforce" = "privileged"
    }
  }
}
# Installs the kube-prometheus-stack Helm chart into the monitoring namespace.
resource "helm_release" "kube_prometheus_stack" {
  name       = "kube-prometheus-stack"
  namespace  = kubernetes_namespace.monitoring.metadata[0].name
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = "67.2.1" # Pinned; review periodically for newer releases.

  # Chart overrides are kept in values.yaml next to this module.
  values = [
    file("${path.module}/values.yaml"),
  ]

  # Wait for the release's resources to become ready, up to 600 seconds.
  wait    = true
  timeout = 600

  # Neither force resource updates nor restart pods on upgrade.
  force_update  = false
  recreate_pods = false

  # Explicit ordering; the namespace reference above already implies it.
  depends_on = [
    kubernetes_namespace.monitoring,
  ]
}
# Applies the manifest rendered from grafana-ui.yaml once the Helm release
# exists. The Cloudflare base domain is passed in as `base_domain`.
# NOTE(review): the resource label says "argocd" although the manifest is for
# Grafana; it is kept because renaming would change the state address.
resource "kubectl_manifest" "argocd-tunnel-bind" {
  yaml_body = templatefile("${path.module}/grafana-ui.yaml", {
    base_domain = var.cloudflare_domain
  })

  depends_on = [helm_release.kube_prometheus_stack]
}

View File

@@ -0,0 +1,189 @@
# Prometheus configuration
prometheus:
  prometheusSpec:
    # Retention is bounded by age and by size; the size cap is kept below
    # the 50Gi volume requested in storageSpec.
    retention: 30d
    retentionSize: "45GB"
    # Storage configuration
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 50Gi
          # Uncomment and specify if needed
          # storageClassName: "your-storage-class"
    # Resource limits
    resources:
      requests:
        cpu: 500m
        memory: 2Gi
      limits:
        cpu: 2000m
        memory: 4Gi
    # Scrape and rule-evaluation intervals
    scrapeInterval: 30s
    evaluationInterval: 30s
  # Service configuration
  service:
    type: ClusterIP
    port: 9090
  # Ingress (disabled by default)
  ingress:
    enabled: false
    # ingressClassName: nginx
    # hosts:
    #   - prometheus.example.com
    # tls:
    #   - secretName: prometheus-tls
    #     hosts:
    #       - prometheus.example.com
# Grafana configuration
grafana:
  enabled: true
  # Admin credentials
  # WARNING: placeholder password - CHANGE THIS IN PRODUCTION!
  # NOTE(review): this Grafana is published through the Cloudflare
  # TunnelBinding, so a well-known password is reachable from outside the
  # cluster. Prefer sourcing the credentials from a Kubernetes secret.
  adminPassword: "admin"
  # Persistence
  persistence:
    enabled: true
    size: 10Gi
    # Uncomment and specify if needed
    # storageClassName: "your-storage-class"
  # Resource limits
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi
  # Service configuration
  service:
    type: ClusterIP
    port: 80
  # Ingress (disabled by default)
  ingress:
    enabled: false
    # ingressClassName: nginx
    # hosts:
    #   - grafana.example.com
    # tls:
    #   - secretName: grafana-tls
    #     hosts:
    #       - grafana.example.com
  # Default dashboards
  defaultDashboardsEnabled: true
  defaultDashboardsTimezone: Europe/Prague
# Alertmanager configuration
alertmanager:
  enabled: true
  alertmanagerSpec:
    # Storage configuration
    storage:
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 10Gi
          # Uncomment and specify if needed
          # storageClassName: "your-storage-class"
    # Resource limits
    resources:
      requests:
        cpu: 100m
        memory: 128Mi
      limits:
        cpu: 500m
        memory: 256Mi
  # Service configuration
  service:
    type: ClusterIP
    port: 9093
  # Ingress (disabled by default)
  ingress:
    enabled: false
    # ingressClassName: nginx
    # hosts:
    #   - alertmanager.example.com
    # tls:
    #   - secretName: alertmanager-tls
    #     hosts:
    #       - alertmanager.example.com
  # Alertmanager routing configuration
  config:
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname', 'cluster', 'service']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      # Every alert goes to the 'null' receiver until a real receiver is
      # configured below.
      receiver: 'null'
      routes:
        # `matchers` replaces the deprecated `match` field.
        - matchers:
            - 'alertname = "Watchdog"'
          receiver: 'null'
    receivers:
      - name: 'null'
      # Add your receivers here (email, slack, pagerduty, etc.)
      # - name: 'slack'
      #   slack_configs:
      #     - api_url: 'YOUR_SLACK_WEBHOOK_URL'
      #       channel: '#alerts'
      #       title: '{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}'
      #       text: '{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}'
# Node Exporter
nodeExporter:
  enabled: true

# Kube State Metrics
kubeStateMetrics:
  enabled: true
# Prometheus Operator
prometheusOperator:
  enabled: true
  # Resource requests and limits for the operator
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 500m
      memory: 256Mi
# Service Monitors
# Enable the monitor of the prometheus-node-exporter subchart so its metrics
# are discovered automatically.
prometheus-node-exporter:
  prometheus:
    monitor:
      enabled: true
# Additional ServiceMonitors are defined on the top-level `prometheus` key,
# i.e. as `prometheus.additionalServiceMonitors: []`, not in this section.
# Global settings
global:
  rbac:
    # Let the chart create its RBAC resources.
    create: true

View File

@@ -0,0 +1,5 @@
# Base Cloudflare domain passed to the tunnel-binding template as
# `base_domain`.
# The help text used to be set as `default`, so an unset variable silently
# produced an invalid hostname. It is now the description, and the variable
# has no default: callers must supply a real domain.
variable "cloudflare_domain" {
  type        = string
  description = "Base cloudflare domain, e.g. example.com"
  nullable    = false
}