This commit introduces comprehensive documentation and implementation for multi-agent orchestration capabilities:

- Add IMPLEMENTATION-PLAN.md with 4-phase roadmap
- Add Kubernetes deployment manifests (deployment.yaml, hpa.yaml)
- Add database migrations for multi-agent tables (6.1.1, 6.1.2)
- Implement A2A protocol for agent-to-agent communication
- Implement user memory keywords for cross-session persistence
- Implement model routing for dynamic L
331 lines
8.8 KiB
YAML
# General Bots Kubernetes HorizontalPodAutoscaler Configuration
#
# This file contains autoscaling configurations for General Bots components.
#
# Usage:
#   kubectl apply -f hpa.yaml
#
# Prerequisites:
#   - Metrics Server installed in cluster
#   - deployment.yaml already applied
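#
# A quick way to confirm the prerequisites and watch the autoscalers once
# applied (this assumes Metrics Server runs in kube-system, its default
# namespace):
#
#   kubectl get deployment metrics-server -n kube-system
#   kubectl get hpa -n generalbots
#   kubectl top pods -n generalbots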

---
# HPA for botserver - scales based on CPU and memory
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: botserver-hpa
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: botserver
  minReplicas: 3
  maxReplicas: 20
  metrics:
    # Scale based on CPU utilization
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

    # Scale based on memory utilization
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

    # Scale based on requests per second (requires custom metrics)
    # Uncomment if using Prometheus Adapter
    # - type: Pods
    #   pods:
    #     metric:
    #       name: http_requests_per_second
    #     target:
    #       type: AverageValue
    #       averageValue: 100
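
  # For the commented Pods metric above to resolve, the Prometheus Adapter
  # needs a rule that derives http_requests_per_second from a counter. A
  # minimal sketch of that adapter rule, assuming botserver exports a counter
  # named http_requests_total (this goes in the adapter's own config, not in
  # this file):
  #
  #   rules:
  #     - seriesQuery: 'http_requests_total{namespace!="",pod!=""}'
  #       resources:
  #         overrides:
  #           namespace: {resource: "namespace"}
  #           pod: {resource: "pod"}
  #       name:
  #         matches: "^(.*)_total$"
  #         as: "${1}_per_second"
  #       metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>}[2m])) by (<<.GroupBy>>)'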

  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300  # 5 minutes cooldown before scaling down
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
        - type: Pods
          value: 2
          periodSeconds: 60
      selectPolicy: Min  # Use the most conservative policy

    scaleUp:
      stabilizationWindowSeconds: 60  # 1 minute before scaling up
      policies:
        - type: Percent
          value: 100
          periodSeconds: 30
        - type: Pods
          value: 4
          periodSeconds: 30
      selectPolicy: Max  # Scale up aggressively when needed
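
# Worked example of the behavior above: at 10 replicas, scaleDown removes at
# most min(10% of 10 = 1 pod, 2 pods) = 1 pod per 60s; at 20 replicas it is
# min(2, 2) = 2. On scaleUp from 4 replicas, max(100% = 4 pods, 4 pods) = 4
# pods per 30s, i.e. the deployment can double to 8 in a single step.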

---
# HPA for LLM server - scales based on CPU (inference is CPU/GPU bound)
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-server-hpa
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-server
  minReplicas: 2
  maxReplicas: 10
  metrics:
    # Scale based on CPU utilization
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 60  # Lower threshold for LLM - inference is expensive

    # Scale based on memory utilization
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 75

    # Scale based on inference queue length (requires custom metrics)
    # Uncomment if using Prometheus Adapter
    # - type: Pods
    #   pods:
    #     metric:
    #       name: llm_inference_queue_length
    #     target:
    #       type: AverageValue
    #       averageValue: 5
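
  # How the queue target would drive scaling: the HPA computes
  # desiredReplicas = ceil(currentReplicas * currentValue / targetValue),
  # so 2 replicas averaging 10 queued requests each against the target of 5
  # would be scaled to ceil(2 * 10 / 5) = 4 replicas.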

  behavior:
    scaleDown:
      stabilizationWindowSeconds: 600  # 10 minutes - LLM pods are expensive to recreate
      policies:
        - type: Pods
          value: 1
          periodSeconds: 120
      selectPolicy: Min

    scaleUp:
      stabilizationWindowSeconds: 120  # 2 minutes
      policies:
        - type: Pods
          value: 2
          periodSeconds: 60
      selectPolicy: Max

---
# HPA for embedding server (if deployed separately)
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: embedding-server-hpa
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: embedding-server
  minReplicas: 2
  maxReplicas: 8
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Pods
          value: 1
          periodSeconds: 60
      selectPolicy: Min

    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Pods
          value: 2
          periodSeconds: 30
      selectPolicy: Max

---
# Vertical Pod Autoscaler for botserver (optional - requires VPA installed)
# Automatically adjusts resource requests/limits
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: botserver-vpa
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: vpa
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: botserver
  updatePolicy:
    updateMode: "Auto"  # Options: Off, Initial, Recreate, Auto
  resourcePolicy:
    containerPolicies:
      - containerName: botserver
        minAllowed:
          cpu: 250m
          memory: 512Mi
        maxAllowed:
          cpu: 4000m
          memory: 8Gi
        controlledResources: ["cpu", "memory"]
        controlledValues: RequestsAndLimits
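
# In "Auto" mode the VPA updater applies new requests by evicting pods, so a
# PodDisruptionBudget on botserver keeps that churn bounded. The computed
# recommendations can be reviewed before (or instead of) trusting Auto mode:
#
#   kubectl describe vpa botserver-vpa -n generalbots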

---
# Vertical Pod Autoscaler for LLM server
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: llm-server-vpa
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: vpa
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-server
  updatePolicy:
    updateMode: "Off"  # Manual for LLM - too disruptive to auto-update
  resourcePolicy:
    containerPolicies:
      - containerName: llm-server
        minAllowed:
          cpu: 2000m
          memory: 8Gi
        maxAllowed:
          cpu: 16000m
          memory: 64Gi
        controlledResources: ["cpu", "memory"]
        controlledValues: RequestsOnly  # Only adjust requests, not limits

---
# Custom metrics for HPA (requires Prometheus + Prometheus Adapter)
# This ServiceMonitor tells Prometheus to scrape botserver metrics
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: botserver-metrics
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: monitoring
spec:
  selector:
    matchLabels:
      app: botserver
  endpoints:
    - port: metrics
      interval: 30s
      path: /metrics
  namespaceSelector:
    matchNames:
      - generalbots
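
# A ServiceMonitor selects a Service, not pods, so the botserver Service must
# carry the app: botserver label and expose a port literally named "metrics".
# A minimal sketch of the matching Service port entry (the port number here is
# an assumption - use whatever deployment.yaml actually exposes):
#
#   ports:
#     - name: metrics
#       port: 9090
#       targetPort: 9090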

---
# PrometheusRule for alerting on scaling events
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: botserver-scaling-alerts
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: alerts
spec:
  groups:
    - name: botserver-scaling
      rules:
        # Alert when approaching max replicas
        - alert: BotserverNearMaxReplicas
          expr: |
            kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"}
            / kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="botserver-hpa"}
            > 0.8
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Botserver near maximum replicas"
            description: "Botserver HPA is at {{ $value | humanizePercentage }} of max replicas"
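
        # With maxReplicas: 20, the > 0.8 ratio fires once the HPA holds 17
        # or more replicas for 5 minutes (16/20 is exactly 0.8 and does not
        # trip the strict comparison).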

        # Alert when at max replicas
        - alert: BotserverAtMaxReplicas
          expr: |
            kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"}
            == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="botserver-hpa"}
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Botserver at maximum replicas"
            description: "Botserver HPA has been at max replicas for 10 minutes - consider increasing maxReplicas"

        # Alert on rapid scaling
        - alert: BotserverRapidScaling
          expr: |
            increase(kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"}[10m])
            > 5
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: "Botserver scaling rapidly"
            description: "Botserver has scaled by {{ $value }} replicas in 10 minutes"

        # Alert on LLM server max replicas
        - alert: LLMServerAtMaxReplicas
          expr: |
            kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="llm-server-hpa"}
            == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="llm-server-hpa"}
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "LLM Server at maximum replicas"
            description: "LLM Server HPA is at max - inference capacity may be constrained"