From e8a400d86dcb91267f6de519c661689fed7d5e14 Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Fri, 6 Feb 2026 09:26:44 -0300 Subject: [PATCH] Remove Kubernetes deployment, add embed-ui deployment documentation - Removed Kubernetes deployment files (focus on embed-ui feature instead) - Added deploy-ui.sh script for manual UI file deployment - Added deploy/README.md with comprehensive deployment guide - Updated README.md with embed-ui feature explanation - Simplified deployment: embed-ui feature creates self-contained binary --- .forgejo/workflows/botserver.yaml | 2 +- deploy/README.md | 214 ++++++++++++ deploy/deploy-ui.sh | 16 + deploy/kubernetes/deployment.yaml | 539 ------------------------------ deploy/kubernetes/hpa.yaml | 331 ------------------ 5 files changed, 231 insertions(+), 871 deletions(-) create mode 100644 deploy/README.md create mode 100644 deploy/deploy-ui.sh delete mode 100644 deploy/kubernetes/deployment.yaml delete mode 100644 deploy/kubernetes/hpa.yaml diff --git a/.forgejo/workflows/botserver.yaml b/.forgejo/workflows/botserver.yaml index 92777f305..72c19fb04 100644 --- a/.forgejo/workflows/botserver.yaml +++ b/.forgejo/workflows/botserver.yaml @@ -7,7 +7,7 @@ on: branches: ["main"] env: - CARGO_BUILD_JOBS: 5 + CARGO_BUILD_JOBS: 8 CARGO_NET_RETRY: 10 jobs: diff --git a/deploy/README.md b/deploy/README.md new file mode 100644 index 000000000..f59097029 --- /dev/null +++ b/deploy/README.md @@ -0,0 +1,214 @@ +# Deployment Guide + +## Overview + +This directory contains deployment configurations and scripts for General Bots in production environments. + +## Deployment Methods + +### 1. Traditional Server Deployment + +#### Prerequisites +- Server with Linux (Ubuntu 20.04+ recommended) +- Rust 1.70+ toolchain +- PostgreSQL, Redis, Qdrant installed or managed by botserver +- At least 4GB RAM, 2 CPU cores + +#### Steps + +1. **Build Release Binaries:** +```bash +cargo build --release -p botserver -p botui +``` + +2. 
**Deploy to Production:** +```bash +# Copy binaries +sudo cp target/release/botserver /opt/gbo/bin/ +sudo cp target/release/botui /opt/gbo/bin/ + +# Deploy UI files +./botserver/deploy/deploy-ui.sh /opt/gbo + +# Set permissions +sudo chmod +x /opt/gbo/bin/botserver +sudo chmod +x /opt/gbo/bin/botui +``` + +3. **Configure Environment:** +```bash +# Copy and edit environment file +cp botserver/.env.example /opt/gbo/.env +nano /opt/gbo/.env +``` + +4. **Start Services:** +```bash +# Using systemd (recommended) +sudo systemctl start botserver +sudo systemctl start botui + +# Or manually +/opt/gbo/bin/botserver --noconsole +/opt/gbo/bin/botui +``` + +### 2. Kubernetes Deployment + +#### Prerequisites +- Kubernetes cluster 1.24+ +- kubectl configured +- Persistent volumes provisioned + +#### Steps + +1. **Create Namespace:** +```bash +kubectl create namespace generalbots +``` + +2. **Deploy UI Files:** +```bash +# Create ConfigMap with UI files +kubectl create configmap botui-files \ + --from-file=botui/ui/suite/ \ + -n generalbots +``` + +3. **Apply Deployment** (this repository no longer ships `deploy/kubernetes/deployment.yaml`; apply your own manifest): +```bash +kubectl apply -f path/to/your/deployment.yaml +``` + +4. 
**Verify Deployment:** +```bash +kubectl get pods -n generalbots +kubectl logs -f deployment/botserver -n generalbots +``` + +## Troubleshooting + +### UI Files Not Found Error + +**Symptom:** +``` +Asset 'suite/index.html' not found in embedded binary, falling back to filesystem +Failed to load suite UI: No such file or directory +``` + +**Solution:** + +**For Traditional Deployment:** +```bash +# Run the deployment script +./botserver/deploy/deploy-ui.sh /opt/gbo + +# Verify files exist +ls -la /opt/gbo/bin/ui/suite/index.html +``` + +**For Kubernetes:** +```bash +# Recreate UI ConfigMap +kubectl delete configmap botui-files -n generalbots +kubectl create configmap botui-files \ + --from-file=botui/ui/suite/ \ + -n generalbots + +# Restart pods +kubectl rollout restart deployment/botserver -n generalbots +``` + +### Port Already in Use + +```bash +# Force-kill (SIGKILL) the process listening on each port +lsof -ti:8088 | xargs kill -9 +lsof -ti:3000 | xargs kill -9 +``` + +### Permission Denied + +```bash +# Fix ownership and permissions +sudo chown -R gbo:gbo /opt/gbo +sudo chmod -R 755 /opt/gbo/bin +``` + +## Maintenance + +### Update UI Files + +**Traditional:** +```bash +./botserver/deploy/deploy-ui.sh /opt/gbo +sudo systemctl restart botui +``` + +**Kubernetes:** +```bash +kubectl create configmap botui-files \ + --from-file=botui/ui/suite/ \ + -n generalbots \ + --dry-run=client -o yaml | kubectl apply -f - +kubectl rollout restart deployment/botserver -n generalbots +``` + +### Update Binaries + +1. Build new release +2. Stop services +3. Replace binaries +4. 
#!/bin/bash
#
# deploy-ui.sh - copy the botui "suite" UI assets into a deployment tree.
#
# Usage:
#   deploy-ui.sh [DEPLOY_DIR]
#
# Arguments:
#   DEPLOY_DIR  Installation root (default: /opt/gbo). The assets are copied
#               into DEPLOY_DIR/bin/ui/suite, which is created when missing.
#
# The assets are read from botui/ui/suite, resolved two directories above the
# directory that contains this script.
#
# Exit status:
#   0 on success; non-zero when the source directory cannot be found or a
#   filesystem operation fails.

set -euo pipefail

DEPLOY_DIR="${1:-/opt/gbo}"
# Resolve to an absolute path so error messages are unambiguous; under
# `set -e` a failing `cd` here aborts the script.
SRC_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"

readonly UI_SRC="$SRC_DIR/botui/ui/suite"
readonly UI_DEST="$DEPLOY_DIR/bin/ui/suite"

# Fail early with a clear message instead of letting cp complain about an
# unexpanded glob or a missing path.
if [[ ! -d "$UI_SRC" ]]; then
  printf 'Error: UI source directory not found: %s\n' "$UI_SRC" >&2
  exit 1
fi

echo "Deploying UI files to $DEPLOY_DIR"

mkdir -p -- "$UI_DEST"

# Copy the directory contents through "/." rather than a "/*" glob: the glob
# skips dotfiles and is handed to cp literally when the directory is empty.
cp -R -- "$UI_SRC/." "$UI_DEST/"

echo "UI files deployed successfully"
echo "Location: $UI_DEST"
# Preview only. With pipefail, ls can be reported as failed (SIGPIPE) when
# head stops reading early; that must not fail a completed deployment.
ls -la -- "$UI_DEST" | head -n 20 || true
-# -# Usage: -# kubectl apply -f deployment.yaml -# -# Prerequisites: -# - Kubernetes cluster 1.24+ -# - kubectl configured -# - Secrets created (see secrets.yaml) -# - PersistentVolumeClaim for data (optional) - ---- -apiVersion: v1 -kind: Namespace -metadata: - name: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: namespace - ---- -# ConfigMap for non-sensitive configuration -apiVersion: v1 -kind: ConfigMap -metadata: - name: botserver-config - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: config -data: - # Server configuration - SERVER_HOST: "0.0.0.0" - SERVER_PORT: "8080" - - # LLM configuration - LLM_SERVER_HOST: "0.0.0.0" - LLM_SERVER_PORT: "8081" - LLM_SERVER_CTX_SIZE: "4096" - LLM_SERVER_N_PREDICT: "1024" - LLM_SERVER_PARALLEL: "6" - LLM_SERVER_CONT_BATCHING: "true" - LLM_CACHE: "true" - LLM_CACHE_TTL: "3600" - - # Embedding configuration - EMBEDDING_PORT: "8082" - - # Multi-agent configuration - A2A_ENABLED: "true" - A2A_TIMEOUT: "30" - A2A_MAX_HOPS: "5" - - # Memory configuration - USER_MEMORY_ENABLED: "true" - USER_MEMORY_MAX_KEYS: "1000" - EPISODIC_MEMORY_ENABLED: "true" - - # Hybrid RAG configuration - RAG_HYBRID_ENABLED: "true" - RAG_DENSE_WEIGHT: "0.7" - RAG_SPARSE_WEIGHT: "0.3" - - # Observability - OBSERVABILITY_ENABLED: "true" - OBSERVABILITY_METRICS_INTERVAL: "60" - - # Sandbox configuration - SANDBOX_RUNTIME: "process" # Use 'lxc' or 'docker' if available - SANDBOX_TIMEOUT: "30" - SANDBOX_MEMORY_MB: "512" - ---- -# Main botserver Deployment -apiVersion: apps/v1 -kind: Deployment -metadata: - name: botserver - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: botserver - app.kubernetes.io/version: "6.1.1" -spec: - replicas: 3 - selector: - matchLabels: - app: botserver - strategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 1 - maxUnavailable: 0 - template: - metadata: - labels: - app: 
botserver - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: botserver - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "9090" - prometheus.io/path: "/metrics" - spec: - serviceAccountName: botserver - securityContext: - runAsNonRoot: true - runAsUser: 1000 - fsGroup: 1000 - - # Init container to wait for dependencies - initContainers: - - name: wait-for-postgres - image: busybox:1.35 - command: ['sh', '-c', 'until nc -z postgres-service 5432; do echo waiting for postgres; sleep 2; done'] - - name: wait-for-qdrant - image: busybox:1.35 - command: ['sh', '-c', 'until nc -z qdrant-service 6333; do echo waiting for qdrant; sleep 2; done'] - - containers: - - name: botserver - image: generalbots/botserver:latest - imagePullPolicy: Always - ports: - - name: http - containerPort: 8080 - protocol: TCP - - name: metrics - containerPort: 9090 - protocol: TCP - - envFrom: - - configMapRef: - name: botserver-config - - env: - - name: DATABASE_URL - valueFrom: - secretKeyRef: - name: botserver-secrets - key: database-url - - name: QDRANT_URL - valueFrom: - secretKeyRef: - name: botserver-secrets - key: qdrant-url - - name: LLM_KEY - valueFrom: - secretKeyRef: - name: botserver-secrets - key: llm-api-key - optional: true - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - resources: - requests: - memory: "512Mi" - cpu: "250m" - limits: - memory: "2Gi" - cpu: "2000m" - - livenessProbe: - httpGet: - path: /health - port: http - initialDelaySeconds: 30 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 3 - - readinessProbe: - httpGet: - path: /ready - port: http - initialDelaySeconds: 10 - periodSeconds: 5 - timeoutSeconds: 3 - failureThreshold: 3 - - startupProbe: - httpGet: - path: /health - port: http - initialDelaySeconds: 10 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 30 - - volumeMounts: - - name: data - 
mountPath: /data - - name: models - mountPath: /models - readOnly: true - - name: gbai-packages - mountPath: /packages - - volumes: - - name: data - persistentVolumeClaim: - claimName: botserver-data - - name: models - persistentVolumeClaim: - claimName: llm-models - - name: gbai-packages - persistentVolumeClaim: - claimName: gbai-packages - - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - botserver - topologyKey: kubernetes.io/hostname - - topologySpreadConstraints: - - maxSkew: 1 - topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: ScheduleAnyway - labelSelector: - matchLabels: - app: botserver - ---- -# LLM Server Deployment (for local model inference) -apiVersion: apps/v1 -kind: Deployment -metadata: - name: llm-server - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: llm-server -spec: - replicas: 2 - selector: - matchLabels: - app: llm-server - template: - metadata: - labels: - app: llm-server - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: llm-server - spec: - containers: - - name: llm-server - image: generalbots/llm-server:latest - imagePullPolicy: Always - ports: - - name: http - containerPort: 8081 - protocol: TCP - - env: - - name: MODEL_PATH - value: "/models/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf" - - name: CTX_SIZE - value: "4096" - - name: N_PREDICT - value: "1024" - - name: PARALLEL - value: "6" - - name: CONT_BATCHING - value: "true" - - name: GPU_LAYERS - value: "35" # Adjust based on available GPU memory - - resources: - requests: - memory: "8Gi" - cpu: "2000m" - # Uncomment for GPU support - # nvidia.com/gpu: 1 - limits: - memory: "24Gi" - cpu: "8000m" - # nvidia.com/gpu: 1 - - volumeMounts: - - name: models - mountPath: /models - readOnly: true - - livenessProbe: - httpGet: - path: /health - port: http - 
initialDelaySeconds: 120 - periodSeconds: 30 - timeoutSeconds: 10 - - readinessProbe: - httpGet: - path: /health - port: http - initialDelaySeconds: 60 - periodSeconds: 10 - timeoutSeconds: 5 - - volumes: - - name: models - persistentVolumeClaim: - claimName: llm-models - - # Schedule on nodes with GPU - # nodeSelector: - # nvidia.com/gpu.present: "true" - - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - ---- -# Service for botserver -apiVersion: v1 -kind: Service -metadata: - name: botserver-service - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: service -spec: - type: ClusterIP - selector: - app: botserver - ports: - - name: http - port: 80 - targetPort: 8080 - protocol: TCP - - name: metrics - port: 9090 - targetPort: 9090 - protocol: TCP - ---- -# Service for LLM server -apiVersion: v1 -kind: Service -metadata: - name: llm-server-service - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: llm-service -spec: - type: ClusterIP - selector: - app: llm-server - ports: - - name: http - port: 8081 - targetPort: 8081 - protocol: TCP - ---- -# Headless service for StatefulSet-like DNS (if needed) -apiVersion: v1 -kind: Service -metadata: - name: botserver-headless - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: headless-service -spec: - clusterIP: None - selector: - app: botserver - ports: - - name: http - port: 8080 - targetPort: 8080 - ---- -# Ingress for external access -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: botserver-ingress - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: ingress - annotations: - kubernetes.io/ingress.class: nginx - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/proxy-body-size: "50m" - nginx.ingress.kubernetes.io/proxy-read-timeout: 
"300" - nginx.ingress.kubernetes.io/proxy-send-timeout: "300" - nginx.ingress.kubernetes.io/websocket-services: "botserver-service" - cert-manager.io/cluster-issuer: "letsencrypt-prod" -spec: - tls: - - hosts: - - bot.example.com - secretName: botserver-tls - rules: - - host: bot.example.com - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: botserver-service - port: - number: 80 - ---- -# ServiceAccount -apiVersion: v1 -kind: ServiceAccount -metadata: - name: botserver - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: serviceaccount - ---- -# Role for botserver -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: botserver-role - namespace: generalbots -rules: - - apiGroups: [""] - resources: ["configmaps", "secrets"] - verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list"] - ---- -# RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: botserver-rolebinding - namespace: generalbots -subjects: - - kind: ServiceAccount - name: botserver - namespace: generalbots -roleRef: - kind: Role - name: botserver-role - apiGroup: rbac.authorization.k8s.io - ---- -# PodDisruptionBudget for high availability -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: botserver-pdb - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: pdb -spec: - minAvailable: 2 - selector: - matchLabels: - app: botserver - ---- -# PersistentVolumeClaim for botserver data -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: botserver-data - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: storage -spec: - accessModes: - - ReadWriteMany - storageClassName: standard - resources: - requests: - storage: 50Gi - ---- -# PersistentVolumeClaim for LLM models -apiVersion: v1 -kind: PersistentVolumeClaim 
-metadata: - name: llm-models - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: storage -spec: - accessModes: - - ReadOnlyMany - storageClassName: standard - resources: - requests: - storage: 100Gi - ---- -# PersistentVolumeClaim for .gbai packages -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: gbai-packages - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: storage -spec: - accessModes: - - ReadWriteMany - storageClassName: standard - resources: - requests: - storage: 20Gi diff --git a/deploy/kubernetes/hpa.yaml b/deploy/kubernetes/hpa.yaml deleted file mode 100644 index 44539a90c..000000000 --- a/deploy/kubernetes/hpa.yaml +++ /dev/null @@ -1,331 +0,0 @@ -# General Bots Kubernetes HorizontalPodAutoscaler Configuration -# This file contains autoscaling configurations for General Bots components. -# -# Usage: -# kubectl apply -f hpa.yaml -# -# Prerequisites: -# - Metrics Server installed in cluster -# - deployment.yaml already applied - ---- -# HPA for botserver - scales based on CPU and memory -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: botserver-hpa - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: hpa -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: botserver - minReplicas: 3 - maxReplicas: 20 - metrics: - # Scale based on CPU utilization - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: 70 - - # Scale based on memory utilization - - type: Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: 80 - - # Scale based on requests per second (requires custom metrics) - # Uncomment if using Prometheus Adapter - # - type: Pods - # pods: - # metric: - # name: http_requests_per_second - # target: - # type: AverageValue - # averageValue: 100 - - behavior: - 
scaleDown: - stabilizationWindowSeconds: 300 # 5 minutes cooldown before scaling down - policies: - - type: Percent - value: 10 - periodSeconds: 60 - - type: Pods - value: 2 - periodSeconds: 60 - selectPolicy: Min # Use the most conservative policy - - scaleUp: - stabilizationWindowSeconds: 60 # 1 minute before scaling up - policies: - - type: Percent - value: 100 - periodSeconds: 30 - - type: Pods - value: 4 - periodSeconds: 30 - selectPolicy: Max # Scale up aggressively when needed - ---- -# HPA for LLM server - scales based on CPU (inference is CPU/GPU bound) -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: llm-server-hpa - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: hpa -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: llm-server - minReplicas: 2 - maxReplicas: 10 - metrics: - # Scale based on CPU utilization - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: 60 # Lower threshold for LLM - inference is expensive - - # Scale based on memory utilization - - type: Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: 75 - - # Scale based on inference queue length (requires custom metrics) - # Uncomment if using Prometheus Adapter - # - type: Pods - # pods: - # metric: - # name: llm_inference_queue_length - # target: - # type: AverageValue - # averageValue: 5 - - behavior: - scaleDown: - stabilizationWindowSeconds: 600 # 10 minutes - LLM pods are expensive to recreate - policies: - - type: Pods - value: 1 - periodSeconds: 120 - selectPolicy: Min - - scaleUp: - stabilizationWindowSeconds: 120 # 2 minutes - policies: - - type: Pods - value: 2 - periodSeconds: 60 - selectPolicy: Max - ---- -# HPA for embedding server (if deployed separately) -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: embedding-server-hpa - namespace: generalbots - labels: - 
app.kubernetes.io/name: generalbots - app.kubernetes.io/component: hpa -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: embedding-server - minReplicas: 2 - maxReplicas: 8 - metrics: - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: 70 - - - type: Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: 80 - - behavior: - scaleDown: - stabilizationWindowSeconds: 300 - policies: - - type: Pods - value: 1 - periodSeconds: 60 - selectPolicy: Min - - scaleUp: - stabilizationWindowSeconds: 60 - policies: - - type: Pods - value: 2 - periodSeconds: 30 - selectPolicy: Max - ---- -# Vertical Pod Autoscaler for botserver (optional - requires VPA installed) -# Automatically adjusts resource requests/limits -apiVersion: autoscaling.k8s.io/v1 -kind: VerticalPodAutoscaler -metadata: - name: botserver-vpa - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: vpa -spec: - targetRef: - apiVersion: apps/v1 - kind: Deployment - name: botserver - updatePolicy: - updateMode: "Auto" # Options: Off, Initial, Recreate, Auto - resourcePolicy: - containerPolicies: - - containerName: botserver - minAllowed: - cpu: 250m - memory: 512Mi - maxAllowed: - cpu: 4000m - memory: 8Gi - controlledResources: ["cpu", "memory"] - controlledValues: RequestsAndLimits - ---- -# Vertical Pod Autoscaler for LLM server -apiVersion: autoscaling.k8s.io/v1 -kind: VerticalPodAutoscaler -metadata: - name: llm-server-vpa - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: vpa -spec: - targetRef: - apiVersion: apps/v1 - kind: Deployment - name: llm-server - updatePolicy: - updateMode: "Off" # Manual for LLM - too disruptive to auto-update - resourcePolicy: - containerPolicies: - - containerName: llm-server - minAllowed: - cpu: 2000m - memory: 8Gi - maxAllowed: - cpu: 16000m - memory: 64Gi - controlledResources: ["cpu", 
"memory"] - controlledValues: RequestsOnly # Only adjust requests, not limits - ---- -# Custom metrics for HPA (requires Prometheus + Prometheus Adapter) -# This ServiceMonitor tells Prometheus to scrape botserver metrics -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: botserver-metrics - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: monitoring -spec: - selector: - matchLabels: - app: botserver - endpoints: - - port: metrics - interval: 30s - path: /metrics - namespaceSelector: - matchNames: - - generalbots - ---- -# PrometheusRule for alerting on scaling events -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: botserver-scaling-alerts - namespace: generalbots - labels: - app.kubernetes.io/name: generalbots - app.kubernetes.io/component: alerts -spec: - groups: - - name: botserver-scaling - rules: - # Alert when approaching max replicas - - alert: BotserverNearMaxReplicas - expr: | - kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"} - / kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="botserver-hpa"} - > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: "Botserver near maximum replicas" - description: "Botserver HPA is at {{ $value | humanizePercentage }} of max replicas" - - # Alert when at max replicas - - alert: BotserverAtMaxReplicas - expr: | - kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"} - == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="botserver-hpa"} - for: 10m - labels: - severity: critical - annotations: - summary: "Botserver at maximum replicas" - description: "Botserver HPA has been at max replicas for 10 minutes - consider increasing max" - - # Alert on rapid scaling - - alert: BotserverRapidScaling - expr: | - 
increase(kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"}[10m]) - > 5 - for: 1m - labels: - severity: warning - annotations: - summary: "Botserver scaling rapidly" - description: "Botserver has scaled by {{ $value }} replicas in 10 minutes" - - # Alert on LLM server max replicas - - alert: LLMServerAtMaxReplicas - expr: | - kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="llm-server-hpa"} - == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="llm-server-hpa"} - for: 5m - labels: - severity: critical - annotations: - summary: "LLM Server at maximum replicas" - description: "LLM Server HPA is at max - inference capacity may be constrained"