From e8a400d86dcb91267f6de519c661689fed7d5e14 Mon Sep 17 00:00:00 2001
From: "Rodrigo Rodriguez (Pragmatismo)" <me@rodrigorodriguez.com>
Date: Fri, 6 Feb 2026 09:26:44 -0300
Subject: [PATCH] Remove Kubernetes deployment, add embed-ui deployment
 documentation

- Removed Kubernetes deployment files (focus on embed-ui feature instead)
- Added deploy-ui.sh script for manual UI file deployment
- Added deploy/README.md with comprehensive deployment guide
- Updated README.md with embed-ui feature explanation
- Simplified deployment: embed-ui feature creates self-contained binary
- Raised CARGO_BUILD_JOBS from 5 to 8 in the Forgejo workflow
---
 .forgejo/workflows/botserver.yaml |   2 +-
 deploy/README.md                  | 214 ++++++++++++
 deploy/deploy-ui.sh               |  16 +
 deploy/kubernetes/deployment.yaml | 539 ------------------------------
 deploy/kubernetes/hpa.yaml        | 331 ------------------
 5 files changed, 231 insertions(+), 871 deletions(-)
 create mode 100644 deploy/README.md
 create mode 100644 deploy/deploy-ui.sh
 delete mode 100644 deploy/kubernetes/deployment.yaml
 delete mode 100644 deploy/kubernetes/hpa.yaml

diff --git a/.forgejo/workflows/botserver.yaml b/.forgejo/workflows/botserver.yaml
index 92777f305..72c19fb04 100644
--- a/.forgejo/workflows/botserver.yaml
+++ b/.forgejo/workflows/botserver.yaml
@@ -7,7 +7,7 @@ on:
     branches: ["main"]
 
 env:
-  CARGO_BUILD_JOBS: 5
+  CARGO_BUILD_JOBS: 8
   CARGO_NET_RETRY: 10
 
 jobs:
diff --git a/deploy/README.md b/deploy/README.md
new file mode 100644
index 000000000..f59097029
--- /dev/null
+++ b/deploy/README.md
@@ -0,0 +1,214 @@
+# Deployment Guide
+
+## Overview
+
+This directory contains deployment configurations and scripts for General Bots in production environments.
+
+## Deployment Methods
+
+### 1. Traditional Server Deployment
+
+#### Prerequisites
+- Server with Linux (Ubuntu 20.04+ recommended)
+- Rust 1.70+ toolchain
+- PostgreSQL, Redis, Qdrant installed or managed by botserver
+- At least 4GB RAM, 2 CPU cores
+
+#### Steps
+
+1. **Build Release Binaries:**
+```bash
+cargo build --release -p botserver -p botui
+```
+
+2. **Deploy to Production:**
+```bash
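+# Copy binaries (install -D creates /opt/gbo/bin if it does not exist yet)
+sudo install -D -m 755 target/release/botserver /opt/gbo/bin/botserver
+sudo install -D -m 755 target/release/botui /opt/gbo/bin/botui
+
+# Deploy UI files (run via bash: the script is not committed with the executable bit)
+sudo bash botserver/deploy/deploy-ui.sh /opt/gbo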
+
+# Set permissions
+sudo chmod +x /opt/gbo/bin/botserver
+sudo chmod +x /opt/gbo/bin/botui
+```
+
+3. **Configure Environment:**
+```bash
+# Copy and edit environment file
+cp botserver/.env.example /opt/gbo/.env
+nano /opt/gbo/.env
+```
+
+4. **Start Services:**
+```bash
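+# Using systemd (recommended; requires botserver.service and botui.service units to be installed)
+sudo systemctl start botserver
+sudo systemctl start botui
+
+# Or manually (botserver stays in the foreground, so background it before starting botui)
+/opt/gbo/bin/botserver --noconsole &
+/opt/gbo/bin/botui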
+```
+
+### 2. Kubernetes Deployment
+
+#### Prerequisites
+- Kubernetes cluster 1.24+
+- kubectl configured
+- Persistent volumes provisioned
+
+#### Steps
+
+1. **Create Namespace:**
+```bash
+kubectl create namespace generalbots
+```
+
+2. **Deploy UI Files:**
+```bash
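+# Create ConfigMap with UI files (--from-file on a directory packs only its top-level regular files; ConfigMaps are capped at 1 MiB)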
+kubectl create configmap botui-files \
+  --from-file=botui/ui/suite/ \
+  -n generalbots
+```
+
+3. **Apply Deployment** (bring your own manifest; this repository no longer ships `deploy/kubernetes/`):
+```bash
+kubectl apply -f path/to/your/deployment.yaml -n generalbots
+```
+
+4. **Verify Deployment:**
+```bash
+kubectl get pods -n generalbots
+kubectl logs -f deployment/botserver -n generalbots
+```
+
+## Troubleshooting
+
+### UI Files Not Found Error
+
+**Symptom:**
+```
+Asset 'suite/index.html' not found in embedded binary, falling back to filesystem
+Failed to load suite UI: No such file or directory
+```
+
+**Solution:**
+
+**For Traditional Deployment:**
+```bash
+# Run the deployment script (via bash: it is not committed with the executable bit)
+sudo bash botserver/deploy/deploy-ui.sh /opt/gbo
+
+# Verify files exist
+ls -la /opt/gbo/bin/ui/suite/index.html
+```
+
+**For Kubernetes:**
+```bash
+# Recreate UI ConfigMap
+kubectl delete configmap botui-files -n generalbots
+kubectl create configmap botui-files \
+  --from-file=botui/ui/suite/ \
+  -n generalbots
+
+# Restart pods
+kubectl rollout restart deployment/botserver -n generalbots
+```
+
+### Port Already in Use
+
+```bash
+# Stop whatever is listening on the port (sends SIGTERM; add -9 only if the process refuses to exit)
+lsof -ti:8088 | xargs -r kill
+lsof -ti:3000 | xargs -r kill
+```
+
+### Permission Denied
+
+```bash
+# Fix ownership and permissions
+sudo chown -R gbo:gbo /opt/gbo
+sudo chmod -R 755 /opt/gbo/bin
+```
+
+## Maintenance
+
+### Update UI Files
+
+**Traditional:**
+```bash
+sudo bash botserver/deploy/deploy-ui.sh /opt/gbo
+sudo systemctl restart botui
+```
+
+**Kubernetes:**
+```bash
+kubectl create configmap botui-files \
+  --from-file=botui/ui/suite/ \
+  -n generalbots \
+  --dry-run=client -o yaml | kubectl apply -f -
+kubectl rollout restart deployment/botserver -n generalbots
+```
+
+### Update Binaries
+
+1. Build new release
+2. Stop services
+3. Replace binaries
+4. Start services
+
+### Backup
+
+```bash
+# Backup database
+pg_dump -U postgres -d gb > backup.sql
+
+# Backup UI files (if customized)
+tar -czf ui-backup.tar.gz /opt/gbo/bin/ui/
+
+# Backup configuration
+cp /opt/gbo/.env /opt/gbo/.env.backup
+```
+
+## Monitoring
+
+### Check Logs
+
+**Traditional:**
+```bash
+tail -f /opt/gbo/logs/botserver.log
+tail -f /opt/gbo/logs/botui.log
+```
+
+**Kubernetes:**
+```bash
+kubectl logs -f deployment/botserver -n generalbots
+```
+
+### Health Checks
+
+```bash
+# Check server health
+curl http://localhost:8088/health
+
+# Check botui health
+curl http://localhost:3000/health
+```
+
+## Security
+
+- Always use HTTPS in production
+- Rotate secrets regularly
+- Update dependencies monthly
+- Review logs for suspicious activity
+- Use firewall to restrict access
+
+## Support
+
+For issues or questions:
+- Documentation: https://docs.pragmatismo.com.br
+- GitHub Issues: https://github.com/GeneralBots/BotServer/issues
\ No newline at end of file
diff --git a/deploy/deploy-ui.sh b/deploy/deploy-ui.sh
new file mode 100644
index 000000000..1b9876ca3
--- /dev/null
+++ b/deploy/deploy-ui.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Copy the botui suite UI into <deploy-dir>/bin/ui/suite (deploy-dir defaults to /opt/gbo).
+set -eu
+
+DEPLOY_DIR="${1:-/opt/gbo}"
+# botui/ is expected two levels above this script's directory.
+UI_SRC="$(dirname "$0")/../../botui/ui/suite"
+
+[ -d "$UI_SRC" ] || { echo "UI source directory not found: $UI_SRC" >&2; exit 1; }
+echo "Deploying UI files to $DEPLOY_DIR"
+mkdir -p "$DEPLOY_DIR/bin/ui/suite"
+# "/." also copies hidden files and, unlike a "*" glob, works when the directory is empty.
+cp -r "$UI_SRC/." "$DEPLOY_DIR/bin/ui/suite/"
+echo "UI files deployed successfully"
+echo "Location: $DEPLOY_DIR/bin/ui/suite"
+ls -la "$DEPLOY_DIR/bin/ui/suite" | head -20
diff --git a/deploy/kubernetes/deployment.yaml b/deploy/kubernetes/deployment.yaml
deleted file mode 100644
index ba41aa24f..000000000
--- a/deploy/kubernetes/deployment.yaml
+++ /dev/null
@@ -1,539 +0,0 @@
-# General Bots Kubernetes Deployment Configuration
-# This file contains the core deployment resources for running General Bots
-# in a Kubernetes cluster.
-#
-# Usage:
-#   kubectl apply -f deployment.yaml
-#
-# Prerequisites:
-#   - Kubernetes cluster 1.24+
-#   - kubectl configured
-#   - Secrets created (see secrets.yaml)
-#   - PersistentVolumeClaim for data (optional)
-
----
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: namespace
-
----
-# ConfigMap for non-sensitive configuration
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: botserver-config
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: config
-data:
-  # Server configuration
-  SERVER_HOST: "0.0.0.0"
-  SERVER_PORT: "8080"
-
-  # LLM configuration
-  LLM_SERVER_HOST: "0.0.0.0"
-  LLM_SERVER_PORT: "8081"
-  LLM_SERVER_CTX_SIZE: "4096"
-  LLM_SERVER_N_PREDICT: "1024"
-  LLM_SERVER_PARALLEL: "6"
-  LLM_SERVER_CONT_BATCHING: "true"
-  LLM_CACHE: "true"
-  LLM_CACHE_TTL: "3600"
-
-  # Embedding configuration
-  EMBEDDING_PORT: "8082"
-
-  # Multi-agent configuration
-  A2A_ENABLED: "true"
-  A2A_TIMEOUT: "30"
-  A2A_MAX_HOPS: "5"
-
-  # Memory configuration
-  USER_MEMORY_ENABLED: "true"
-  USER_MEMORY_MAX_KEYS: "1000"
-  EPISODIC_MEMORY_ENABLED: "true"
-
-  # Hybrid RAG configuration
-  RAG_HYBRID_ENABLED: "true"
-  RAG_DENSE_WEIGHT: "0.7"
-  RAG_SPARSE_WEIGHT: "0.3"
-
-  # Observability
-  OBSERVABILITY_ENABLED: "true"
-  OBSERVABILITY_METRICS_INTERVAL: "60"
-
-  # Sandbox configuration
-  SANDBOX_RUNTIME: "process"  # Use 'lxc' or 'docker' if available
-  SANDBOX_TIMEOUT: "30"
-  SANDBOX_MEMORY_MB: "512"
-
----
-# Main botserver Deployment
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: botserver
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: botserver
-    app.kubernetes.io/version: "6.1.1"
-spec:
-  replicas: 3
-  selector:
-    matchLabels:
-      app: botserver
-  strategy:
-    type: RollingUpdate
-    rollingUpdate:
-      maxSurge: 1
-      maxUnavailable: 0
-  template:
-    metadata:
-      labels:
-        app: botserver
-        app.kubernetes.io/name: generalbots
-        app.kubernetes.io/component: botserver
-      annotations:
-        prometheus.io/scrape: "true"
-        prometheus.io/port: "9090"
-        prometheus.io/path: "/metrics"
-    spec:
-      serviceAccountName: botserver
-      securityContext:
-        runAsNonRoot: true
-        runAsUser: 1000
-        fsGroup: 1000
-
-      # Init container to wait for dependencies
-      initContainers:
-        - name: wait-for-postgres
-          image: busybox:1.35
-          command: ['sh', '-c', 'until nc -z postgres-service 5432; do echo waiting for postgres; sleep 2; done']
-        - name: wait-for-qdrant
-          image: busybox:1.35
-          command: ['sh', '-c', 'until nc -z qdrant-service 6333; do echo waiting for qdrant; sleep 2; done']
-
-      containers:
-        - name: botserver
-          image: generalbots/botserver:latest
-          imagePullPolicy: Always
-          ports:
-            - name: http
-              containerPort: 8080
-              protocol: TCP
-            - name: metrics
-              containerPort: 9090
-              protocol: TCP
-
-          envFrom:
-            - configMapRef:
-                name: botserver-config
-
-          env:
-            - name: DATABASE_URL
-              valueFrom:
-                secretKeyRef:
-                  name: botserver-secrets
-                  key: database-url
-            - name: QDRANT_URL
-              valueFrom:
-                secretKeyRef:
-                  name: botserver-secrets
-                  key: qdrant-url
-            - name: LLM_KEY
-              valueFrom:
-                secretKeyRef:
-                  name: botserver-secrets
-                  key: llm-api-key
-                  optional: true
-            - name: POD_NAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NAMESPACE
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.namespace
-
-          resources:
-            requests:
-              memory: "512Mi"
-              cpu: "250m"
-            limits:
-              memory: "2Gi"
-              cpu: "2000m"
-
-          livenessProbe:
-            httpGet:
-              path: /health
-              port: http
-            initialDelaySeconds: 30
-            periodSeconds: 10
-            timeoutSeconds: 5
-            failureThreshold: 3
-
-          readinessProbe:
-            httpGet:
-              path: /ready
-              port: http
-            initialDelaySeconds: 10
-            periodSeconds: 5
-            timeoutSeconds: 3
-            failureThreshold: 3
-
-          startupProbe:
-            httpGet:
-              path: /health
-              port: http
-            initialDelaySeconds: 10
-            periodSeconds: 10
-            timeoutSeconds: 5
-            failureThreshold: 30
-
-          volumeMounts:
-            - name: data
-              mountPath: /data
-            - name: models
-              mountPath: /models
-              readOnly: true
-            - name: gbai-packages
-              mountPath: /packages
-
-      volumes:
-        - name: data
-          persistentVolumeClaim:
-            claimName: botserver-data
-        - name: models
-          persistentVolumeClaim:
-            claimName: llm-models
-        - name: gbai-packages
-          persistentVolumeClaim:
-            claimName: gbai-packages
-
-      affinity:
-        podAntiAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 100
-              podAffinityTerm:
-                labelSelector:
-                  matchExpressions:
-                    - key: app
-                      operator: In
-                      values:
-                        - botserver
-                topologyKey: kubernetes.io/hostname
-
-      topologySpreadConstraints:
-        - maxSkew: 1
-          topologyKey: topology.kubernetes.io/zone
-          whenUnsatisfiable: ScheduleAnyway
-          labelSelector:
-            matchLabels:
-              app: botserver
-
----
-# LLM Server Deployment (for local model inference)
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llm-server
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: llm-server
-spec:
-  replicas: 2
-  selector:
-    matchLabels:
-      app: llm-server
-  template:
-    metadata:
-      labels:
-        app: llm-server
-        app.kubernetes.io/name: generalbots
-        app.kubernetes.io/component: llm-server
-    spec:
-      containers:
-        - name: llm-server
-          image: generalbots/llm-server:latest
-          imagePullPolicy: Always
-          ports:
-            - name: http
-              containerPort: 8081
-              protocol: TCP
-
-          env:
-            - name: MODEL_PATH
-              value: "/models/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"
-            - name: CTX_SIZE
-              value: "4096"
-            - name: N_PREDICT
-              value: "1024"
-            - name: PARALLEL
-              value: "6"
-            - name: CONT_BATCHING
-              value: "true"
-            - name: GPU_LAYERS
-              value: "35"  # Adjust based on available GPU memory
-
-          resources:
-            requests:
-              memory: "8Gi"
-              cpu: "2000m"
-              # Uncomment for GPU support
-              # nvidia.com/gpu: 1
-            limits:
-              memory: "24Gi"
-              cpu: "8000m"
-              # nvidia.com/gpu: 1
-
-          volumeMounts:
-            - name: models
-              mountPath: /models
-              readOnly: true
-
-          livenessProbe:
-            httpGet:
-              path: /health
-              port: http
-            initialDelaySeconds: 120
-            periodSeconds: 30
-            timeoutSeconds: 10
-
-          readinessProbe:
-            httpGet:
-              path: /health
-              port: http
-            initialDelaySeconds: 60
-            periodSeconds: 10
-            timeoutSeconds: 5
-
-      volumes:
-        - name: models
-          persistentVolumeClaim:
-            claimName: llm-models
-
-      # Schedule on nodes with GPU
-      # nodeSelector:
-      #   nvidia.com/gpu.present: "true"
-
-      tolerations:
-        - key: "nvidia.com/gpu"
-          operator: "Exists"
-          effect: "NoSchedule"
-
----
-# Service for botserver
-apiVersion: v1
-kind: Service
-metadata:
-  name: botserver-service
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: service
-spec:
-  type: ClusterIP
-  selector:
-    app: botserver
-  ports:
-    - name: http
-      port: 80
-      targetPort: 8080
-      protocol: TCP
-    - name: metrics
-      port: 9090
-      targetPort: 9090
-      protocol: TCP
-
----
-# Service for LLM server
-apiVersion: v1
-kind: Service
-metadata:
-  name: llm-server-service
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: llm-service
-spec:
-  type: ClusterIP
-  selector:
-    app: llm-server
-  ports:
-    - name: http
-      port: 8081
-      targetPort: 8081
-      protocol: TCP
-
----
-# Headless service for StatefulSet-like DNS (if needed)
-apiVersion: v1
-kind: Service
-metadata:
-  name: botserver-headless
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: headless-service
-spec:
-  clusterIP: None
-  selector:
-    app: botserver
-  ports:
-    - name: http
-      port: 8080
-      targetPort: 8080
-
----
-# Ingress for external access
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: botserver-ingress
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: ingress
-  annotations:
-    kubernetes.io/ingress.class: nginx
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/proxy-body-size: "50m"
-    nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
-    nginx.ingress.kubernetes.io/proxy-send-timeout: "300"
-    nginx.ingress.kubernetes.io/websocket-services: "botserver-service"
-    cert-manager.io/cluster-issuer: "letsencrypt-prod"
-spec:
-  tls:
-    - hosts:
-        - bot.example.com
-      secretName: botserver-tls
-  rules:
-    - host: bot.example.com
-      http:
-        paths:
-          - path: /
-            pathType: Prefix
-            backend:
-              service:
-                name: botserver-service
-                port:
-                  number: 80
-
----
-# ServiceAccount
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: botserver
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: serviceaccount
-
----
-# Role for botserver
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
-metadata:
-  name: botserver-role
-  namespace: generalbots
-rules:
-  - apiGroups: [""]
-    resources: ["configmaps", "secrets"]
-    verbs: ["get", "list", "watch"]
-  - apiGroups: [""]
-    resources: ["pods"]
-    verbs: ["get", "list"]
-
----
-# RoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
-metadata:
-  name: botserver-rolebinding
-  namespace: generalbots
-subjects:
-  - kind: ServiceAccount
-    name: botserver
-    namespace: generalbots
-roleRef:
-  kind: Role
-  name: botserver-role
-  apiGroup: rbac.authorization.k8s.io
-
----
-# PodDisruptionBudget for high availability
-apiVersion: policy/v1
-kind: PodDisruptionBudget
-metadata:
-  name: botserver-pdb
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: pdb
-spec:
-  minAvailable: 2
-  selector:
-    matchLabels:
-      app: botserver
-
----
-# PersistentVolumeClaim for botserver data
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: botserver-data
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: storage
-spec:
-  accessModes:
-    - ReadWriteMany
-  storageClassName: standard
-  resources:
-    requests:
-      storage: 50Gi
-
----
-# PersistentVolumeClaim for LLM models
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: llm-models
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: storage
-spec:
-  accessModes:
-    - ReadOnlyMany
-  storageClassName: standard
-  resources:
-    requests:
-      storage: 100Gi
-
----
-# PersistentVolumeClaim for .gbai packages
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: gbai-packages
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: storage
-spec:
-  accessModes:
-    - ReadWriteMany
-  storageClassName: standard
-  resources:
-    requests:
-      storage: 20Gi
diff --git a/deploy/kubernetes/hpa.yaml b/deploy/kubernetes/hpa.yaml
deleted file mode 100644
index 44539a90c..000000000
--- a/deploy/kubernetes/hpa.yaml
+++ /dev/null
@@ -1,331 +0,0 @@
-# General Bots Kubernetes HorizontalPodAutoscaler Configuration
-# This file contains autoscaling configurations for General Bots components.
-#
-# Usage:
-#   kubectl apply -f hpa.yaml
-#
-# Prerequisites:
-#   - Metrics Server installed in cluster
-#   - deployment.yaml already applied
-
----
-# HPA for botserver - scales based on CPU and memory
-apiVersion: autoscaling/v2
-kind: HorizontalPodAutoscaler
-metadata:
-  name: botserver-hpa
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: hpa
-spec:
-  scaleTargetRef:
-    apiVersion: apps/v1
-    kind: Deployment
-    name: botserver
-  minReplicas: 3
-  maxReplicas: 20
-  metrics:
-    # Scale based on CPU utilization
-    - type: Resource
-      resource:
-        name: cpu
-        target:
-          type: Utilization
-          averageUtilization: 70
-
-    # Scale based on memory utilization
-    - type: Resource
-      resource:
-        name: memory
-        target:
-          type: Utilization
-          averageUtilization: 80
-
-    # Scale based on requests per second (requires custom metrics)
-    # Uncomment if using Prometheus Adapter
-    # - type: Pods
-    #   pods:
-    #     metric:
-    #       name: http_requests_per_second
-    #     target:
-    #       type: AverageValue
-    #       averageValue: 100
-
-  behavior:
-    scaleDown:
-      stabilizationWindowSeconds: 300  # 5 minutes cooldown before scaling down
-      policies:
-        - type: Percent
-          value: 10
-          periodSeconds: 60
-        - type: Pods
-          value: 2
-          periodSeconds: 60
-      selectPolicy: Min  # Use the most conservative policy
-
-    scaleUp:
-      stabilizationWindowSeconds: 60  # 1 minute before scaling up
-      policies:
-        - type: Percent
-          value: 100
-          periodSeconds: 30
-        - type: Pods
-          value: 4
-          periodSeconds: 30
-      selectPolicy: Max  # Scale up aggressively when needed
-
----
-# HPA for LLM server - scales based on CPU (inference is CPU/GPU bound)
-apiVersion: autoscaling/v2
-kind: HorizontalPodAutoscaler
-metadata:
-  name: llm-server-hpa
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: hpa
-spec:
-  scaleTargetRef:
-    apiVersion: apps/v1
-    kind: Deployment
-    name: llm-server
-  minReplicas: 2
-  maxReplicas: 10
-  metrics:
-    # Scale based on CPU utilization
-    - type: Resource
-      resource:
-        name: cpu
-        target:
-          type: Utilization
-          averageUtilization: 60  # Lower threshold for LLM - inference is expensive
-
-    # Scale based on memory utilization
-    - type: Resource
-      resource:
-        name: memory
-        target:
-          type: Utilization
-          averageUtilization: 75
-
-    # Scale based on inference queue length (requires custom metrics)
-    # Uncomment if using Prometheus Adapter
-    # - type: Pods
-    #   pods:
-    #     metric:
-    #       name: llm_inference_queue_length
-    #     target:
-    #       type: AverageValue
-    #       averageValue: 5
-
-  behavior:
-    scaleDown:
-      stabilizationWindowSeconds: 600  # 10 minutes - LLM pods are expensive to recreate
-      policies:
-        - type: Pods
-          value: 1
-          periodSeconds: 120
-      selectPolicy: Min
-
-    scaleUp:
-      stabilizationWindowSeconds: 120  # 2 minutes
-      policies:
-        - type: Pods
-          value: 2
-          periodSeconds: 60
-      selectPolicy: Max
-
----
-# HPA for embedding server (if deployed separately)
-apiVersion: autoscaling/v2
-kind: HorizontalPodAutoscaler
-metadata:
-  name: embedding-server-hpa
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: hpa
-spec:
-  scaleTargetRef:
-    apiVersion: apps/v1
-    kind: Deployment
-    name: embedding-server
-  minReplicas: 2
-  maxReplicas: 8
-  metrics:
-    - type: Resource
-      resource:
-        name: cpu
-        target:
-          type: Utilization
-          averageUtilization: 70
-
-    - type: Resource
-      resource:
-        name: memory
-        target:
-          type: Utilization
-          averageUtilization: 80
-
-  behavior:
-    scaleDown:
-      stabilizationWindowSeconds: 300
-      policies:
-        - type: Pods
-          value: 1
-          periodSeconds: 60
-      selectPolicy: Min
-
-    scaleUp:
-      stabilizationWindowSeconds: 60
-      policies:
-        - type: Pods
-          value: 2
-          periodSeconds: 30
-      selectPolicy: Max
-
----
-# Vertical Pod Autoscaler for botserver (optional - requires VPA installed)
-# Automatically adjusts resource requests/limits
-apiVersion: autoscaling.k8s.io/v1
-kind: VerticalPodAutoscaler
-metadata:
-  name: botserver-vpa
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: vpa
-spec:
-  targetRef:
-    apiVersion: apps/v1
-    kind: Deployment
-    name: botserver
-  updatePolicy:
-    updateMode: "Auto"  # Options: Off, Initial, Recreate, Auto
-  resourcePolicy:
-    containerPolicies:
-      - containerName: botserver
-        minAllowed:
-          cpu: 250m
-          memory: 512Mi
-        maxAllowed:
-          cpu: 4000m
-          memory: 8Gi
-        controlledResources: ["cpu", "memory"]
-        controlledValues: RequestsAndLimits
-
----
-# Vertical Pod Autoscaler for LLM server
-apiVersion: autoscaling.k8s.io/v1
-kind: VerticalPodAutoscaler
-metadata:
-  name: llm-server-vpa
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: vpa
-spec:
-  targetRef:
-    apiVersion: apps/v1
-    kind: Deployment
-    name: llm-server
-  updatePolicy:
-    updateMode: "Off"  # Manual for LLM - too disruptive to auto-update
-  resourcePolicy:
-    containerPolicies:
-      - containerName: llm-server
-        minAllowed:
-          cpu: 2000m
-          memory: 8Gi
-        maxAllowed:
-          cpu: 16000m
-          memory: 64Gi
-        controlledResources: ["cpu", "memory"]
-        controlledValues: RequestsOnly  # Only adjust requests, not limits
-
----
-# Custom metrics for HPA (requires Prometheus + Prometheus Adapter)
-# This ServiceMonitor tells Prometheus to scrape botserver metrics
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: botserver-metrics
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: monitoring
-spec:
-  selector:
-    matchLabels:
-      app: botserver
-  endpoints:
-    - port: metrics
-      interval: 30s
-      path: /metrics
-  namespaceSelector:
-    matchNames:
-      - generalbots
-
----
-# PrometheusRule for alerting on scaling events
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: botserver-scaling-alerts
-  namespace: generalbots
-  labels:
-    app.kubernetes.io/name: generalbots
-    app.kubernetes.io/component: alerts
-spec:
-  groups:
-    - name: botserver-scaling
-      rules:
-        # Alert when approaching max replicas
-        - alert: BotserverNearMaxReplicas
-          expr: |
-            kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"}
-            / kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="botserver-hpa"}
-            > 0.8
-          for: 5m
-          labels:
-            severity: warning
-          annotations:
-            summary: "Botserver near maximum replicas"
-            description: "Botserver HPA is at {{ $value | humanizePercentage }} of max replicas"
-
-        # Alert when at max replicas
-        - alert: BotserverAtMaxReplicas
-          expr: |
-            kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"}
-            == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="botserver-hpa"}
-          for: 10m
-          labels:
-            severity: critical
-          annotations:
-            summary: "Botserver at maximum replicas"
-            description: "Botserver HPA has been at max replicas for 10 minutes - consider increasing max"
-
-        # Alert on rapid scaling
-        - alert: BotserverRapidScaling
-          expr: |
-            increase(kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"}[10m])
-            > 5
-          for: 1m
-          labels:
-            severity: warning
-          annotations:
-            summary: "Botserver scaling rapidly"
-            description: "Botserver has scaled by {{ $value }} replicas in 10 minutes"
-
-        # Alert on LLM server max replicas
-        - alert: LLMServerAtMaxReplicas
-          expr: |
-            kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="llm-server-hpa"}
-            == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="llm-server-hpa"}
-          for: 5m
-          labels:
-            severity: critical
-          annotations:
-            summary: "LLM Server at maximum replicas"
-            description: "LLM Server HPA is at max - inference capacity may be constrained"