This commit introduces comprehensive documentation and implementation for multi-agent orchestration capabilities:

- Add IMPLEMENTATION-PLAN.md with 4-phase roadmap
- Add Kubernetes deployment manifests (deployment.yaml, hpa.yaml)
- Add database migrations for multi-agent tables (6.1.1, 6.1.2)
- Implement A2A protocol for agent-to-agent communication
- Implement user memory keywords for cross-session persistence
- Implement model routing for dynamic L
331 lines
8.8 KiB
YAML
# General Bots Kubernetes HorizontalPodAutoscaler Configuration
#
# This file contains autoscaling configurations for General Bots components.
#
# Usage:
#   kubectl apply -f hpa.yaml
#
# Prerequisites:
#   - Metrics Server installed in cluster
#   - deployment.yaml already applied
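#
# A quick way to confirm the prerequisites and watch the autoscalers once
# applied (this assumes Metrics Server runs in kube-system, its default
# namespace):
#
#   kubectl get deployment metrics-server -n kube-system
#   kubectl get hpa -n generalbots
#   kubectl top pods -n generalbots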

---
# HPA for botserver - scales based on CPU and memory
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: botserver-hpa
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: botserver
  minReplicas: 3
  maxReplicas: 20
  metrics:
    # Scale based on CPU utilization
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

    # Scale based on memory utilization
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

    # Scale based on requests per second (requires custom metrics)
    # Uncomment if using Prometheus Adapter
    # - type: Pods
    #   pods:
    #     metric:
    #       name: http_requests_per_second
    #     target:
    #       type: AverageValue
    #       averageValue: 100
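
  # For the commented Pods metric above to resolve, the Prometheus Adapter
  # needs a rule that derives http_requests_per_second from a counter. A
  # minimal sketch of that adapter rule, assuming botserver exports a counter
  # named http_requests_total (this goes in the adapter's own config, not in
  # this file):
  #
  #   rules:
  #     - seriesQuery: 'http_requests_total{namespace!="",pod!=""}'
  #       resources:
  #         overrides:
  #           namespace: {resource: "namespace"}
  #           pod: {resource: "pod"}
  #       name:
  #         matches: "^(.*)_total$"
  #         as: "${1}_per_second"
  #       metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>}[2m])) by (<<.GroupBy>>)'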

  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300  # 5 minutes cooldown before scaling down
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
        - type: Pods
          value: 2
          periodSeconds: 60
      selectPolicy: Min  # Use the most conservative policy

    scaleUp:
      stabilizationWindowSeconds: 60  # 1 minute before scaling up
      policies:
        - type: Percent
          value: 100
          periodSeconds: 30
        - type: Pods
          value: 4
          periodSeconds: 30
      selectPolicy: Max  # Scale up aggressively when needed
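
# Worked example of the behavior above: at 10 replicas, scaleDown removes at
# most min(10% of 10 = 1 pod, 2 pods) = 1 pod per 60s; at 20 replicas it is
# min(2, 2) = 2. On scaleUp from 4 replicas, max(100% = 4 pods, 4 pods) = 4
# pods per 30s, i.e. the deployment can double to 8 in a single step.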

---
# HPA for LLM server - scales based on CPU (inference is CPU/GPU bound)
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-server-hpa
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-server
  minReplicas: 2
  maxReplicas: 10
  metrics:
    # Scale based on CPU utilization
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 60  # Lower threshold for LLM - inference is expensive

    # Scale based on memory utilization
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 75

    # Scale based on inference queue length (requires custom metrics)
    # Uncomment if using Prometheus Adapter
    # - type: Pods
    #   pods:
    #     metric:
    #       name: llm_inference_queue_length
    #     target:
    #       type: AverageValue
    #       averageValue: 5
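
  # How the queue target would drive scaling: the HPA computes
  # desiredReplicas = ceil(currentReplicas * currentValue / targetValue),
  # so 2 replicas averaging 10 queued requests each against the target of 5
  # would be scaled to ceil(2 * 10 / 5) = 4 replicas.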

  behavior:
    scaleDown:
      stabilizationWindowSeconds: 600  # 10 minutes - LLM pods are expensive to recreate
      policies:
        - type: Pods
          value: 1
          periodSeconds: 120
      selectPolicy: Min

    scaleUp:
      stabilizationWindowSeconds: 120  # 2 minutes
      policies:
        - type: Pods
          value: 2
          periodSeconds: 60
      selectPolicy: Max

---
# HPA for embedding server (if deployed separately)
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: embedding-server-hpa
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: embedding-server
  minReplicas: 2
  maxReplicas: 8
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Pods
          value: 1
          periodSeconds: 60
      selectPolicy: Min

    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Pods
          value: 2
          periodSeconds: 30
      selectPolicy: Max

---
# Vertical Pod Autoscaler for botserver (optional - requires VPA installed)
# Automatically adjusts resource requests/limits
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: botserver-vpa
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: vpa
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: botserver
  updatePolicy:
    updateMode: "Auto"  # Options: Off, Initial, Recreate, Auto
  resourcePolicy:
    containerPolicies:
      - containerName: botserver
        minAllowed:
          cpu: 250m
          memory: 512Mi
        maxAllowed:
          cpu: 4000m
          memory: 8Gi
        controlledResources: ["cpu", "memory"]
        controlledValues: RequestsAndLimits
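
# In "Auto" mode the VPA updater applies new requests by evicting pods, so a
# PodDisruptionBudget on botserver keeps that churn bounded. The computed
# recommendations can be reviewed before (or instead of) trusting Auto mode:
#
#   kubectl describe vpa botserver-vpa -n generalbots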

---
# Vertical Pod Autoscaler for LLM server
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: llm-server-vpa
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: vpa
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-server
  updatePolicy:
    updateMode: "Off"  # Manual for LLM - too disruptive to auto-update
  resourcePolicy:
    containerPolicies:
      - containerName: llm-server
        minAllowed:
          cpu: 2000m
          memory: 8Gi
        maxAllowed:
          cpu: 16000m
          memory: 64Gi
        controlledResources: ["cpu", "memory"]
        controlledValues: RequestsOnly  # Only adjust requests, not limits

---
# Custom metrics for HPA (requires Prometheus + Prometheus Adapter)
# This ServiceMonitor tells Prometheus to scrape botserver metrics
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: botserver-metrics
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: monitoring
spec:
  selector:
    matchLabels:
      app: botserver
  endpoints:
    - port: metrics
      interval: 30s
      path: /metrics
  namespaceSelector:
    matchNames:
      - generalbots
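
# A ServiceMonitor selects a Service, not pods, so the botserver Service must
# carry the app: botserver label and expose a port literally named "metrics".
# A minimal sketch of the matching Service port entry (the port number here is
# an assumption - use whatever deployment.yaml actually exposes):
#
#   ports:
#     - name: metrics
#       port: 9090
#       targetPort: 9090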

---
# PrometheusRule for alerting on scaling events
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: botserver-scaling-alerts
  namespace: generalbots
  labels:
    app.kubernetes.io/name: generalbots
    app.kubernetes.io/component: alerts
spec:
  groups:
    - name: botserver-scaling
      rules:
        # Alert when approaching max replicas
        - alert: BotserverNearMaxReplicas
          expr: |
            kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"}
            / kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="botserver-hpa"}
            > 0.8
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Botserver near maximum replicas"
            description: "Botserver HPA is at {{ $value | humanizePercentage }} of max replicas"
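
        # With maxReplicas: 20, the > 0.8 ratio fires once the HPA holds 17
        # or more replicas for 5 minutes (16/20 is exactly 0.8 and does not
        # trip the strict comparison).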

        # Alert when at max replicas
        - alert: BotserverAtMaxReplicas
          expr: |
            kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"}
            == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="botserver-hpa"}
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Botserver at maximum replicas"
            description: "Botserver HPA has been at max replicas for 10 minutes - consider increasing maxReplicas"

        # Alert on rapid scaling
        - alert: BotserverRapidScaling
          expr: |
            increase(kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="botserver-hpa"}[10m])
            > 5
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: "Botserver scaling rapidly"
            description: "Botserver has scaled by {{ $value }} replicas in 10 minutes"

        # Alert on LLM server max replicas
        - alert: LLMServerAtMaxReplicas
          expr: |
            kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="llm-server-hpa"}
            == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="llm-server-hpa"}
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "LLM Server at maximum replicas"
            description: "LLM Server HPA is at max - inference capacity may be constrained"