diff --git a/Cargo.toml b/Cargo.toml index e5079057a..dc368fed0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ features = ["database", "i18n"] [features] # ===== DEFAULT ===== -default = ["chat", "automation", "drive", "tasks", "cache", "directory", "llm"] +default = ["chat", "automation", "drive", "tasks", "cache", "directory", "llm", "crawler"] # ===== CORE INFRASTRUCTURE (Can be used standalone) ===== scripting = ["dep:rhai"] @@ -18,6 +18,7 @@ automation = ["scripting", "dep:cron"] drive = ["dep:aws-config", "dep:aws-sdk-s3", "dep:aws-smithy-async", "dep:pdf-extract"] cache = ["dep:redis"] directory = [] +crawler = ["drive", "cache"] # ===== APPS (Each includes what it needs from core) ===== # Communication @@ -48,7 +49,7 @@ player = ["automation", "drive", "cache"] canvas = ["automation", "drive", "cache"] # Learning -learn = ["automation", "drive", "cache"] +learn = ["automation", "drive", "cache", "crawler"] research = ["automation", "drive", "cache", "llm", "vectordb"] sources = ["automation", "drive", "cache"] diff --git a/README.md b/README.md index f02ba8554..921972cd3 100644 --- a/README.md +++ b/README.md @@ -324,7 +324,7 @@ When a file grows beyond this limit: ## 🗄️ Database Standards -- **TABLES AND INDEXES ONLY** (no views, triggers, functions) +- **TABLES AND INDEXES ONLY** (no stored procedures, nothing, no views, no triggers, no functions) - **JSON columns:** use TEXT with `_json` suffix - **ORM:** Use diesel - no sqlx - **Migrations:** Located in `botserver/migrations/` @@ -492,4 +492,4 @@ According to our dual licensing model, this program can be used either under the **General Bots Code Name:** [Guaribas](https://en.wikipedia.org/wiki/Guaribas) -> "No one should have to do work that can be done by a machine." - Roberto Mangabeira Unger \ No newline at end of file +> "No one should have to do work that can be done by a machine." 
- Roberto Mangabeira Unger diff --git a/build.rs b/build.rs new file mode 100644 index 000000000..609c7150d --- /dev/null +++ b/build.rs @@ -0,0 +1,3 @@ +fn main() { + println!("cargo:rerun-if-changed=../botui/ui/suite/"); +} \ No newline at end of file diff --git a/config/directory_config.json b/config/directory_config.json index abe26a194..3cdcd2525 100644 --- a/config/directory_config.json +++ b/config/directory_config.json @@ -1,7 +1,7 @@ { "base_url": "http://localhost:8300", "default_org": { - "id": "354799954578898958", + "id": "357870945618100238", "name": "default", "domain": "default.localhost" }, @@ -13,8 +13,8 @@ "first_name": "Admin", "last_name": "User" }, - "admin_token": "6nMpG1E-H-hqlSjrbFB5n2yx8BoEpnl3a3-F3HJoc5bvria3nwiz8vURPndzS4lQWKwaz_8", + "admin_token": "RflPqOgYM-BtinaBTyCaY8hX-_koTwC65gCg1Kpf7Sfhlc0ZOLZvIr-XsOYXmckPLBAWzjU", "project_id": "", - "client_id": "354799955384270862", - "client_secret": "z6WFb1qshdCQ1y4Gw5EpOHzARgHicz6XkrazZwJdDcxMJrc6iRdHlhf5rf5LTzgi" + "client_id": "357870946289254414", + "client_secret": "q20LOjW5Vdjzp57Cw8EuFt7sILEd8VeSeGPvrhB63880GLgaJZpcWeRgUwdGET2x" } \ No newline at end of file diff --git a/examples/test_cron.rs b/examples/test_cron.rs new file mode 100644 index 000000000..0bb057284 --- /dev/null +++ b/examples/test_cron.rs @@ -0,0 +1,18 @@ +use cron::Schedule; +use std::str::FromStr; + +fn main() { + let schedules = vec![ + "59 * * * *", + "0 * * * *", + "0 11 * * *", + ]; + + for schedule_str in schedules { + println!("\nTesting: {}", schedule_str); + match Schedule::from_str(schedule_str) { + Ok(_) => println!(" ✓ OK"), + Err(e) => println!(" ✗ Error: {}", e), + } + } +} diff --git a/migrations/core/6.0.0/down.sql b/migrations/6.0.0-01-core/down.sql similarity index 100% rename from migrations/core/6.0.0/down.sql rename to migrations/6.0.0-01-core/down.sql diff --git a/migrations/core/6.0.0/up.sql b/migrations/6.0.0-01-core/up.sql similarity index 99% rename from migrations/core/6.0.0/up.sql rename to 
migrations/6.0.0-01-core/up.sql index 3a7189943..1b48e5e41 100644 --- a/migrations/core/6.0.0/up.sql +++ b/migrations/6.0.0-01-core/up.sql @@ -2748,7 +2748,7 @@ CREATE INDEX IF NOT EXISTS idx_designer_pending_changes_expires_at ON designer_p -- Add role-based access control columns to dynamic table definitions and fields -- -- Syntax in .gbdialog TABLE definitions: --- TABLE Contatos ON maria READ BY "admin;manager" +-- TABLE Contatos READ BY "admin;manager" -- Id number key -- Nome string(150) -- NumeroDocumento string(25) READ BY "admin" diff --git a/migrations/core/6.0.1/down.sql b/migrations/6.0.1-01-core/down.sql similarity index 100% rename from migrations/core/6.0.1/down.sql rename to migrations/6.0.1-01-core/down.sql diff --git a/migrations/core/6.0.1/up.sql b/migrations/6.0.1-01-core/up.sql similarity index 100% rename from migrations/core/6.0.1/up.sql rename to migrations/6.0.1-01-core/up.sql diff --git a/migrations/products/6.0.10/down.sql b/migrations/6.0.10-01-products/down.sql similarity index 100% rename from migrations/products/6.0.10/down.sql rename to migrations/6.0.10-01-products/down.sql diff --git a/migrations/products/6.0.10/up.sql b/migrations/6.0.10-01-products/up.sql similarity index 100% rename from migrations/products/6.0.10/up.sql rename to migrations/6.0.10-01-products/up.sql diff --git a/migrations/people/6.0.11/down.sql b/migrations/6.0.11-01-people/down.sql similarity index 100% rename from migrations/people/6.0.11/down.sql rename to migrations/6.0.11-01-people/down.sql diff --git a/migrations/people/6.0.11/up.sql b/migrations/6.0.11-01-people/up.sql similarity index 100% rename from migrations/people/6.0.11/up.sql rename to migrations/6.0.11-01-people/up.sql diff --git a/migrations/attendant/6.0.12/down.sql b/migrations/6.0.12-01-attendant/down.sql similarity index 100% rename from migrations/attendant/6.0.12/down.sql rename to migrations/6.0.12-01-attendant/down.sql diff --git a/migrations/attendant/6.0.12/up.sql 
b/migrations/6.0.12-01-attendant/up.sql similarity index 100% rename from migrations/attendant/6.0.12/up.sql rename to migrations/6.0.12-01-attendant/up.sql diff --git a/migrations/calendar/6.0.13/down.sql b/migrations/6.0.13-01-calendar/down.sql similarity index 100% rename from migrations/calendar/6.0.13/down.sql rename to migrations/6.0.13-01-calendar/down.sql diff --git a/migrations/calendar/6.0.13/up.sql b/migrations/6.0.13-01-calendar/up.sql similarity index 100% rename from migrations/calendar/6.0.13/up.sql rename to migrations/6.0.13-01-calendar/up.sql diff --git a/migrations/goals/6.0.14/down.sql b/migrations/6.0.14-01-goals/down.sql similarity index 100% rename from migrations/goals/6.0.14/down.sql rename to migrations/6.0.14-01-goals/down.sql diff --git a/migrations/goals/6.0.14/up.sql b/migrations/6.0.14-01-goals/up.sql similarity index 100% rename from migrations/goals/6.0.14/up.sql rename to migrations/6.0.14-01-goals/up.sql diff --git a/migrations/canvas/6.0.15/down.sql b/migrations/6.0.15-01-canvas/down.sql similarity index 100% rename from migrations/canvas/6.0.15/down.sql rename to migrations/6.0.15-01-canvas/down.sql diff --git a/migrations/canvas/6.0.15/up.sql b/migrations/6.0.15-01-canvas/up.sql similarity index 100% rename from migrations/canvas/6.0.15/up.sql rename to migrations/6.0.15-01-canvas/up.sql diff --git a/migrations/workspaces/6.0.16/down.sql b/migrations/6.0.16-01-workspaces/down.sql similarity index 100% rename from migrations/workspaces/6.0.16/down.sql rename to migrations/6.0.16-01-workspaces/down.sql diff --git a/migrations/workspaces/6.0.16/up.sql b/migrations/6.0.16-01-workspaces/up.sql similarity index 100% rename from migrations/workspaces/6.0.16/up.sql rename to migrations/6.0.16-01-workspaces/up.sql diff --git a/migrations/social/6.0.17/down.sql b/migrations/6.0.17-01-social/down.sql similarity index 100% rename from migrations/social/6.0.17/down.sql rename to migrations/6.0.17-01-social/down.sql diff --git 
a/migrations/social/6.0.17/up.sql b/migrations/6.0.17-01-social/up.sql similarity index 100% rename from migrations/social/6.0.17/up.sql rename to migrations/6.0.17-01-social/up.sql diff --git a/migrations/research/6.0.18/down.sql b/migrations/6.0.18-01-research/down.sql similarity index 100% rename from migrations/research/6.0.18/down.sql rename to migrations/6.0.18-01-research/down.sql diff --git a/migrations/research/6.0.18/up.sql b/migrations/6.0.18-01-research/up.sql similarity index 100% rename from migrations/research/6.0.18/up.sql rename to migrations/6.0.18-01-research/up.sql diff --git a/migrations/dashboards/6.0.19/down.sql b/migrations/6.0.19-01-dashboards/down.sql similarity index 100% rename from migrations/dashboards/6.0.19/down.sql rename to migrations/6.0.19-01-dashboards/down.sql diff --git a/migrations/dashboards/6.0.19/up.sql b/migrations/6.0.19-01-dashboards/up.sql similarity index 100% rename from migrations/dashboards/6.0.19/up.sql rename to migrations/6.0.19-01-dashboards/up.sql diff --git a/migrations/core/6.0.2/down.sql b/migrations/6.0.2-01-core/down.sql similarity index 100% rename from migrations/core/6.0.2/down.sql rename to migrations/6.0.2-01-core/down.sql diff --git a/migrations/core/6.0.2/up.sql b/migrations/6.0.2-01-core/up.sql similarity index 100% rename from migrations/core/6.0.2/up.sql rename to migrations/6.0.2-01-core/up.sql diff --git a/migrations/compliance/6.0.20/down.sql b/migrations/6.0.20-01-compliance/down.sql similarity index 100% rename from migrations/compliance/6.0.20/down.sql rename to migrations/6.0.20-01-compliance/down.sql diff --git a/migrations/compliance/6.0.20/up.sql b/migrations/6.0.20-01-compliance/up.sql similarity index 100% rename from migrations/compliance/6.0.20/up.sql rename to migrations/6.0.20-01-compliance/up.sql diff --git a/migrations/compliance/6.0.21/down.sql b/migrations/6.0.21-01-compliance/down.sql similarity index 100% rename from migrations/compliance/6.0.21/down.sql rename to 
migrations/6.0.21-01-compliance/down.sql diff --git a/migrations/compliance/6.0.21/up.sql b/migrations/6.0.21-01-compliance/up.sql similarity index 100% rename from migrations/compliance/6.0.21/up.sql rename to migrations/6.0.21-01-compliance/up.sql diff --git a/migrations/billing/6.0.22/down.sql b/migrations/6.0.22-01-billing/down.sql similarity index 100% rename from migrations/billing/6.0.22/down.sql rename to migrations/6.0.22-01-billing/down.sql diff --git a/migrations/billing/6.0.22/up.sql b/migrations/6.0.22-01-billing/up.sql similarity index 94% rename from migrations/billing/6.0.22/up.sql rename to migrations/6.0.22-01-billing/up.sql index b36fcdc14..ec44d56c5 100644 --- a/migrations/billing/6.0.22/up.sql +++ b/migrations/6.0.22-01-billing/up.sql @@ -22,7 +22,7 @@ CREATE INDEX idx_billing_usage_alerts_org_id ON billing_usage_alerts(org_id); CREATE INDEX idx_billing_usage_alerts_bot_id ON billing_usage_alerts(bot_id); CREATE INDEX idx_billing_usage_alerts_severity ON billing_usage_alerts(severity); CREATE INDEX idx_billing_usage_alerts_created_at ON billing_usage_alerts(created_at); -CREATE INDEX idx_billing_usage_alerts_acknowledged ON billing_usage_alerts(acknowledged_at) WHERE acknowledged_at IS NULL; +CREATE INDEX idx_billing_usage_alerts_acknowledged ON billing_usage_alerts(acknowledged_at); -- Billing Alert History table CREATE TABLE IF NOT EXISTS billing_alert_history ( @@ -87,9 +87,9 @@ CREATE TABLE IF NOT EXISTS billing_grace_periods ( end_reason VARCHAR(50), created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - UNIQUE(org_id, metric, is_active) WHERE is_active = TRUE + UNIQUE(org_id, metric, is_active) ); CREATE INDEX idx_billing_grace_periods_org_id ON billing_grace_periods(org_id); -CREATE INDEX idx_billing_grace_periods_active ON billing_grace_periods(is_active) WHERE is_active = TRUE; -CREATE INDEX idx_billing_grace_periods_expires ON billing_grace_periods(expires_at) WHERE is_active = TRUE; +CREATE 
INDEX idx_billing_grace_periods_active ON billing_grace_periods(is_active); +CREATE INDEX idx_billing_grace_periods_expires ON billing_grace_periods(expires_at); diff --git a/migrations/meet/6.0.23/down.sql b/migrations/6.0.23-01-meet/down.sql similarity index 100% rename from migrations/meet/6.0.23/down.sql rename to migrations/6.0.23-01-meet/down.sql diff --git a/migrations/meet/6.0.23/up.sql b/migrations/6.0.23-01-meet/up.sql similarity index 100% rename from migrations/meet/6.0.23/up.sql rename to migrations/6.0.23-01-meet/up.sql diff --git a/migrations/core/6.0.24/down.sql b/migrations/6.0.24-01-core/down.sql similarity index 100% rename from migrations/core/6.0.24/down.sql rename to migrations/6.0.24-01-core/down.sql diff --git a/migrations/core/6.0.24/up.sql b/migrations/6.0.24-01-core/up.sql similarity index 96% rename from migrations/core/6.0.24/up.sql rename to migrations/6.0.24-01-core/up.sql index 4a85d0465..1b7a2d03c 100644 --- a/migrations/core/6.0.24/up.sql +++ b/migrations/6.0.24-01-core/up.sql @@ -3,7 +3,7 @@ CREATE TABLE IF NOT EXISTS organization_invitations ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - org_id UUID NOT NULL REFERENCES organizations(id) ON DELETE CASCADE, + org_id UUID NOT NULL REFERENCES organizations(org_id) ON DELETE CASCADE, email VARCHAR(255) NOT NULL, role VARCHAR(50) NOT NULL DEFAULT 'member', status VARCHAR(20) NOT NULL DEFAULT 'pending', diff --git a/migrations/core/6.0.3/down.sql b/migrations/6.0.3-01-core/down.sql similarity index 100% rename from migrations/core/6.0.3/down.sql rename to migrations/6.0.3-01-core/down.sql diff --git a/migrations/core/6.0.3/up.sql b/migrations/6.0.3-01-core/up.sql similarity index 100% rename from migrations/core/6.0.3/up.sql rename to migrations/6.0.3-01-core/up.sql diff --git a/migrations/core/6.0.4/down.sql b/migrations/6.0.4-01-core/down.sql similarity index 100% rename from migrations/core/6.0.4/down.sql rename to migrations/6.0.4-01-core/down.sql diff --git 
a/migrations/core/6.0.4/up.sql b/migrations/6.0.4-01-core/up.sql similarity index 100% rename from migrations/core/6.0.4/up.sql rename to migrations/6.0.4-01-core/up.sql diff --git a/migrations/learn/6.0.5/down.sql b/migrations/6.0.5-01-learn/down.sql similarity index 100% rename from migrations/learn/6.0.5/down.sql rename to migrations/6.0.5-01-learn/down.sql diff --git a/migrations/learn/6.0.5/up.sql b/migrations/6.0.5-01-learn/up.sql similarity index 100% rename from migrations/learn/6.0.5/up.sql rename to migrations/6.0.5-01-learn/up.sql diff --git a/migrations/video/6.0.6/down.sql b/migrations/6.0.6-01-video/down.sql similarity index 100% rename from migrations/video/6.0.6/down.sql rename to migrations/6.0.6-01-video/down.sql diff --git a/migrations/video/6.0.6/up.sql b/migrations/6.0.6-01-video/up.sql similarity index 100% rename from migrations/video/6.0.6/up.sql rename to migrations/6.0.6-01-video/up.sql diff --git a/migrations/people/6.0.7/down.sql b/migrations/6.0.7-01-people/down.sql similarity index 100% rename from migrations/people/6.0.7/down.sql rename to migrations/6.0.7-01-people/down.sql diff --git a/migrations/people/6.0.7/up.sql b/migrations/6.0.7-01-people/up.sql similarity index 96% rename from migrations/people/6.0.7/up.sql rename to migrations/6.0.7-01-people/up.sql index fb5c9714b..40c30a158 100644 --- a/migrations/people/6.0.7/up.sql +++ b/migrations/6.0.7-01-people/up.sql @@ -189,42 +189,42 @@ CREATE INDEX idx_crm_notes_opportunity ON crm_notes(opportunity_id); CREATE INDEX idx_crm_notes_account ON crm_notes(account_id); INSERT INTO crm_pipeline_stages (org_id, bot_id, name, stage_order, probability, is_won, is_lost, color) -SELECT org_id, b.id, 'New', 1, 10, FALSE, FALSE, '#94a3b8' +SELECT o.org_id, b.id, 'New', 1, 10, FALSE, FALSE, '#94a3b8' FROM organizations o CROSS JOIN bots b LIMIT 1 ON CONFLICT DO NOTHING; INSERT INTO crm_pipeline_stages (org_id, bot_id, name, stage_order, probability, is_won, is_lost, color) -SELECT org_id, b.id, 
'Qualified', 2, 25, FALSE, FALSE, '#3b82f6' +SELECT o.org_id, b.id, 'Qualified', 2, 25, FALSE, FALSE, '#3b82f6' FROM organizations o CROSS JOIN bots b LIMIT 1 ON CONFLICT DO NOTHING; INSERT INTO crm_pipeline_stages (org_id, bot_id, name, stage_order, probability, is_won, is_lost, color) -SELECT org_id, b.id, 'Proposal', 3, 50, FALSE, FALSE, '#8b5cf6' +SELECT o.org_id, b.id, 'Proposal', 3, 50, FALSE, FALSE, '#8b5cf6' FROM organizations o CROSS JOIN bots b LIMIT 1 ON CONFLICT DO NOTHING; INSERT INTO crm_pipeline_stages (org_id, bot_id, name, stage_order, probability, is_won, is_lost, color) -SELECT org_id, b.id, 'Negotiation', 4, 75, FALSE, FALSE, '#f59e0b' +SELECT o.org_id, b.id, 'Negotiation', 4, 75, FALSE, FALSE, '#f59e0b' FROM organizations o CROSS JOIN bots b LIMIT 1 ON CONFLICT DO NOTHING; INSERT INTO crm_pipeline_stages (org_id, bot_id, name, stage_order, probability, is_won, is_lost, color) -SELECT org_id, b.id, 'Won', 5, 100, TRUE, FALSE, '#22c55e' +SELECT o.org_id, b.id, 'Won', 5, 100, TRUE, FALSE, '#22c55e' FROM organizations o CROSS JOIN bots b LIMIT 1 ON CONFLICT DO NOTHING; INSERT INTO crm_pipeline_stages (org_id, bot_id, name, stage_order, probability, is_won, is_lost, color) -SELECT org_id, b.id, 'Lost', 6, 0, FALSE, TRUE, '#ef4444' +SELECT o.org_id, b.id, 'Lost', 6, 0, FALSE, TRUE, '#ef4444' FROM organizations o CROSS JOIN bots b LIMIT 1 diff --git a/migrations/tickets/6.0.8/down.sql b/migrations/6.0.8-01-tickets/down.sql similarity index 100% rename from migrations/tickets/6.0.8/down.sql rename to migrations/6.0.8-01-tickets/down.sql diff --git a/migrations/tickets/6.0.8/up.sql b/migrations/6.0.8-01-tickets/up.sql similarity index 100% rename from migrations/tickets/6.0.8/up.sql rename to migrations/6.0.8-01-tickets/up.sql diff --git a/migrations/billing/6.0.9/down.sql b/migrations/6.0.9-01-billing/down.sql similarity index 100% rename from migrations/billing/6.0.9/down.sql rename to migrations/6.0.9-01-billing/down.sql diff --git 
a/migrations/billing/6.0.9/up.sql b/migrations/6.0.9-01-billing/up.sql similarity index 100% rename from migrations/billing/6.0.9/up.sql rename to migrations/6.0.9-01-billing/up.sql diff --git a/migrations/automation/6.1.0/down.sql b/migrations/6.1.0-01-automation/down.sql similarity index 100% rename from migrations/automation/6.1.0/down.sql rename to migrations/6.1.0-01-automation/down.sql diff --git a/migrations/automation/6.1.0/up.sql b/migrations/6.1.0-01-automation/up.sql similarity index 100% rename from migrations/automation/6.1.0/up.sql rename to migrations/6.1.0-01-automation/up.sql diff --git a/migrations/calendar/6.1.1/down.sql b/migrations/6.1.1-01-calendar/down.sql similarity index 100% rename from migrations/calendar/6.1.1/down.sql rename to migrations/6.1.1-01-calendar/down.sql diff --git a/migrations/calendar/6.1.1/up.sql b/migrations/6.1.1-01-calendar/up.sql similarity index 69% rename from migrations/calendar/6.1.1/up.sql rename to migrations/6.1.1-01-calendar/up.sql index 50b25bc75..1508a3053 100644 --- a/migrations/calendar/6.1.1/up.sql +++ b/migrations/6.1.1-01-calendar/up.sql @@ -37,16 +37,16 @@ CREATE TABLE IF NOT EXISTS calendar_resource_bookings ( CREATE INDEX IF NOT EXISTS idx_resource_bookings_resource ON calendar_resource_bookings(resource_id, start_time, end_time); CREATE INDEX IF NOT EXISTS idx_resource_bookings_user ON calendar_resource_bookings(booked_by); --- Calendar sharing -CREATE TABLE IF NOT EXISTS calendar_shares ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - owner_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE, - shared_with_user UUID REFERENCES users(id) ON DELETE CASCADE, - shared_with_email VARCHAR(255), - permission_level VARCHAR(20) DEFAULT 'view', - created_at TIMESTAMPTZ DEFAULT NOW(), - CONSTRAINT check_cal_permission CHECK (permission_level IN ('free_busy', 'view', 'edit', 'admin')) -); +-- Calendar sharing (skip - already exists from 6.0.13-01-calendar) +-- CREATE TABLE IF NOT EXISTS calendar_shares 
( +-- id UUID PRIMARY KEY DEFAULT gen_random_uuid(), +-- owner_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE, +-- shared_with_user UUID REFERENCES users(id) ON DELETE CASCADE, +-- shared_with_email VARCHAR(255), +-- permission_level VARCHAR(20) DEFAULT 'view', +-- created_at TIMESTAMPTZ DEFAULT NOW(), +-- CONSTRAINT check_cal_permission CHECK (permission_level IN ('free_busy', 'view', 'edit', 'admin')) +-- ); -CREATE INDEX IF NOT EXISTS idx_calendar_shares_owner ON calendar_shares(owner_id); -CREATE INDEX IF NOT EXISTS idx_calendar_shares_shared ON calendar_shares(shared_with_user); +-- CREATE INDEX IF NOT EXISTS idx_calendar_shares_owner ON calendar_shares(owner_id); +-- CREATE INDEX IF NOT EXISTS idx_calendar_shares_shared ON calendar_shares(shared_with_user); diff --git a/migrations/designer/6.1.2/down.sql b/migrations/6.1.2-01-designer/down.sql similarity index 100% rename from migrations/designer/6.1.2/down.sql rename to migrations/6.1.2-01-designer/down.sql diff --git a/migrations/designer/6.1.2/up.sql b/migrations/6.1.2-01-designer/up.sql similarity index 100% rename from migrations/designer/6.1.2/up.sql rename to migrations/6.1.2-01-designer/up.sql diff --git a/migrations/drive/6.1.3/down.sql b/migrations/6.1.3-01-drive/down.sql similarity index 100% rename from migrations/drive/6.1.3/down.sql rename to migrations/6.1.3-01-drive/down.sql diff --git a/migrations/drive/6.1.3/up.sql b/migrations/6.1.3-01-drive/up.sql similarity index 100% rename from migrations/drive/6.1.3/up.sql rename to migrations/6.1.3-01-drive/up.sql diff --git a/migrations/llm/6.1.4/down.sql b/migrations/6.1.4-01-llm/down.sql similarity index 100% rename from migrations/llm/6.1.4/down.sql rename to migrations/6.1.4-01-llm/down.sql diff --git a/migrations/llm/6.1.4/up.sql b/migrations/6.1.4-01-llm/up.sql similarity index 100% rename from migrations/llm/6.1.4/up.sql rename to migrations/6.1.4-01-llm/up.sql diff --git a/migrations/mail/6.1.5/down.sql 
b/migrations/6.1.5-01-mail/down.sql similarity index 100% rename from migrations/mail/6.1.5/down.sql rename to migrations/6.1.5-01-mail/down.sql diff --git a/migrations/mail/6.1.5/up.sql b/migrations/6.1.5-01-mail/up.sql similarity index 100% rename from migrations/mail/6.1.5/up.sql rename to migrations/6.1.5-01-mail/up.sql diff --git a/migrations/meet/6.1.6/down.sql b/migrations/6.1.6-01-meet/down.sql similarity index 70% rename from migrations/meet/6.1.6/down.sql rename to migrations/6.1.6-01-meet/down.sql index 9a70b14dc..732809efa 100644 --- a/migrations/meet/6.1.6/down.sql +++ b/migrations/6.1.6-01-meet/down.sql @@ -4,4 +4,5 @@ DROP TABLE IF EXISTS meeting_waiting_room; DROP TABLE IF EXISTS meeting_questions; DROP TABLE IF EXISTS meeting_polls; DROP TABLE IF EXISTS meeting_breakout_rooms; -DROP TABLE IF EXISTS meeting_recordings; +-- Note: meeting_recordings table is from 6.0.23 migration, don't drop it +DROP TABLE IF EXISTS meetings; diff --git a/migrations/meet/6.1.6/up.sql b/migrations/6.1.6-01-meet/up.sql similarity index 75% rename from migrations/meet/6.1.6/up.sql rename to migrations/6.1.6-01-meet/up.sql index e7532d85b..57fa17aa9 100644 --- a/migrations/meet/6.1.6/up.sql +++ b/migrations/6.1.6-01-meet/up.sql @@ -1,32 +1,29 @@ -- Legacy Meet Tables extracted from consolidated --- Meeting recordings -CREATE TABLE IF NOT EXISTS meeting_recordings ( +-- Core meetings table (if not exists from scheduled_meetings) +CREATE TABLE IF NOT EXISTS meetings ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - meeting_id UUID NOT NULL, - bot_id UUID NOT NULL REFERENCES bots(id) ON DELETE CASCADE, - recorded_by UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE, - file_path TEXT NOT NULL, - file_size BIGINT NOT NULL DEFAULT 0, - duration_seconds INTEGER, - format VARCHAR(20) DEFAULT 'mp4', - thumbnail_path TEXT, - transcription_path TEXT, - transcription_status VARCHAR(20) DEFAULT 'pending', - is_shared BOOLEAN DEFAULT false, - shared_with_json TEXT DEFAULT '[]', - 
retention_until TIMESTAMPTZ, + scheduled_meeting_id UUID REFERENCES scheduled_meetings(id) ON DELETE SET NULL, + room_id UUID, + title VARCHAR(255) NOT NULL, + status VARCHAR(20) DEFAULT 'active', + started_at TIMESTAMPTZ DEFAULT NOW(), + ended_at TIMESTAMPTZ, created_at TIMESTAMPTZ DEFAULT NOW(), - CONSTRAINT check_transcription_status CHECK (transcription_status IN ('pending', 'processing', 'completed', 'failed')) + CONSTRAINT check_meeting_status CHECK (status IN ('active', 'ended', 'cancelled')) ); -CREATE INDEX IF NOT EXISTS idx_meeting_recordings_meeting ON meeting_recordings(meeting_id); -CREATE INDEX IF NOT EXISTS idx_meeting_recordings_bot ON meeting_recordings(bot_id); +CREATE INDEX IF NOT EXISTS idx_meetings_scheduled ON meetings(scheduled_meeting_id); +CREATE INDEX IF NOT EXISTS idx_meetings_status ON meetings(status); + +-- Meeting recordings (legacy table already exists, skip creation) +-- Note: meeting_recordings table already exists from 6.0.23 migration with different schema +-- This migration creates additional meeting-related tables that reference the new meetings table -- Breakout rooms CREATE TABLE IF NOT EXISTS meeting_breakout_rooms ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - meeting_id UUID NOT NULL, + meeting_id UUID NOT NULL REFERENCES meetings(id) ON DELETE CASCADE, name VARCHAR(100) NOT NULL, room_number INTEGER NOT NULL, participants_json TEXT DEFAULT '[]', @@ -41,7 +38,7 @@ CREATE INDEX IF NOT EXISTS idx_breakout_rooms_meeting ON meeting_breakout_rooms( -- Meeting polls CREATE TABLE IF NOT EXISTS meeting_polls ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - meeting_id UUID NOT NULL, + meeting_id UUID NOT NULL REFERENCES meetings(id) ON DELETE CASCADE, created_by UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE, question TEXT NOT NULL, poll_type VARCHAR(20) DEFAULT 'single', @@ -60,7 +57,7 @@ CREATE INDEX IF NOT EXISTS idx_meeting_polls_meeting ON meeting_polls(meeting_id -- Meeting Q&A CREATE TABLE IF NOT EXISTS 
meeting_questions ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - meeting_id UUID NOT NULL, + meeting_id UUID NOT NULL REFERENCES meetings(id) ON DELETE CASCADE, asked_by UUID REFERENCES users(id) ON DELETE SET NULL, question TEXT NOT NULL, is_anonymous BOOLEAN DEFAULT false, @@ -78,7 +75,7 @@ CREATE INDEX IF NOT EXISTS idx_meeting_questions_unanswered ON meeting_questions -- Meeting waiting room CREATE TABLE IF NOT EXISTS meeting_waiting_room ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - meeting_id UUID NOT NULL, + meeting_id UUID NOT NULL REFERENCES meetings(id) ON DELETE CASCADE, user_id UUID REFERENCES users(id) ON DELETE CASCADE, guest_name VARCHAR(255), guest_email VARCHAR(255), @@ -96,7 +93,7 @@ CREATE INDEX IF NOT EXISTS idx_waiting_room_status ON meeting_waiting_room(meeti -- Meeting live captions CREATE TABLE IF NOT EXISTS meeting_captions ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - meeting_id UUID NOT NULL, + meeting_id UUID NOT NULL REFERENCES meetings(id) ON DELETE CASCADE, speaker_id UUID REFERENCES users(id) ON DELETE SET NULL, speaker_name VARCHAR(255), caption_text TEXT NOT NULL, diff --git a/migrations/paper/6.1.7/down.sql b/migrations/6.1.7-01-paper/down.sql similarity index 100% rename from migrations/paper/6.1.7/down.sql rename to migrations/6.1.7-01-paper/down.sql diff --git a/migrations/paper/6.1.7/up.sql b/migrations/6.1.7-01-paper/up.sql similarity index 100% rename from migrations/paper/6.1.7/up.sql rename to migrations/6.1.7-01-paper/up.sql diff --git a/migrations/research/6.1.8/down.sql b/migrations/6.1.8-01-research/down.sql similarity index 100% rename from migrations/research/6.1.8/down.sql rename to migrations/6.1.8-01-research/down.sql diff --git a/migrations/research/6.1.8/up.sql b/migrations/6.1.8-01-research/up.sql similarity index 100% rename from migrations/research/6.1.8/up.sql rename to migrations/6.1.8-01-research/up.sql diff --git a/migrations/tasks/6.1.9/down.sql b/migrations/6.1.9-01-tasks/down.sql 
similarity index 100% rename from migrations/tasks/6.1.9/down.sql rename to migrations/6.1.9-01-tasks/down.sql diff --git a/migrations/tasks/6.1.9/up.sql b/migrations/6.1.9-01-tasks/up.sql similarity index 100% rename from migrations/tasks/6.1.9/up.sql rename to migrations/6.1.9-01-tasks/up.sql diff --git a/migrations/automation/6.2.0/down.sql b/migrations/6.2.0-01-automation/down.sql similarity index 100% rename from migrations/automation/6.2.0/down.sql rename to migrations/6.2.0-01-automation/down.sql diff --git a/migrations/automation/6.2.0/up.sql b/migrations/6.2.0-01-automation/up.sql similarity index 92% rename from migrations/automation/6.2.0/up.sql rename to migrations/6.2.0-01-automation/up.sql index e77bf8d9f..5c3a48bb1 100644 --- a/migrations/automation/6.2.0/up.sql +++ b/migrations/6.2.0-01-automation/up.sql @@ -1,3 +1,6 @@ +-- Drop existing workflow_executions table if it exists (from older schema) +DROP TABLE IF EXISTS workflow_executions CASCADE; + -- Workflow state persistence (survives server restart) CREATE TABLE workflow_executions ( id UUID PRIMARY KEY, diff --git a/migrations/core/6.2.1/down.sql b/migrations/6.2.1-00-refresh-policy/down.sql similarity index 100% rename from migrations/core/6.2.1/down.sql rename to migrations/6.2.1-00-refresh-policy/down.sql diff --git a/migrations/core/6.2.1/up.sql b/migrations/6.2.1-00-refresh-policy/up.sql similarity index 100% rename from migrations/core/6.2.1/up.sql rename to migrations/6.2.1-00-refresh-policy/up.sql diff --git a/migrations/6.2.1-01-core/down.sql b/migrations/6.2.1-01-core/down.sql new file mode 100644 index 000000000..3bb7d2cc3 --- /dev/null +++ b/migrations/6.2.1-01-core/down.sql @@ -0,0 +1,3 @@ +-- Remove the refresh_policy column from website_crawls table +ALTER TABLE website_crawls +DROP COLUMN IF EXISTS refresh_policy; diff --git a/migrations/6.2.1-01-core/up.sql b/migrations/6.2.1-01-core/up.sql new file mode 100644 index 000000000..8be92cbf5 --- /dev/null +++ 
b/migrations/6.2.1-01-core/up.sql @@ -0,0 +1,13 @@ +-- Add refresh_policy column to website_crawls table +-- This column stores the user-configured refresh interval (e.g., "1d", "1w", "1m", "1y") + +ALTER TABLE website_crawls +ADD COLUMN IF NOT EXISTS refresh_policy VARCHAR(20); + +-- Update existing records to have a default refresh policy (1 month) +UPDATE website_crawls +SET refresh_policy = '1m' +WHERE refresh_policy IS NULL; + +-- Add comment for documentation +COMMENT ON COLUMN website_crawls.refresh_policy IS 'User-configured refresh interval (e.g., "1d", "1w", "1m", "1y") - shortest interval is used when duplicates exist'; diff --git a/migrations/6.2.2-01-fix-database-name/down.sql b/migrations/6.2.2-01-fix-database-name/down.sql new file mode 100644 index 000000000..7f81ff1fa --- /dev/null +++ b/migrations/6.2.2-01-fix-database-name/down.sql @@ -0,0 +1,2 @@ +DROP INDEX IF EXISTS idx_bots_database_name; +ALTER TABLE bots DROP COLUMN IF EXISTS database_name; diff --git a/migrations/6.2.2-01-fix-database-name/up.sql b/migrations/6.2.2-01-fix-database-name/up.sql new file mode 100644 index 000000000..48d970cb3 --- /dev/null +++ b/migrations/6.2.2-01-fix-database-name/up.sql @@ -0,0 +1,2 @@ +ALTER TABLE bots ADD COLUMN IF NOT EXISTS database_name VARCHAR(255); +CREATE INDEX IF NOT EXISTS idx_bots_database_name ON bots(database_name); diff --git a/src/auto_task/app_generator.rs b/src/auto_task/app_generator.rs index 2bc41a9f5..1f6f162c2 100644 --- a/src/auto_task/app_generator.rs +++ b/src/auto_task/app_generator.rs @@ -1,14 +1,13 @@ use crate::auto_task::app_logs::{log_generator_error, log_generator_info}; -use std::sync::OnceLock; use crate::auto_task::task_manifest::{ - create_manifest_from_llm_response, FieldDefinition as ManifestField, - FileDefinition, ManifestStatus, MonitorDefinition, PageDefinition, - SchedulerDefinition, SectionStatus, SectionType, TableDefinition as ManifestTable, - TaskManifest, TerminalLineType, ToolDefinition, + 
create_manifest_from_llm_response, FieldDefinition as ManifestField, FileDefinition, + ManifestData, ManifestStatus, MonitorDefinition, PageDefinition, SchedulerDefinition, SectionStatus, + SectionType, TableDefinition as ManifestTable, TaskManifest, TerminalLineType, ToolDefinition, }; use crate::basic::keywords::table_definition::{ generate_create_table_sql, FieldDefinition, TableDefinition, }; +use std::sync::OnceLock; use crate::core::shared::get_content_type; use crate::core::shared::models::UserSession; @@ -200,8 +199,13 @@ impl AppGenerator { fn create_manifest_from_llm_app(&mut self, llm_app: &LlmGeneratedApp) { use crate::auto_task::task_manifest::ManifestSection; - log::info!("[MANIFEST_CREATE] Creating manifest from LLM app: {} tables, {} files, {} tools, {} schedulers", - llm_app.tables.len(), llm_app.files.len(), llm_app.tools.len(), llm_app.schedulers.len()); + log::info!( + "Creating manifest from LLM app: {} tables, {} files, {} tools, {} schedulers", + llm_app.tables.len(), + llm_app.files.len(), + llm_app.tools.len(), + llm_app.schedulers.len() + ); let tables: Vec = llm_app .tables @@ -262,40 +266,58 @@ impl AppGenerator { let monitors: Vec = Vec::new(); // Create new manifest from LLM response - log::info!("[MANIFEST_CREATE] Calling create_manifest_from_llm_response with {} tables, {} files, {} pages, {} tools", + log::info!("Calling create_manifest_from_llm_response with {} tables, {} files, {} pages, {} tools", tables.len(), files.len(), pages.len(), tools.len()); let mut new_manifest = create_manifest_from_llm_response( &llm_app.name, &llm_app.description, - tables, - files, - pages, - tools, - schedulers, - monitors, + ManifestData { + tables, + files, + pages, + tools, + schedulers, + monitors, + }, ); - log::info!("[MANIFEST_CREATE] New manifest created with {} sections:", new_manifest.sections.len()); + log::info!( + "[MANIFEST_CREATE] New manifest created with {} sections:", + new_manifest.sections.len() + ); for section in 
&new_manifest.sections { - log::info!("[MANIFEST_CREATE] Section '{}': {} children, {} items, {} item_groups", - section.name, section.children.len(), section.items.len(), section.item_groups.len()); + log::info!( + " Section '{}': {} children, {} items, {} item_groups", + section.name, + section.children.len(), + section.items.len(), + section.item_groups.len() + ); for child in §ion.children { - log::info!("[MANIFEST_CREATE] Child '{}': {} items, {} item_groups", - child.name, child.items.len(), child.item_groups.len()); + log::info!( + " Child '{}': {} items, {} item_groups", + child.name, + child.items.len(), + child.item_groups.len() + ); } } // Mark "Analyzing Request" as completed and add it to the beginning - let mut analyzing_section = ManifestSection::new("Analyzing Request", SectionType::Validation); + let mut analyzing_section = + ManifestSection::new("Analyzing Request", SectionType::Validation); analyzing_section.total_steps = 1; analyzing_section.current_step = 1; analyzing_section.status = SectionStatus::Completed; - analyzing_section.started_at = self.manifest.as_ref() + analyzing_section.started_at = self + .manifest + .as_ref() .and_then(|m| m.sections.first()) .and_then(|s| s.started_at); analyzing_section.completed_at = Some(Utc::now()); - analyzing_section.duration_seconds = analyzing_section.started_at + analyzing_section.duration_seconds = analyzing_section + .started_at .map(|started| (Utc::now() - started).num_seconds() as u64); // Insert "Analyzing Request" at the beginning of sections @@ -311,20 +333,26 @@ impl AppGenerator { } new_manifest.start(); - new_manifest.add_terminal_line(&format!("AI planned: {} tables, {} files, {} tools", - llm_app.tables.len(), llm_app.files.len(), llm_app.tools.len()), - TerminalLineType::Success); + new_manifest.add_terminal_line( + &format!( + "AI planned: {} tables, {} files, {} tools", + llm_app.tables.len(), + llm_app.files.len(), + llm_app.tools.len() + ), + TerminalLineType::Success, + ); 
self.manifest = Some(new_manifest); if let (Some(ref task_id), Some(ref manifest)) = (&self.task_id, &self.manifest) { if let Ok(mut manifests) = self.state.task_manifests.write() { - log::info!("[MANIFEST_CREATE] Storing manifest for task_id: {}", task_id); + log::info!("Storing manifest for task_id: {}", task_id); manifests.insert(task_id.clone(), manifest.clone()); } } - log::info!("[MANIFEST_CREATE] Broadcasting manifest update"); + log::info!("Broadcasting manifest update"); self.broadcast_manifest_update(); } @@ -332,14 +360,10 @@ impl AppGenerator { if let (Some(ref task_id), Some(ref manifest)) = (&self.task_id, &self.manifest) { // Log the TASK.md structure for debugging let task_md = manifest.to_task_md(); - log::info!( - "[TASK.md] task={}\n{}", - task_id, - task_md - ); + log::info!("task={}\n{}", task_id, task_md); log::info!( - "[MANIFEST_BROADCAST] task={} completed={}/{} sections={}", + "task={} completed={}/{} sections={}", task_id, manifest.completed_steps, manifest.total_steps, @@ -350,7 +374,7 @@ impl AppGenerator { for section in &manifest.sections { let status = format!("{:?}", section.status); log::info!( - "[MANIFEST_BROADCAST] Section '{}': status={}, children={}, items={}, item_groups={}", + " Section '{}': status={}, children={}, items={}, item_groups={}", section.name, status, section.children.len(), @@ -360,7 +384,7 @@ impl AppGenerator { for child in §ion.children { let child_status = format!("{:?}", child.status); log::info!( - "[MANIFEST_BROADCAST] Child '{}': status={}, items={}, item_groups={}", + " Child '{}': status={}, items={}, item_groups={}", child.name, child_status, child.items.len(), @@ -375,7 +399,7 @@ impl AppGenerator { let json_details = serde_json::to_string(&manifest.to_web_json()).unwrap_or_default(); let json_size = json_details.len(); - log::info!("[MANIFEST_BROADCAST] JSON size: {} bytes", json_size); + log::info!("JSON size: {} bytes", json_size); // Persist manifest to database for historical viewing 
self.persist_manifest_to_db(task_id, &json_details); @@ -383,7 +407,10 @@ impl AppGenerator { // Build the event - if manifest JSON is too large (> 64KB), send without details // to avoid WebSocket frame size issues. Client will fetch full manifest via API. let event = if json_size > 65536 { - log::warn!("[MANIFEST_BROADCAST] Manifest too large ({} bytes), sending without details", json_size); + log::warn!( + "Manifest too large ({} bytes), sending without details", + json_size + ); crate::core::shared::state::TaskProgressEvent::new( task_id, "manifest_update", @@ -404,8 +431,11 @@ impl AppGenerator { // Log the final serialized event size if let Ok(event_json) = serde_json::to_string(&event) { - log::info!("[MANIFEST_BROADCAST] Final event size: {} bytes (has_details={})", - event_json.len(), json_size <= 65536); + log::info!( + "Final event size: {} bytes (has_details={})", + event_json.len(), + json_size <= 65536 + ); } self.state.broadcast_task_progress(event); @@ -414,48 +444,47 @@ impl AppGenerator { fn persist_manifest_to_db(&self, task_id: &str, manifest_json: &str) { let Ok(task_uuid) = Uuid::parse_str(task_id) else { - log::warn!("[MANIFEST_PERSIST] Invalid task_id: {}", task_id); + log::warn!("Invalid task_id: {}", task_id); return; }; let Ok(mut conn) = self.state.conn.get() else { - log::warn!("[MANIFEST_PERSIST] Failed to get DB connection for task: {}", task_id); + log::warn!("Failed to get DB connection for task: {}", task_id); return; }; let manifest_value: serde_json::Value = match serde_json::from_str(manifest_json) { Ok(v) => v, Err(e) => { - log::warn!("[MANIFEST_PERSIST] Failed to parse manifest JSON: {}", e); + log::warn!("Failed to parse manifest JSON: {}", e); return; } }; - let result = sql_query( - "UPDATE auto_tasks SET manifest_json = $1, updated_at = NOW() WHERE id = $2", - ) - .bind::(manifest_value) - .bind::(task_uuid) - .execute(&mut conn); + let result = + sql_query("UPDATE auto_tasks SET manifest_json = $1, updated_at = NOW() 
WHERE id = $2") + .bind::(manifest_value) + .bind::(task_uuid) + .execute(&mut conn); match result { - Ok(_) => log::trace!("[MANIFEST_PERSIST] Saved manifest for task: {}", task_id), - Err(e) => log::warn!("[MANIFEST_PERSIST] Failed to save manifest: {}", e), + Ok(_) => log::trace!("Saved manifest for task: {}", task_id), + Err(e) => log::warn!("Failed to save manifest: {}", e), } } fn update_manifest_section(&mut self, section_type: SectionType, status: SectionStatus) { if let Some(ref mut manifest) = self.manifest { - log::info!("[UPDATE_SECTION] Looking for {:?} to set {:?}", section_type, status); - log::info!("[UPDATE_SECTION] Manifest has {} sections:", manifest.sections.len()); + log::info!("Looking for {:?} to set {:?}", section_type, status); + log::info!("Manifest has {} sections:", manifest.sections.len()); for (i, s) in manifest.sections.iter().enumerate() { - log::info!("[UPDATE_SECTION] [{}] {:?} = '{}'", i, s.section_type, s.name); + log::info!(" [{}] {:?} = '{}'", i, s.section_type, s.name); } let mut found = false; for section in &mut manifest.sections { if section.section_type == section_type { found = true; - log::info!("[UPDATE_SECTION] Found section '{}'! Setting to {:?}", section.name, status); + log::info!("Found section '{}'! Setting to {:?}", section.name, status); section.status = status.clone(); if status == SectionStatus::Running { section.started_at = Some(Utc::now()); @@ -476,17 +505,22 @@ impl AppGenerator { } } if !found { - log::warn!("[UPDATE_SECTION] Section {:?} NOT FOUND in manifest!", section_type); + log::warn!("Section {:?} NOT FOUND in manifest!", section_type); } manifest.updated_at = Utc::now(); self.broadcast_manifest_update(); } else { - log::warn!("[UPDATE_SECTION] No manifest exists! Cannot update {:?}", section_type); + log::warn!("No manifest exists! 
Cannot update {:?}", section_type); } } /// Update a child section within a parent section - fn update_manifest_child(&mut self, parent_type: SectionType, child_type: SectionType, status: SectionStatus) { + fn update_manifest_child( + &mut self, + parent_type: SectionType, + child_type: SectionType, + status: SectionStatus, + ) { if let Some(ref mut manifest) = self.manifest { for section in &mut manifest.sections { if section.section_type == parent_type { @@ -515,7 +549,13 @@ impl AppGenerator { } /// Mark a range of item groups as completed with duration - fn complete_item_group_range(&mut self, parent_type: SectionType, child_type: SectionType, start_idx: usize, end_idx: usize) { + fn complete_item_group_range( + &mut self, + parent_type: SectionType, + child_type: SectionType, + start_idx: usize, + end_idx: usize, + ) { if let Some(ref mut manifest) = self.manifest { for section in &mut manifest.sections { if section.section_type == parent_type { @@ -535,36 +575,42 @@ impl AppGenerator { } } // Update child step progress - child.current_step = child.item_groups.iter() + child.current_step = child + .item_groups + .iter() .filter(|g| g.status == crate::auto_task::ItemStatus::Completed) .count() as u32; // Check if all item_groups in child are completed, then mark child as completed - let all_groups_completed = child.item_groups.iter() + let all_groups_completed = child + .item_groups + .iter() .all(|g| g.status == crate::auto_task::ItemStatus::Completed); if all_groups_completed && !child.item_groups.is_empty() { child.status = SectionStatus::Completed; child.completed_at = Some(Utc::now()); if let Some(started) = child.started_at { - child.duration_seconds = Some((Utc::now() - started).num_seconds() as u64); + child.duration_seconds = + Some((Utc::now() - started).num_seconds() as u64); } } break; } } // Update parent step progress - section.current_step = section.children.iter() - .map(|c| c.current_step) - .sum(); + section.current_step = 
section.children.iter().map(|c| c.current_step).sum(); // Check if all children in section are completed, then mark section as completed - let all_children_completed = section.children.iter() + let all_children_completed = section + .children + .iter() .all(|c| c.status == SectionStatus::Completed); if all_children_completed && !section.children.is_empty() { section.status = SectionStatus::Completed; section.completed_at = Some(Utc::now()); if let Some(started) = section.started_at { - section.duration_seconds = Some((Utc::now() - started).num_seconds() as u64); + section.duration_seconds = + Some((Utc::now() - started).num_seconds() as u64); } } break; @@ -577,11 +623,11 @@ impl AppGenerator { fn add_terminal_output(&mut self, content: &str, line_type: TerminalLineType) { if let Some(ref mut manifest) = self.manifest { - log::info!("[TERMINAL_OUTPUT] Adding line: {:?} - '{}'", line_type, content); + log::info!("Adding line: {:?} - '{}'", line_type, content); manifest.add_terminal_line(content, line_type); self.broadcast_manifest_update(); } else { - log::warn!("[TERMINAL_OUTPUT] No manifest! Cannot add: '{}'", content); + log::warn!("No manifest! 
Cannot add: '{}'", content); } } @@ -598,25 +644,24 @@ impl AppGenerator { let mut manifest = TaskManifest::new(&app_name, intent); // Section 1: Analyzing Request (LLM call) - let mut analyzing_section = ManifestSection::new("Analyzing Request", SectionType::Validation); + let mut analyzing_section = + ManifestSection::new("Analyzing Request", SectionType::Validation); analyzing_section.total_steps = 1; analyzing_section.status = SectionStatus::Running; analyzing_section.started_at = Some(Utc::now()); manifest.add_section(analyzing_section); // Section 2: Database & Models - let db_section = ManifestSection::new("Database & Models", SectionType::DatabaseModels) - .with_steps(1); + let db_section = + ManifestSection::new("Database & Models", SectionType::DatabaseModels).with_steps(1); manifest.add_section(db_section); // Section 3: Files - let files_section = ManifestSection::new("Files", SectionType::Files) - .with_steps(1); + let files_section = ManifestSection::new("Files", SectionType::Files).with_steps(1); manifest.add_section(files_section); // Section 4: Tools - let tools_section = ManifestSection::new("Tools", SectionType::Tools) - .with_steps(1); + let tools_section = ManifestSection::new("Tools", SectionType::Tools).with_steps(1); manifest.add_section(tools_section); manifest.status = ManifestStatus::Running; @@ -640,7 +685,7 @@ impl AppGenerator { if let (Some(ref task_id), Some(ref manifest)) = (&self.task_id, &self.manifest) { if let Ok(mut manifests) = self.state.task_manifests.write() { - log::info!("[MANIFEST] Storing preliminary manifest for task_id: {}", task_id); + log::info!("Storing preliminary manifest for task_id: {}", task_id); manifests.insert(task_id.clone(), manifest.clone()); } } @@ -651,7 +696,8 @@ impl AppGenerator { fn update_manifest_stats_real(&mut self, broadcast: bool) { if let Some(ref mut manifest) = self.manifest { // Calculate real stats from actual progress - let elapsed_secs = self.generation_start + let elapsed_secs = self 
+ .generation_start .map(|s| s.elapsed().as_secs_f64()) .unwrap_or(0.0); @@ -661,7 +707,8 @@ impl AppGenerator { // Real processing speed based on actual items processed if elapsed_secs > 0.0 { - manifest.processing_stats.sources_per_min = (data_points as f64 / elapsed_secs) * 60.0; + manifest.processing_stats.sources_per_min = + (data_points as f64 / elapsed_secs) * 60.0; } // Estimate remaining time based on current progress @@ -670,7 +717,8 @@ impl AppGenerator { if completed > 0.0 && elapsed_secs > 0.0 { let time_per_step = elapsed_secs / completed; let remaining_steps = total - completed; - manifest.processing_stats.estimated_remaining_seconds = (time_per_step * remaining_steps) as u64; + manifest.processing_stats.estimated_remaining_seconds = + (time_per_step * remaining_steps) as u64; } // Update runtime @@ -683,7 +731,13 @@ impl AppGenerator { } /// Update a specific item's status within a section (with optional broadcast) - fn update_item_status_internal(&mut self, section_type: SectionType, item_name: &str, status: crate::auto_task::ItemStatus, broadcast: bool) { + fn update_item_status_internal( + &mut self, + section_type: SectionType, + item_name: &str, + status: crate::auto_task::ItemStatus, + broadcast: bool, + ) { let mut found = false; if let Some(ref mut manifest) = self.manifest { for section in &mut manifest.sections { @@ -697,14 +751,17 @@ impl AppGenerator { } else if status == crate::auto_task::ItemStatus::Completed { item.completed_at = Some(Utc::now()); if let Some(started) = item.started_at { - item.duration_seconds = Some((Utc::now() - started).num_seconds() as u64); + item.duration_seconds = + Some((Utc::now() - started).num_seconds() as u64); } } found = true; break; } } - if found { break; } + if found { + break; + } // Check items in children for child in &mut section.children { for item in &mut child.items { @@ -715,17 +772,21 @@ impl AppGenerator { } else if status == crate::auto_task::ItemStatus::Completed { item.completed_at = 
Some(Utc::now()); if let Some(started) = item.started_at { - item.duration_seconds = Some((Utc::now() - started).num_seconds() as u64); + item.duration_seconds = + Some((Utc::now() - started).num_seconds() as u64); } child.current_step += 1; // Check if all items in child are completed, then mark child as completed - let all_completed = child.items.iter().all(|i| i.status == crate::auto_task::ItemStatus::Completed); + let all_completed = child.items.iter().all(|i| { + i.status == crate::auto_task::ItemStatus::Completed + }); if all_completed && !child.items.is_empty() { child.status = SectionStatus::Completed; child.completed_at = Some(Utc::now()); if let Some(started) = child.started_at { - child.duration_seconds = Some((Utc::now() - started).num_seconds() as u64); + child.duration_seconds = + Some((Utc::now() - started).num_seconds() as u64); } } } @@ -733,10 +794,14 @@ impl AppGenerator { break; } } - if found { break; } + if found { + break; + } } } - if found { break; } + if found { + break; + } } } // Broadcast update so UI shows real-time file progress @@ -746,13 +811,26 @@ impl AppGenerator { } /// Update a specific item's status within a section (always broadcasts) - fn update_item_status(&mut self, section_type: SectionType, item_name: &str, status: crate::auto_task::ItemStatus) { + fn update_item_status( + &mut self, + section_type: SectionType, + item_name: &str, + status: crate::auto_task::ItemStatus, + ) { self.update_item_status_internal(section_type, item_name, status, true); } - fn emit_activity(&self, step: &str, message: &str, current: u8, total: u8, activity: AgentActivity) { + fn emit_activity( + &self, + step: &str, + message: &str, + current: u8, + total: u8, + activity: AgentActivity, + ) { if let Some(ref task_id) = self.task_id { - self.state.emit_activity(task_id, step, message, current, total, activity); + self.state + .emit_activity(task_id, step, message, current, total, activity); } } @@ -767,7 +845,13 @@ impl AppGenerator { (0.0, 
None) } - fn build_activity(&self, phase: &str, items_done: u32, items_total: Option, current_item: Option<&str>) -> AgentActivity { + fn build_activity( + &self, + phase: &str, + items_done: u32, + items_total: Option, + current_item: Option<&str>, + ) -> AgentActivity { let (speed, eta) = self.calculate_speed(items_done); let mut activity = AgentActivity::new(phase) .with_progress(items_done, items_total) @@ -805,38 +889,44 @@ impl AppGenerator { self.bytes_generated = 0; let intent_preview: String = intent.chars().take(100).collect(); - info!( - "Generating app from intent: {}", - intent_preview - ); + info!("Generating app from intent: {}", intent_preview); let intent_short: String = intent.chars().take(50).collect(); log_generator_info( "pending", - &format!( - "Starting app generation: {}", - intent_short - ), + &format!("Starting app generation: {}", intent_short), ); if let Some(ref task_id) = self.task_id { let intent_msg: String = intent.chars().take(50).collect(); - self.state.emit_task_started(task_id, &format!("Generating app: {}", intent_msg), TOTAL_STEPS); + self.state.emit_task_started( + task_id, + &format!("Generating app: {}", intent_msg), + TOTAL_STEPS, + ); self.create_preliminary_manifest(intent); } - let activity = self.build_activity("analyzing", 0, Some(TOTAL_STEPS as u32), Some("Sending request to LLM")); + let activity = self.build_activity( + "analyzing", + 0, + Some(TOTAL_STEPS as u32), + Some("Sending request to LLM"), + ); self.emit_activity( "llm_request", "Analyzing request with AI...", 1, TOTAL_STEPS, - activity + activity, ); // ========== PHASE 1: Get project plan (structure only) ========== let intent_trace: String = intent.chars().take(50).collect(); - trace!("APP_GENERATOR [PHASE1] Getting project plan for: {}", intent_trace); + trace!( + "APP_GENERATOR [PHASE1] Getting project plan for: {}", + intent_trace + ); let plan_start = std::time::Instant::now(); let mut llm_app = match self.get_project_plan_from_llm(intent, 
session.bot_id).await { @@ -845,17 +935,29 @@ impl AppGenerator { info!("APP_GENERATOR [PHASE1] Plan received in {:?}: app={}, tables={}, files={}, tools={}", plan_elapsed, plan.name, plan.tables.len(), plan.files.len(), plan.tools.len()); - let is_empty_plan = plan.files.is_empty() && plan.tables.is_empty() && plan.tools.is_empty(); + let is_empty_plan = + plan.files.is_empty() && plan.tables.is_empty() && plan.tools.is_empty(); if is_empty_plan { warn!("APP_GENERATOR [PHASE1] Empty plan received, falling back to single-phase generation"); - self.add_terminal_output("Plan parsing returned empty, trying full generation...", TerminalLineType::Warning); - match self.generate_complete_app_with_llm(intent, session.bot_id).await { + self.add_terminal_output( + "Plan parsing returned empty, trying full generation...", + TerminalLineType::Warning, + ); + match self + .generate_complete_app_with_llm(intent, session.bot_id) + .await + { Ok(app) => app, Err(e2) => { - log_generator_error("unknown", "LLM app generation failed", &e2.to_string()); + log_generator_error( + "unknown", + "LLM app generation failed", + &e2.to_string(), + ); if let Some(ref task_id) = self.task_id { - self.state.emit_task_error(task_id, "llm_request", &e2.to_string()); + self.state + .emit_task_error(task_id, "llm_request", &e2.to_string()); } return Err(e2); } @@ -865,27 +967,46 @@ impl AppGenerator { "planning", 1, Some(TOTAL_STEPS as u32), - Some(&format!("Planned {} with {} files", plan.name, plan.files.len())) + Some(&format!( + "Planned {} with {} files", + plan.name, + plan.files.len() + )), ); self.emit_activity( "plan_complete", - &format!("Project plan ready: {} tables, {} files", plan.tables.len(), plan.files.len()), + &format!( + "Project plan ready: {} tables, {} files", + plan.tables.len(), + plan.files.len() + ), 2, TOTAL_STEPS, - activity + activity, ); plan } } Err(e) => { error!("APP_GENERATOR [PHASE1] Planning failed: {}", e); - self.add_terminal_output(&format!("Planning error: 
{e}, trying full generation..."), TerminalLineType::Warning); - match self.generate_complete_app_with_llm(intent, session.bot_id).await { + self.add_terminal_output( + &format!("Planning error: {e}, trying full generation..."), + TerminalLineType::Warning, + ); + match self + .generate_complete_app_with_llm(intent, session.bot_id) + .await + { Ok(app) => app, Err(e2) => { - log_generator_error("unknown", "LLM app generation failed", &e2.to_string()); + log_generator_error( + "unknown", + "LLM app generation failed", + &e2.to_string(), + ); if let Some(ref task_id) = self.task_id { - self.state.emit_task_error(task_id, "llm_request", &e2.to_string()); + self.state + .emit_task_error(task_id, "llm_request", &e2.to_string()); } return Err(e2); } @@ -903,18 +1024,47 @@ impl AppGenerator { self.broadcast_manifest_update(); info!("APP_GENERATOR [PHASE1->2] Manifest created with full structure, starting content generation"); - self.add_terminal_output(&format!("## Project Plan: {}", llm_app.name), TerminalLineType::Info); - self.add_terminal_output(&format!("- Tables: {}", llm_app.tables.len()), TerminalLineType::Info); - self.add_terminal_output(&format!("- Files: {}", llm_app.files.len()), TerminalLineType::Info); - self.add_terminal_output(&format!("- Tools: {}", llm_app.tools.len()), TerminalLineType::Info); - self.add_terminal_output(&format!("- Schedulers: {}", llm_app.schedulers.len()), TerminalLineType::Info); + self.add_terminal_output( + &format!("## Project Plan: {}", llm_app.name), + TerminalLineType::Info, + ); + self.add_terminal_output( + &format!("- Tables: {}", llm_app.tables.len()), + TerminalLineType::Info, + ); + self.add_terminal_output( + &format!("- Files: {}", llm_app.files.len()), + TerminalLineType::Info, + ); + self.add_terminal_output( + &format!("- Tools: {}", llm_app.tools.len()), + TerminalLineType::Info, + ); + self.add_terminal_output( + &format!("- Schedulers: {}", llm_app.schedulers.len()), + TerminalLineType::Info, + ); 
self.add_terminal_output("", TerminalLineType::Info); - self.add_terminal_output("## Phase 2: Generating content...", TerminalLineType::Progress); + self.add_terminal_output( + "## Phase 2: Generating content...", + TerminalLineType::Progress, + ); self.update_manifest_stats_real(true); // ========== PHASE 2A: DATABASE & MODELS (must come first!) ========== - let activity = self.build_activity("parsing", 2, Some(TOTAL_STEPS as u32), Some(&format!("Processing {} structure", llm_app.name))); - self.emit_activity("parse_structure", &format!("Parsing {} structure...", llm_app.name), 3, TOTAL_STEPS, activity); + let activity = self.build_activity( + "parsing", + 2, + Some(TOTAL_STEPS as u32), + Some(&format!("Processing {} structure", llm_app.name)), + ); + self.emit_activity( + "parse_structure", + &format!("Parsing {} structure...", llm_app.name), + 3, + TOTAL_STEPS, + activity, + ); let tables = Self::convert_llm_tables(&llm_app.tables); @@ -922,7 +1072,11 @@ impl AppGenerator { info!("[PHASE2] Setting Database & Models section to Running"); self.update_manifest_section(SectionType::DatabaseModels, SectionStatus::Running); self.broadcast_manifest_update(); - self.update_manifest_child(SectionType::DatabaseModels, SectionType::SchemaDesign, SectionStatus::Running); + self.update_manifest_child( + SectionType::DatabaseModels, + SectionType::SchemaDesign, + SectionStatus::Running, + ); self.add_terminal_output("## Creating database schema...", TerminalLineType::Progress); self.update_manifest_stats_real(true); @@ -931,14 +1085,14 @@ impl AppGenerator { "database", 3, Some(TOTAL_STEPS as u32), - Some(&format!("Creating tables: {}", table_names.join(", "))) + Some(&format!("Creating tables: {}", table_names.join(", "))), ); self.emit_activity( "create_tables", &format!("Creating {} database tables...", tables.len()), 4, TOTAL_STEPS, - activity + activity, ); let tables_bas_content = Self::generate_table_definitions(&tables)?; @@ -957,10 +1111,17 @@ impl AppGenerator { for 
(idx, table) in tables.iter().enumerate() { // Update current action to show which table is being processed - self.add_terminal_output(&format!(" Creating table `{}`...", table.name), TerminalLineType::Info); + self.add_terminal_output( + &format!(" Creating table `{}`...", table.name), + TerminalLineType::Info, + ); // Mark this specific item as running - self.update_item_status(SectionType::DatabaseModels, &table.name, crate::auto_task::ItemStatus::Running); + self.update_item_status( + SectionType::DatabaseModels, + &table.name, + crate::auto_task::ItemStatus::Running, + ); self.broadcast_manifest_update(); // Sync the individual table to the bot's specific database @@ -970,13 +1131,28 @@ impl AppGenerator { fields_added += field_count; // Mark item as completed and broadcast immediately - self.update_item_status(SectionType::DatabaseModels, &table.name, crate::auto_task::ItemStatus::Completed); - self.add_terminal_output(&format!(" ✓ Table `{}` ({} fields)", table.name, field_count), TerminalLineType::Success); + self.update_item_status( + SectionType::DatabaseModels, + &table.name, + crate::auto_task::ItemStatus::Completed, + ); + self.add_terminal_output( + &format!(" ✓ Table `{}` ({} fields)", table.name, field_count), + TerminalLineType::Success, + ); // Update child progress if let Some(ref mut manifest) = self.manifest { - if let Some(section) = manifest.sections.iter_mut().find(|s| s.section_type == SectionType::DatabaseModels) { - if let Some(child) = section.children.iter_mut().find(|c| c.section_type == SectionType::SchemaDesign) { + if let Some(section) = manifest + .sections + .iter_mut() + .find(|s| s.section_type == SectionType::DatabaseModels) + { + if let Some(child) = section + .children + .iter_mut() + .find(|c| c.section_type == SectionType::SchemaDesign) + { child.current_step = (idx + 1) as u32; } section.current_step = (idx + 1) as u32; @@ -984,7 +1160,12 @@ impl AppGenerator { } // Complete item group if it exists - 
self.complete_item_group_range(SectionType::DatabaseModels, SectionType::SchemaDesign, idx, idx); + self.complete_item_group_range( + SectionType::DatabaseModels, + SectionType::SchemaDesign, + idx, + idx, + ); self.broadcast_manifest_update(); @@ -993,21 +1174,33 @@ impl AppGenerator { "database", 3, Some(total_tables as u32), - Some(&format!("Created table {} ({}/{})", table.name, idx + 1, total_tables)) + Some(&format!( + "Created table {} ({}/{})", + table.name, + idx + 1, + total_tables + )), ); self.emit_activity( "table_created", &format!("Created table {}", table.name), 4, TOTAL_STEPS, - activity + activity, ); } Err(e) => { warn!("Table {} may already exist or failed: {}", table.name, e); // Still mark as completed (table likely exists) - self.update_item_status(SectionType::DatabaseModels, &table.name, crate::auto_task::ItemStatus::Completed); - self.add_terminal_output(&format!(" ⚠ Table `{}` (may exist)", table.name), TerminalLineType::Info); + self.update_item_status( + SectionType::DatabaseModels, + &table.name, + crate::auto_task::ItemStatus::Completed, + ); + self.add_terminal_output( + &format!(" ⚠ Table `{}` (may exist)", table.name), + TerminalLineType::Info, + ); self.broadcast_manifest_update(); } } @@ -1024,7 +1217,11 @@ impl AppGenerator { ); // Mark child and parent as completed - self.update_manifest_child(SectionType::DatabaseModels, SectionType::SchemaDesign, SectionStatus::Completed); + self.update_manifest_child( + SectionType::DatabaseModels, + SectionType::SchemaDesign, + SectionStatus::Completed, + ); self.update_manifest_section(SectionType::DatabaseModels, SectionStatus::Completed); self.update_manifest_stats_real(true); @@ -1032,14 +1229,17 @@ impl AppGenerator { "database", 4, Some(TOTAL_STEPS as u32), - Some(&format!("{} tables, {} fields created", tables_created, fields_added)) + Some(&format!( + "{} tables, {} fields created", + tables_created, fields_added + )), ); self.emit_activity( "tables_synced", "Database tables 
created", 4, TOTAL_STEPS, - activity + activity, ); } else { // No tables - mark database section as skipped @@ -1052,59 +1252,119 @@ impl AppGenerator { let mut generated_count = 0; // Generate content for files that don't have it yet - let files_needing_content: Vec = llm_app.files.iter() + let files_needing_content: Vec = llm_app + .files + .iter() .enumerate() .filter(|(_, f)| f.content.is_empty()) .map(|(i, _)| i) .collect(); - info!("[PHASE2B] Files needing content: {} out of {} total files", files_needing_content.len(), llm_app.files.len()); + info!( + "[PHASE2B] Files needing content: {} out of {} total files", + files_needing_content.len(), + llm_app.files.len() + ); for (i, file) in llm_app.files.iter().enumerate() { - info!("[PHASE2B] File {}: {} - content_len={}", i, file.filename, file.content.len()); + info!( + "[PHASE2B] File {}: {} - content_len={}", + i, + file.filename, + file.content.len() + ); } if !files_needing_content.is_empty() { - info!("[PHASE2B] Setting Files section to Running - manifest exists: {}", self.manifest.is_some()); + info!( + "[PHASE2B] Setting Files section to Running - manifest exists: {}", + self.manifest.is_some() + ); // Debug: List all sections before update if let Some(ref manifest) = self.manifest { info!("[PHASE2B] Current manifest sections:"); for (i, s) in manifest.sections.iter().enumerate() { - info!("[PHASE2B] [{}] {:?} = '{}' status={:?}", i, s.section_type, s.name, s.status); + info!( + "[PHASE2B] [{}] {:?} = '{}' status={:?}", + i, s.section_type, s.name, s.status + ); } } self.update_manifest_section(SectionType::Files, SectionStatus::Running); self.broadcast_manifest_update(); - self.add_terminal_output(&format!("## Generating {} files...", files_needing_content.len()), TerminalLineType::Progress); + self.add_terminal_output( + &format!("## Generating {} files...", files_needing_content.len()), + TerminalLineType::Progress, + ); for idx in files_needing_content { let filename = 
llm_app.files[idx].filename.clone(); generated_count += 1; info!("[PHASE2B] Starting generation for file: {}", filename); - self.add_terminal_output(&format!("Generating `{filename}`..."), TerminalLineType::Info); - self.update_item_status(SectionType::Files, &filename, crate::auto_task::ItemStatus::Running); + self.add_terminal_output( + &format!("Generating `{filename}`..."), + TerminalLineType::Info, + ); + self.update_item_status( + SectionType::Files, + &filename, + crate::auto_task::ItemStatus::Running, + ); - match self.generate_file_content(&llm_app, &filename, session.bot_id).await { + match self + .generate_file_content(&llm_app, &filename, session.bot_id) + .await + { Ok(content) => { let content_len = content.len(); - info!("[PHASE2B] Generated file {} with {} bytes", filename, content_len); + info!( + "[PHASE2B] Generated file {} with {} bytes", + filename, content_len + ); llm_app.files[idx].content = content; - self.add_terminal_output(&format!("✓ `{filename}` ({content_len} bytes)"), TerminalLineType::Success); - self.update_item_status(SectionType::Files, &filename, crate::auto_task::ItemStatus::Completed); + self.add_terminal_output( + &format!("✓ `{filename}` ({content_len} bytes)"), + TerminalLineType::Success, + ); + self.update_item_status( + SectionType::Files, + &filename, + crate::auto_task::ItemStatus::Completed, + ); } Err(e) => { error!("[PHASE2B] Failed to generate {}: {}", filename, e); - self.add_terminal_output(&format!("✗ `{filename}` failed: {e}"), TerminalLineType::Error); + self.add_terminal_output( + &format!("✗ `{filename}` failed: {e}"), + TerminalLineType::Error, + ); } } - let activity = self.build_activity("generating", generated_count as u32, Some(total_items as u32), Some(&filename)); - self.emit_activity("file_generated", &format!("Generated {filename}"), 3, TOTAL_STEPS, activity); + let activity = self.build_activity( + "generating", + generated_count as u32, + Some(total_items as u32), + Some(&filename), + ); + 
self.emit_activity( + "file_generated", + &format!("Generated {filename}"), + 3, + TOTAL_STEPS, + activity, + ); } } else { - info!("[PHASE2B] No files need content generation - all {} files already have content", llm_app.files.len()); - self.add_terminal_output(&format!("All {} files already generated", llm_app.files.len()), TerminalLineType::Success); + info!( + "[PHASE2B] No files need content generation - all {} files already have content", + llm_app.files.len() + ); + self.add_terminal_output( + &format!("All {} files already generated", llm_app.files.len()), + TerminalLineType::Success, + ); } // Mark Files content generation as completed (writing happens next) @@ -1115,10 +1375,14 @@ impl AppGenerator { let sanitized_name = bucket_name.trim_end_matches(".gbai").to_string(); let drive_app_path = format!("{}.gbapp/{}", sanitized_name, llm_app.name); - info!("Writing app files to bucket: {}, path: {}", bucket_name, drive_app_path); + info!( + "Writing app files to bucket: {}, path: {}", + bucket_name, drive_app_path + ); // Build list of files to generate for progress tracking - let mut files_to_generate: Vec = llm_app.files.iter().map(|f| f.filename.clone()).collect(); + let mut files_to_generate: Vec = + llm_app.files.iter().map(|f| f.filename.clone()).collect(); files_to_generate.push("designer.js".to_string()); // Update task with file list before starting @@ -1129,17 +1393,25 @@ impl AppGenerator { } let total_files = llm_app.files.len(); - let activity = self.build_activity("writing", 0, Some(total_files as u32), Some("Preparing files")); + let activity = self.build_activity( + "writing", + 0, + Some(total_files as u32), + Some("Preparing files"), + ); self.emit_activity( "write_files", &format!("Writing {} app files...", total_files), 5, TOTAL_STEPS, - activity + activity, ); self.update_manifest_section(SectionType::Files, SectionStatus::Running); - self.add_terminal_output(&format!("## Writing {} files...", total_files), TerminalLineType::Progress); + 
self.add_terminal_output( + &format!("## Writing {} files...", total_files), + TerminalLineType::Progress, + ); self.update_manifest_stats_real(true); let mut pages = Vec::new(); @@ -1150,21 +1422,28 @@ impl AppGenerator { self.bytes_generated += file.content.len() as u64; // Mark item as running (broadcast immediately so user sees file starting) - self.update_item_status(SectionType::Files, &file.filename, crate::auto_task::ItemStatus::Running); - self.add_terminal_output(&format!("Writing `{}`...", file.filename), TerminalLineType::Info); + self.update_item_status( + SectionType::Files, + &file.filename, + crate::auto_task::ItemStatus::Running, + ); + self.add_terminal_output( + &format!("Writing `{}`...", file.filename), + TerminalLineType::Info, + ); let activity = self.build_activity( "writing", (idx + 1) as u32, Some(total_files as u32), - Some(&file.filename) + Some(&file.filename), ); self.emit_activity( "write_file", &format!("Writing {}", file.filename), 5, TOTAL_STEPS, - activity + activity, ); // Write to MinIO - drive monitor will sync to SITES_ROOT @@ -1181,14 +1460,22 @@ impl AppGenerator { // Update progress in database if let Some(ref task_id) = self.task_id { if let Ok(task_uuid) = uuid::Uuid::parse_str(task_id) { - let _ = self.update_task_step_results(task_uuid, &files_to_generate, idx + 1); + let _ = + self.update_task_step_results(task_uuid, &files_to_generate, idx + 1); } } } // Mark item as completed (broadcast immediately so user sees progress) - self.update_item_status(SectionType::Files, &file.filename, crate::auto_task::ItemStatus::Completed); - self.add_terminal_output(&format!("✓ `{}` ({} bytes)", file.filename, file.content.len()), TerminalLineType::Success); + self.update_item_status( + SectionType::Files, + &file.filename, + crate::auto_task::ItemStatus::Completed, + ); + self.add_terminal_output( + &format!("✓ `{}` ({} bytes)", file.filename, file.content.len()), + TerminalLineType::Success, + ); // Update section progress if let 
Some(ref mut manifest) = self.manifest { @@ -1219,8 +1506,19 @@ impl AppGenerator { self.update_manifest_section(SectionType::Pages, SectionStatus::Completed); self.files_written.push("designer.js".to_string()); - let activity = self.build_activity("configuring", total_files as u32, Some(total_files as u32), Some("designer.js")); - self.emit_activity("write_designer", "Creating designer configuration...", 6, TOTAL_STEPS, activity); + let activity = self.build_activity( + "configuring", + total_files as u32, + Some(total_files as u32), + Some("designer.js"), + ); + self.emit_activity( + "write_designer", + "Creating designer configuration...", + 6, + TOTAL_STEPS, + activity, + ); let designer_js = Self::generate_designer_js(&llm_app.name); self.bytes_generated += designer_js.len() as u64; @@ -1239,13 +1537,18 @@ impl AppGenerator { self.add_terminal_output("Creating automation tools...", TerminalLineType::Progress); let tools_count = llm_app.tools.len(); - let activity = self.build_activity("tools", 0, Some(tools_count as u32), Some("Creating BASIC tools")); + let activity = self.build_activity( + "tools", + 0, + Some(tools_count as u32), + Some("Creating BASIC tools"), + ); self.emit_activity( "write_tools", &format!("Creating {} tools...", tools_count), 7, TOTAL_STEPS, - activity + activity, ); for (idx, tool) in llm_app.tools.iter().enumerate() { @@ -1253,8 +1556,19 @@ impl AppGenerator { self.files_written.push(format!("tools/{}", tool.filename)); self.bytes_generated += tool.content.len() as u64; - let activity = self.build_activity("tools", (idx + 1) as u32, Some(tools_count as u32), Some(&tool.filename)); - self.emit_activity("write_tool", &format!("Writing tool {}", tool.filename), 7, TOTAL_STEPS, activity); + let activity = self.build_activity( + "tools", + (idx + 1) as u32, + Some(tools_count as u32), + Some(&tool.filename), + ); + self.emit_activity( + "write_tool", + &format!("Writing tool {}", tool.filename), + 7, + TOTAL_STEPS, + activity, + ); if let 
Err(e) = self .write_to_drive(&bucket_name, &tool_path, &tool.content) @@ -1266,8 +1580,15 @@ impl AppGenerator { &e.to_string(), ); } - self.update_item_status(SectionType::Tools, &tool.filename, crate::auto_task::ItemStatus::Completed); - self.add_terminal_output(&format!("✓ Tool `{}`", tool.filename), TerminalLineType::Success); + self.update_item_status( + SectionType::Tools, + &tool.filename, + crate::auto_task::ItemStatus::Completed, + ); + self.add_terminal_output( + &format!("✓ Tool `{}`", tool.filename), + TerminalLineType::Success, + ); tools.push(GeneratedFile { filename: tool.filename.clone(), @@ -1288,22 +1609,39 @@ impl AppGenerator { self.add_terminal_output("Creating scheduled tasks...", TerminalLineType::Progress); let sched_count = llm_app.schedulers.len(); - let activity = self.build_activity("schedulers", 0, Some(sched_count as u32), Some("Creating schedulers")); + let activity = self.build_activity( + "schedulers", + 0, + Some(sched_count as u32), + Some("Creating schedulers"), + ); self.emit_activity( "write_schedulers", &format!("Creating {} schedulers...", sched_count), 7, TOTAL_STEPS, - activity + activity, ); for (idx, scheduler) in llm_app.schedulers.iter().enumerate() { let scheduler_path = format!(".gbdialog/schedulers/{}", scheduler.filename); - self.files_written.push(format!("schedulers/{}", scheduler.filename)); + self.files_written + .push(format!("schedulers/{}", scheduler.filename)); self.bytes_generated += scheduler.content.len() as u64; - let activity = self.build_activity("schedulers", (idx + 1) as u32, Some(sched_count as u32), Some(&scheduler.filename)); - self.emit_activity("write_scheduler", &format!("Writing scheduler {}", scheduler.filename), 7, TOTAL_STEPS, activity); + let activity = self.build_activity( + "schedulers", + (idx + 1) as u32, + Some(sched_count as u32), + Some(&scheduler.filename), + ); + self.emit_activity( + "write_scheduler", + &format!("Writing scheduler {}", scheduler.filename), + 7, + TOTAL_STEPS, + 
activity, + ); if let Err(e) = self .write_to_drive(&bucket_name, &scheduler_path, &scheduler.content) @@ -1315,8 +1653,15 @@ impl AppGenerator { &e.to_string(), ); } - self.update_item_status(SectionType::Schedulers, &scheduler.filename, crate::auto_task::ItemStatus::Completed); - self.add_terminal_output(&format!("✓ Scheduler `{}`", scheduler.filename), TerminalLineType::Success); + self.update_item_status( + SectionType::Schedulers, + &scheduler.filename, + crate::auto_task::ItemStatus::Completed, + ); + self.add_terminal_output( + &format!("✓ Scheduler `{}`", scheduler.filename), + TerminalLineType::Success, + ); schedulers.push(GeneratedFile { filename: scheduler.filename.clone(), @@ -1342,13 +1687,30 @@ impl AppGenerator { manifest.complete(); } self.add_terminal_output("## Complete!", TerminalLineType::Success); - self.add_terminal_output(&format!("✓ App **{}** ready at `{}`", llm_app.name, app_url), TerminalLineType::Success); + self.add_terminal_output( + &format!("✓ App **{}** ready at `{}`", llm_app.name, app_url), + TerminalLineType::Success, + ); self.update_manifest_stats_real(true); - let activity = self.build_activity("complete", TOTAL_STEPS as u32, Some(TOTAL_STEPS as u32), Some("App ready")); - self.emit_activity("complete", &format!("App ready at {}", app_url), 8, TOTAL_STEPS, activity); + let activity = self.build_activity( + "complete", + TOTAL_STEPS as u32, + Some(TOTAL_STEPS as u32), + Some("App ready"), + ); + self.emit_activity( + "complete", + &format!("App ready at {}", app_url), + 8, + TOTAL_STEPS, + activity, + ); - let elapsed = self.generation_start.map(|s| s.elapsed().as_secs()).unwrap_or(0); + let elapsed = self + .generation_start + .map(|s| s.elapsed().as_secs()) + .unwrap_or(0); log_generator_info( &llm_app.name, @@ -1380,10 +1742,18 @@ impl AppGenerator { .with_tables(self.tables_synced.clone()); // Include app_url in the completion event - let event = crate::core::shared::state::TaskProgressEvent::new(task_id, "complete", 
format!( - "App '{}' created: {} files, {} tables, {} bytes in {}s", - llm_app.name, pages.len(), tables.len(), self.bytes_generated, elapsed - )) + let event = crate::core::shared::state::TaskProgressEvent::new( + task_id, + "complete", + format!( + "App '{}' created: {} files, {} tables, {} bytes in {}s", + llm_app.name, + pages.len(), + tables.len(), + self.bytes_generated, + elapsed + ), + ) .with_progress(TOTAL_STEPS, TOTAL_STEPS) .with_activity(final_activity) .with_details(format!("app_url:{}", app_url)) @@ -1501,7 +1871,10 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. NO QUESTIONS."# ); let intent_preview: String = intent.chars().take(50).collect(); - info!("[PHASE1] Getting project plan from LLM for: {}", intent_preview); + info!( + "[PHASE1] Getting project plan from LLM for: {}", + intent_preview + ); let response = self.call_llm(&prompt, bot_id).await?; info!("[PHASE1] Project plan received, parsing..."); @@ -1517,7 +1890,10 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. NO QUESTIONS."# // Debug: Log the raw response to understand what LLM returned let response_preview: String = response.chars().take(500).collect(); - info!("[PHASE1_PARSE] Response preview: {}", response_preview.replace('\n', "\\n")); + info!( + "[PHASE1_PARSE] Response preview: {}", + response_preview.replace('\n', "\\n") + ); info!("[PHASE1_PARSE] Has APP_START: {}, Has TABLES_START: {}, Has FILES_PLAN: {}, Has TOOLS_PLAN: {}", response.contains("<<>>"), response.contains("<<>>"), @@ -1543,14 +1919,22 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. 
NO QUESTIONS."# // Default name if not found if app.name.is_empty() { - app.name = intent.split_whitespace().take(3).collect::>().join("-").to_lowercase(); + app.name = intent + .split_whitespace() + .take(3) + .collect::>() + .join("-") + .to_lowercase(); } if app.description.is_empty() { app.description = intent.to_string(); } // Parse tables - if let (Some(start), Some(end)) = (response.find("<<>>"), response.find("<<>>")) { + if let (Some(start), Some(end)) = ( + response.find("<<>>"), + response.find("<<>>"), + ) { let tables_section = response.get(start..end).unwrap_or(""); let mut current_table: Option = None; @@ -1562,7 +1946,10 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. NO QUESTIONS."# app.tables.push(table); } } - let table_name = line.get(9..line.len().saturating_sub(3)).unwrap_or("").trim(); + let table_name = line + .get(9..line.len().saturating_sub(3)) + .unwrap_or("") + .trim(); current_table = Some(LlmTable { name: table_name.to_string(), fields: Vec::new(), @@ -1610,16 +1997,28 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. 
NO QUESTIONS."# } // Parse file plan (just names, no content yet) - if let (Some(start), Some(end)) = (response.find("<<>>"), response.find("<<>>")) { + if let (Some(start), Some(end)) = ( + response.find("<<>>"), + response.find("<<>>"), + ) { let files_section = response.get(start + 16..end).unwrap_or(""); - info!("[PHASE1_PARSE] FILES_PLAN section: {}", files_section.replace('\n', "\\n")); + info!( + "[PHASE1_PARSE] FILES_PLAN section: {}", + files_section.replace('\n', "\\n") + ); for line in files_section.lines() { let line = line.trim(); if line.contains(':') { let parts: Vec<&str> = line.splitn(2, ':').collect(); let filename = parts[0].trim().to_string(); // Accept common web file extensions - if !filename.is_empty() && (filename.ends_with(".html") || filename.ends_with(".css") || filename.ends_with(".js") || filename.ends_with(".bas") || filename.ends_with(".json")) { + if !filename.is_empty() + && (filename.ends_with(".html") + || filename.ends_with(".css") + || filename.ends_with(".js") + || filename.ends_with(".bas") + || filename.ends_with(".json")) + { info!("[PHASE1_PARSE] Adding file: {}", filename); app.files.push(LlmFile { filename, @@ -1631,7 +2030,9 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. NO QUESTIONS."# } } } else { - info!("[PHASE1_PARSE] FILES_PLAN section not found! Looking for <<>> delimiters directly for file_match in response.match_indices("<<>>"), response.find("<<>>")) { + if let (Some(start), Some(end)) = ( + response.find("<<>>"), + response.find("<<>>"), + ) { let tools_section = response.get(start + 16..end).unwrap_or(""); for line in tools_section.lines() { let line = line.trim(); @@ -1661,7 +2065,11 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. 
NO QUESTIONS."# let parts: Vec<&str> = line.splitn(2, ':').collect(); let filename = parts[0].trim().to_string(); if !filename.is_empty() { - let filename = if filename.ends_with(".bas") { filename } else { format!("{}.bas", filename) }; + let filename = if filename.ends_with(".bas") { + filename + } else { + format!("{}.bas", filename) + }; app.tools.push(LlmFile { filename, content: String::new(), @@ -1678,7 +2086,11 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. NO QUESTIONS."# if let Some(tool_name) = rest.get(8..end_offset) { let tool_name = tool_name.trim(); if !tool_name.is_empty() { - let filename = if tool_name.ends_with(".bas") { tool_name.to_string() } else { format!("{}.bas", tool_name) }; + let filename = if tool_name.ends_with(".bas") { + tool_name.to_string() + } else { + format!("{}.bas", tool_name) + }; info!("[PHASE1_PARSE] Found tool from delimiter: {}", filename); app.tools.push(LlmFile { filename, @@ -1692,7 +2104,10 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. NO QUESTIONS."# } // Parse schedulers plan - if let (Some(start), Some(end)) = (response.find("<<>>"), response.find("<<>>")) { + if let (Some(start), Some(end)) = ( + response.find("<<>>"), + response.find("<<>>"), + ) { let sched_section = response.get(start + 21..end).unwrap_or(""); for line in sched_section.lines() { let line = line.trim(); @@ -1700,7 +2115,11 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. NO QUESTIONS."# let parts: Vec<&str> = line.splitn(2, ':').collect(); let filename = parts[0].trim().to_string(); if !filename.is_empty() { - let filename = if filename.ends_with(".bas") { filename } else { format!("{}.bas", filename) }; + let filename = if filename.ends_with(".bas") { + filename + } else { + format!("{}.bas", filename) + }; app.schedulers.push(LlmFile { filename, content: String::new(), @@ -1709,7 +2128,9 @@ RESPOND ONLY WITH THE PLAN STRUCTURE. 
NO QUESTIONS."# } } } else { - info!("[PHASE1_PARSE] SCHEDULERS_PLAN not found, trying <<>> delimiters..."); + info!( + "[PHASE1_PARSE] SCHEDULERS_PLAN not found, trying <<>> delimiters..." + ); for sched_match in response.match_indices("<< Result> { let platform = Self::get_platform_prompt(); - let tables_desc = app.tables.iter() - .map(|t| format!("- {}: {}", t.name, t.fields.iter().map(|f| f.name.clone()).collect::>().join(", "))) + let tables_desc = app + .tables + .iter() + .map(|t| { + format!( + "- {}: {}", + t.name, + t.fields + .iter() + .map(|f| f.name.clone()) + .collect::>() + .join(", ") + ) + }) .collect::>() .join("\n"); @@ -1778,7 +2223,11 @@ RESPOND WITH ONLY THE FILE CONTENT. NO EXPLANATIONS."#, platform = platform, app_name = app.name, description = app.description, - tables_desc = if tables_desc.is_empty() { "None".to_string() } else { tables_desc }, + tables_desc = if tables_desc.is_empty() { + "None".to_string() + } else { + tables_desc + }, filename = filename, ); @@ -1819,30 +2268,32 @@ RESPOND WITH ONLY THE FILE CONTENT. 
NO EXPLANATIONS."#, } if filename.ends_with(".css") { - let lines: Vec<&str> = content.lines() + let lines: Vec<&str> = content + .lines() .filter(|line| { let trimmed = line.trim(); - !trimmed.starts_with("Here") && - !trimmed.starts_with("This") && - !trimmed.starts_with("The ") && - !trimmed.starts_with("Note:") && - !trimmed.starts_with("I've") && - !trimmed.starts_with("```") + !trimmed.starts_with("Here") + && !trimmed.starts_with("This") + && !trimmed.starts_with("The ") + && !trimmed.starts_with("Note:") + && !trimmed.starts_with("I've") + && !trimmed.starts_with("```") }) .collect(); return lines.join("\n"); } if filename.ends_with(".js") { - let lines: Vec<&str> = content.lines() + let lines: Vec<&str> = content + .lines() .filter(|line| { let trimmed = line.trim(); - !trimmed.starts_with("Here") && - !trimmed.starts_with("This") && - !trimmed.starts_with("The ") && - !trimmed.starts_with("Note:") && - !trimmed.starts_with("I've") && - !trimmed.starts_with("```") + !trimmed.starts_with("Here") + && !trimmed.starts_with("This") + && !trimmed.starts_with("The ") + && !trimmed.starts_with("Note:") + && !trimmed.starts_with("I've") + && !trimmed.starts_with("```") }) .collect(); return lines.join("\n"); @@ -1885,9 +2336,7 @@ NO QUESTIONS. JUST BUILD."# let end_idx = response.find(DELIM_APP_END); let content = match (start_idx, end_idx) { - (Some(s), Some(e)) => { - response.get(s + DELIM_APP_START.len()..e).unwrap_or("") - } + (Some(s), Some(e)) => response.get(s + DELIM_APP_START.len()..e).unwrap_or(""), (Some(s), None) => { warn!("No APP_END found, using rest of response"); response.get(s + DELIM_APP_START.len()..).unwrap_or("") @@ -1946,7 +2395,10 @@ NO QUESTIONS. 
JUST BUILD."# app.tables.push(table); } } - let table_name = line.get(DELIM_TABLE_PREFIX.len()..line.len().saturating_sub(DELIM_END.len())).unwrap_or("").trim(); + let table_name = line + .get(DELIM_TABLE_PREFIX.len()..line.len().saturating_sub(DELIM_END.len())) + .unwrap_or("") + .trim(); current_table = Some(LlmTable { name: table_name.to_string(), fields: Vec::new(), @@ -1955,7 +2407,11 @@ NO QUESTIONS. JUST BUILD."# } // Table field (when in tables section with active table) - if current_section == "tables" && current_table.is_some() && !line.is_empty() && !line.starts_with("<<<") { + if current_section == "tables" + && current_table.is_some() + && !line.is_empty() + && !line.starts_with("<<<") + { if let Some(ref mut table) = current_table { if let Some(field) = Self::parse_field_line(line) { table.fields.push(field); @@ -1970,7 +2426,10 @@ NO QUESTIONS. JUST BUILD."# if let Some((file_type, filename, content)) = current_file.take() { Self::save_parsed_file(&mut app, &file_type, filename, content); } - let filename = line.get(DELIM_FILE_PREFIX.len()..line.len().saturating_sub(DELIM_END.len())).unwrap_or("").trim(); + let filename = line + .get(DELIM_FILE_PREFIX.len()..line.len().saturating_sub(DELIM_END.len())) + .unwrap_or("") + .trim(); current_file = Some(("file".to_string(), filename.to_string(), String::new())); continue; } @@ -1980,7 +2439,10 @@ NO QUESTIONS. JUST BUILD."# if let Some((file_type, filename, content)) = current_file.take() { Self::save_parsed_file(&mut app, &file_type, filename, content); } - let filename = line.get(DELIM_TOOL_PREFIX.len()..line.len().saturating_sub(DELIM_END.len())).unwrap_or("").trim(); + let filename = line + .get(DELIM_TOOL_PREFIX.len()..line.len().saturating_sub(DELIM_END.len())) + .unwrap_or("") + .trim(); current_file = Some(("tool".to_string(), filename.to_string(), String::new())); continue; } @@ -1990,7 +2452,10 @@ NO QUESTIONS. 
JUST BUILD."# if let Some((file_type, filename, content)) = current_file.take() { Self::save_parsed_file(&mut app, &file_type, filename, content); } - let filename = line.get(DELIM_SCHEDULER_PREFIX.len()..line.len().saturating_sub(DELIM_END.len())).unwrap_or("").trim(); + let filename = line + .get(DELIM_SCHEDULER_PREFIX.len()..line.len().saturating_sub(DELIM_END.len())) + .unwrap_or("") + .trim(); current_file = Some(("scheduler".to_string(), filename.to_string(), String::new())); continue; } @@ -2061,7 +2526,12 @@ NO QUESTIONS. JUST BUILD."# } /// Save a parsed file to the appropriate collection - fn save_parsed_file(app: &mut LlmGeneratedApp, file_type: &str, filename: String, content: String) { + fn save_parsed_file( + app: &mut LlmGeneratedApp, + file_type: &str, + filename: String, + content: String, + ) { let file = LlmFile { filename, content: content.trim().to_string(), @@ -2150,28 +2620,50 @@ NO QUESTIONS. JUST BUILD."# name: json_app.name, description: json_app.description, domain: json_app.domain, - tables: json_app.tables.into_iter().map(|t| LlmTable { - name: t.name, - fields: t.fields.into_iter().map(|f| LlmField { - name: f.name, - field_type: f.field_type, - nullable: f.nullable.unwrap_or(true), - reference: f.reference, - default: f.default, - }).collect(), - }).collect(), - files: json_app.files.into_iter().map(|f| LlmFile { - filename: f.filename, - content: f.content, - }).collect(), - tools: json_app.tools.unwrap_or_default().into_iter().map(|f| LlmFile { - filename: f.filename, - content: f.content, - }).collect(), - schedulers: json_app.schedulers.unwrap_or_default().into_iter().map(|f| LlmFile { - filename: f.filename, - content: f.content, - }).collect(), + tables: json_app + .tables + .into_iter() + .map(|t| LlmTable { + name: t.name, + fields: t + .fields + .into_iter() + .map(|f| LlmField { + name: f.name, + field_type: f.field_type, + nullable: f.nullable.unwrap_or(true), + reference: f.reference, + default: f.default, + }) + 
.collect(), + }) + .collect(), + files: json_app + .files + .into_iter() + .map(|f| LlmFile { + filename: f.filename, + content: f.content, + }) + .collect(), + tools: json_app + .tools + .unwrap_or_default() + .into_iter() + .map(|f| LlmFile { + filename: f.filename, + content: f.content, + }) + .collect(), + schedulers: json_app + .schedulers + .unwrap_or_default() + .into_iter() + .map(|f| LlmFile { + filename: f.filename, + content: f.content, + }) + .collect(), }; if app.files.is_empty() { @@ -2260,7 +2752,11 @@ NO QUESTIONS. JUST BUILD."# }); let prompt_len = prompt.len(); - trace!("APP_GENERATOR Starting LLM streaming: model={}, prompt_len={}", model, prompt_len); + trace!( + "APP_GENERATOR Starting LLM streaming: model={}, prompt_len={}", + model, + prompt_len + ); let start = std::time::Instant::now(); // Use streaming to provide real-time feedback @@ -2287,9 +2783,12 @@ NO QUESTIONS. JUST BUILD."# chunk_buffer.push_str(&chunk); // Detect section markers using full_response (not chunk_buffer which gets trimmed) - let in_files_plan = full_response.contains("<<>>") && !full_response.contains("<<>>"); - let in_tools_plan = full_response.contains("<<>>") && !full_response.contains("<<>>"); - let in_schedulers_plan = full_response.contains("<<>>") && !full_response.contains("<<>>"); + let in_files_plan = full_response.contains("<<>>") + && !full_response.contains("<<>>"); + let in_tools_plan = full_response.contains("<<>>") + && !full_response.contains("<<>>"); + let in_schedulers_plan = full_response.contains("<<>>") + && !full_response.contains("<<>>"); // Detect items being generated in real-time (full generation format) // Use full_response for reliable detection with safe string extraction @@ -2299,7 +2798,9 @@ NO QUESTIONS. 
JUST BUILD."# if let Some(end_offset) = rest.find(">>>") { if let Some(table_name) = rest.get(9..end_offset) { let table_name = table_name.trim(); - if !table_name.is_empty() && !detected_tables.contains(&table_name.to_string()) { + if !table_name.is_empty() + && !detected_tables.contains(&table_name.to_string()) + { detected_tables.push(table_name.to_string()); info!("[LLM_STREAM] Detected table: {table_name}"); } @@ -2314,7 +2815,9 @@ NO QUESTIONS. JUST BUILD."# if let Some(end_offset) = rest.find(">>>") { if let Some(file_name) = rest.get(8..end_offset) { let file_name = file_name.trim(); - if !file_name.is_empty() && !detected_files.contains(&file_name.to_string()) { + if !file_name.is_empty() + && !detected_files.contains(&file_name.to_string()) + { detected_files.push(file_name.to_string()); info!("[LLM_STREAM] Detected file: {file_name}"); } @@ -2329,7 +2832,9 @@ NO QUESTIONS. JUST BUILD."# if let Some(end_offset) = rest.find(">>>") { if let Some(tool_name) = rest.get(8..end_offset) { let tool_name = tool_name.trim(); - if !tool_name.is_empty() && !detected_tools.contains(&tool_name.to_string()) { + if !tool_name.is_empty() + && !detected_tools.contains(&tool_name.to_string()) + { detected_tools.push(tool_name.to_string()); info!("[LLM_STREAM] Detected tool: {tool_name}"); } @@ -2342,7 +2847,9 @@ NO QUESTIONS. JUST BUILD."# // Parse from full_response for FILES_PLAN section if in_files_plan { if let Some(plan_start) = full_response.find("<<>>") { - if let Some(plan_content) = full_response.get(plan_start.saturating_add(16)..) { + if let Some(plan_content) = + full_response.get(plan_start.saturating_add(16)..) + { for line in plan_content.lines() { let line = line.trim(); if line.starts_with("<<<") { @@ -2351,12 +2858,15 @@ NO QUESTIONS. 
JUST BUILD."# if line.contains(':') { let parts: Vec<&str> = line.splitn(2, ':').collect(); let name = parts[0].trim(); - if !name.is_empty() && (name.ends_with(".html") || name.ends_with(".css") || name.ends_with(".js") || name.ends_with(".bas")) { - if !detected_files.contains(&name.to_string()) { + if !name.is_empty() + && (name.ends_with(".html") + || name.ends_with(".css") + || name.ends_with(".js") + || name.ends_with(".bas")) + && !detected_files.contains(&name.to_string()) { detected_files.push(name.to_string()); info!("[LLM_STREAM] Detected planned file: {name}"); } - } } } } @@ -2365,7 +2875,9 @@ NO QUESTIONS. JUST BUILD."# if in_tools_plan { if let Some(plan_start) = full_response.find("<<>>") { - if let Some(plan_content) = full_response.get(plan_start.saturating_add(16)..) { + if let Some(plan_content) = + full_response.get(plan_start.saturating_add(16)..) + { for line in plan_content.lines() { let line = line.trim(); if line.starts_with("<<<") { @@ -2375,7 +2887,11 @@ NO QUESTIONS. JUST BUILD."# let parts: Vec<&str> = line.splitn(2, ':').collect(); let name = parts[0].trim(); if !name.is_empty() { - let tool_name = if name.ends_with(".bas") { name.to_string() } else { format!("{name}.bas") }; + let tool_name = if name.ends_with(".bas") { + name.to_string() + } else { + format!("{name}.bas") + }; if !detected_tools.contains(&tool_name) { detected_tools.push(tool_name.clone()); info!("[LLM_STREAM] Detected planned tool: {tool_name}"); @@ -2389,7 +2905,9 @@ NO QUESTIONS. JUST BUILD."# if in_schedulers_plan { if let Some(plan_start) = full_response.find("<<>>") { - if let Some(plan_content) = full_response.get(plan_start.saturating_add(21)..) { + if let Some(plan_content) = + full_response.get(plan_start.saturating_add(21)..) + { for line in plan_content.lines() { let line = line.trim(); if line.starts_with("<<<") { @@ -2399,7 +2917,11 @@ NO QUESTIONS. 
JUST BUILD."# let parts: Vec<&str> = line.splitn(2, ':').collect(); let name = parts[0].trim(); if !name.is_empty() { - let sched_name = if name.ends_with(".bas") { name.to_string() } else { format!("{name}.bas") }; + let sched_name = if name.ends_with(".bas") { + name.to_string() + } else { + format!("{name}.bas") + }; if !detected_tools.contains(&sched_name) { detected_tools.push(sched_name.clone()); info!("[LLM_STREAM] Detected planned scheduler: {sched_name}"); @@ -2412,15 +2934,20 @@ NO QUESTIONS. JUST BUILD."# } // Log progress periodically - if chunk_count == 1 || chunk_count % 500 == 0 { - trace!("APP_GENERATOR Stream progress: {} chunks, {} chars, {:?}", - chunk_count, full_response.len(), stream_start.elapsed()); + if chunk_count == 1 || chunk_count.is_multiple_of(500) { + trace!( + "APP_GENERATOR Stream progress: {} chunks, {} chars, {:?}", + chunk_count, + full_response.len(), + stream_start.elapsed() + ); } // Emit progress updates every 2 seconds if last_progress_update.elapsed().as_secs() >= 2 { if let Some(ref tid) = task_id { - let total_detected = detected_tables.len() + detected_files.len() + detected_tools.len(); + let total_detected = + detected_tables.len() + detected_files.len() + detected_tools.len(); let progress_msg = if total_detected > 0 { format!( "AI generating... {} tables, {} files, {} tools detected", @@ -2430,7 +2957,10 @@ NO QUESTIONS. JUST BUILD."# ) } else { let chars_received = full_response.len(); - format!("AI generating content... {} chars received", chars_received) + format!( + "AI generating content... {} chars received", + chars_received + ) }; info!("[LLM_STREAM] Progress: {}", progress_msg); let event = crate::core::shared::state::TaskProgressEvent::new( @@ -2449,7 +2979,10 @@ NO QUESTIONS. 
JUST BUILD."# if last_emit.elapsed().as_millis() > 100 || chunk_buffer.len() > 500 { // Keep last 200 chars for detecting split delimiters (Unicode-safe) if chunk_buffer.chars().count() > 200 { - chunk_buffer = chunk_buffer.chars().skip(chunk_buffer.chars().count() - 200).collect(); + chunk_buffer = chunk_buffer + .chars() + .skip(chunk_buffer.chars().count() - 200) + .collect(); } last_emit = std::time::Instant::now(); } @@ -2457,7 +2990,8 @@ NO QUESTIONS. JUST BUILD."# // Final progress update if let Some(ref tid) = task_id { - let total_detected = detected_tables.len() + detected_files.len() + detected_tools.len(); + let total_detected = + detected_tables.len() + detected_files.len() + detected_tools.len(); let final_msg = if total_detected > 0 { format!( "AI complete: {} tables, {} files, {} tools", @@ -2478,19 +3012,34 @@ NO QUESTIONS. JUST BUILD."# state.broadcast_task_progress(event); } - trace!("APP_GENERATOR Stream finished: {} chunks, {} chars in {:?}", - chunk_count, full_response.len(), stream_start.elapsed()); + trace!( + "APP_GENERATOR Stream finished: {} chunks, {} chars in {:?}", + chunk_count, + full_response.len(), + stream_start.elapsed() + ); // Don't emit remaining buffer - it's raw code/HTML if !chunk_buffer.is_empty() { - trace!("APP_GENERATOR Final buffer (not emitting): {} chars", chunk_buffer.len()); + trace!( + "APP_GENERATOR Final buffer (not emitting): {} chars", + chunk_buffer.len() + ); } // Log response preview (Unicode-safe) if !full_response.is_empty() { let preview: String = full_response.chars().take(200).collect(); - let suffix = if full_response.chars().count() > 200 { "..." } else { "" }; - trace!("APP_GENERATOR Response preview: {}{}", preview.replace('\n', "\\n"), suffix); + let suffix = if full_response.chars().count() > 200 { + "..." + } else { + "" + }; + trace!( + "APP_GENERATOR Response preview: {}{}", + preview.replace('\n', "\\n"), + suffix + ); } full_response @@ -2510,25 +3059,35 @@ NO QUESTIONS. 
JUST BUILD."# match stream_task.await { Ok(response) => { let elapsed = start.elapsed(); - trace!("APP_GENERATOR LLM streaming succeeded: {} chars in {:?}", response.len(), elapsed); + trace!( + "APP_GENERATOR LLM streaming succeeded: {} chars in {:?}", + response.len(), + elapsed + ); if response.is_empty() { error!("APP_GENERATOR Empty response from LLM"); } - return Ok(response); + Ok(response) } Err(e) => { let elapsed = start.elapsed(); - error!("APP_GENERATOR LLM stream task failed after {:?}: {}", elapsed, e); - return Err(format!("Stream task failed: {}", e).into()); + error!( + "APP_GENERATOR LLM stream task failed after {:?}: {}", + elapsed, e + ); + Err(format!("Stream task failed: {}", e).into()) } } } Err(e) => { let elapsed = start.elapsed(); - error!("APP_GENERATOR LLM streaming failed after {:?}: {}", elapsed, e); + error!( + "APP_GENERATOR LLM streaming failed after {:?}: {}", + elapsed, e + ); // Abort the stream task stream_task.abort(); - return Err(e); + Err(e) } } } @@ -2639,7 +3198,9 @@ NO QUESTIONS. JUST BUILD."# Err(e) => { // Check if error is "bucket already exists" (race condition) let err_str = format!("{:?}", e); - if err_str.contains("BucketAlreadyExists") || err_str.contains("BucketAlreadyOwnedByYou") { + if err_str.contains("BucketAlreadyExists") + || err_str.contains("BucketAlreadyOwnedByYou") + { trace!("Bucket {} already exists (race condition)", bucket); return Ok(()); } @@ -2669,16 +3230,25 @@ NO QUESTIONS. 
JUST BUILD."# path: &str, content: &str, ) -> Result<(), Box> { - info!("write_to_drive: bucket={}, path={}, content_len={}", bucket, path, content.len()); + info!( + "write_to_drive: bucket={}, path={}, content_len={}", + bucket, + path, + content.len() + ); #[cfg(feature = "drive")] if let Some(ref s3) = self.state.drive { let body = ByteStream::from(content.as_bytes().to_vec()); let content_type = get_content_type(path); - info!("S3 client available, attempting put_object to s3://{}/{}", bucket, path); + info!( + "S3 client available, attempting put_object to s3://{}/{}", + bucket, path + ); - match s3.put_object() + match s3 + .put_object() .bucket(bucket) .key(path) .body(body) @@ -2691,7 +3261,10 @@ NO QUESTIONS. JUST BUILD."# } Err(e) => { // Log detailed error info - error!("S3 put_object failed: bucket={}, path={}, error={:?}", bucket, path, e); + error!( + "S3 put_object failed: bucket={}, path={}, error={:?}", + bucket, path, e + ); error!("S3 error details: {}", e); // If bucket doesn't exist, try to create it and retry @@ -2709,7 +3282,10 @@ NO QUESTIONS. JUST BUILD."# .content_type(get_content_type(path)) .send() .await?; - info!("Wrote to S3 after creating bucket: s3://{}/{}", bucket, path); + info!( + "Wrote to S3 after creating bucket: s3://{}/{}", + bucket, path + ); } else { error!("S3 write failed (not a bucket issue): {}", err_str); return Err(Box::new(e)); @@ -2717,13 +3293,19 @@ NO QUESTIONS. JUST BUILD."# } } } else { - warn!("No S3/drive client available, using DB fallback for {}/{}", bucket, path); + warn!( + "No S3/drive client available, using DB fallback for {}/{}", + bucket, path + ); self.write_to_db_fallback(bucket, path, content)?; } #[cfg(not(feature = "drive"))] { - warn!("Drive feature not enabled, using DB fallback for {}/{}", bucket, path); + warn!( + "Drive feature not enabled, using DB fallback for {}/{}", + bucket, path + ); self.write_to_db_fallback(bucket, path, content)?; } @@ -2769,9 +3351,16 @@ NO QUESTIONS. 
JUST BUILD."# let create_sql = generate_create_table_sql(table, "postgres"); // Try to use bot's specific database first - match self.state.bot_database_manager.create_table_in_bot_database(bot_id, &create_sql) { + match self + .state + .bot_database_manager + .create_table_in_bot_database(bot_id, &create_sql) + { Ok(()) => { - info!("Created table '{}' in bot database (bot_id: {})", table.name, bot_id); + info!( + "Created table '{}' in bot database (bot_id: {})", + table.name, bot_id + ); Ok(table.fields.len()) } Err(e) => { @@ -2837,7 +3426,10 @@ NO QUESTIONS. JUST BUILD."# .bind::(task_id) .execute(&mut conn)?; - info!("Updated task {} completed with app_url: {}", task_id, app_url); + info!( + "Updated task {} completed with app_url: {}", + task_id, app_url + ); Ok(()) } @@ -2850,27 +3442,35 @@ NO QUESTIONS. JUST BUILD."# let mut conn = self.state.conn.get()?; // Build step_results JSON with file status - let step_results: Vec = files.iter().enumerate().map(|(idx, filename)| { - let status = if idx < completed_count { - "Completed" - } else if idx == completed_count { - "Running" - } else { - "Pending" - }; - serde_json::json!({ - "step_id": format!("file_{}", idx), - "step_order": idx + 1, - "step_name": format!("Write {}", filename), - "status": status, - "started_at": chrono::Utc::now().to_rfc3339(), - "duration_ms": if idx < completed_count { Some(100) } else { None:: }, - "logs": [] + let step_results: Vec = files + .iter() + .enumerate() + .map(|(idx, filename)| { + let status = if idx < completed_count { + "Completed" + } else if idx == completed_count { + "Running" + } else { + "Pending" + }; + serde_json::json!({ + "step_id": format!("file_{}", idx), + "step_order": idx + 1, + "step_name": format!("Write {}", filename), + "status": status, + "started_at": chrono::Utc::now().to_rfc3339(), + "duration_ms": if idx < completed_count { Some(100) } else { None:: }, + "logs": [] + }) }) - }).collect(); + .collect(); let step_results_json = 
serde_json::to_value(&step_results)?; - let progress = if files.is_empty() { 0.0 } else { completed_count as f64 / files.len() as f64 }; + let progress = if files.is_empty() { + 0.0 + } else { + completed_count as f64 / files.len() as f64 + }; sql_query( "UPDATE auto_tasks SET @@ -2888,7 +3488,12 @@ NO QUESTIONS. JUST BUILD."# .bind::(task_id) .execute(&mut conn)?; - trace!("Updated task {} step_results: {}/{} files", task_id, completed_count, files.len()); + trace!( + "Updated task {} step_results: {}/{} files", + task_id, + completed_count, + files.len() + ); Ok(()) } diff --git a/src/auto_task/app_logs.rs b/src/auto_task/app_logs.rs index 020ff182e..53c50e72d 100644 --- a/src/auto_task/app_logs.rs +++ b/src/auto_task/app_logs.rs @@ -121,19 +121,18 @@ impl AppLogStore { source: LogSource, message: &str, details: Option, - bot_id: Option, - user_id: Option, + ids: (Option, Option), // (bot_id, user_id) ) { let entry = AppLogEntry { id: Uuid::new_v4().to_string(), timestamp: Utc::now(), level, source, - app_name: app_name.to_string(), - bot_id, - user_id, message: message.to_string(), details, + bot_id: ids.0, + user_id: ids.1, + app_name: app_name.to_string(), file_path: None, line_number: None, stack_trace: None, @@ -157,9 +156,7 @@ impl AppLogStore { source: LogSource, message: &str, error: &str, - file_path: Option<&str>, - line_number: Option, - stack_trace: Option<&str>, + location: (Option<&str>, Option, Option<&str>), // (file_path, line_number, stack_trace) ) { let entry = AppLogEntry { id: Uuid::new_v4().to_string(), @@ -171,9 +168,9 @@ impl AppLogStore { user_id: None, message: message.to_string(), details: Some(error.to_string()), - file_path: file_path.map(String::from), - line_number, - stack_trace: stack_trace.map(String::from), + file_path: location.0.map(String::from), + line_number: location.1, + stack_trace: location.2.map(String::from), }; self.add_entry(entry); @@ -184,8 +181,8 @@ impl AppLogStore { source, message, error, - 
file_path.unwrap_or("unknown"), - line_number.unwrap_or(0) + location.0.unwrap_or("unknown"), + location.1.unwrap_or(0) ); } @@ -454,8 +451,7 @@ pub fn log_generator_info(app_name: &str, message: &str) { LogSource::Generator, message, None, - None, - None, + (None, None), ); } @@ -465,9 +461,7 @@ pub fn log_generator_error(app_name: &str, message: &str, error: &str) { LogSource::Generator, message, error, - None, - None, - None, + (None, None, None), ); } @@ -482,9 +476,7 @@ pub fn log_validation_error( LogSource::Validation, message, "Validation failed", - file_path, - line_number, - None, + (file_path, line_number, None), ); } @@ -494,9 +486,7 @@ pub fn log_runtime_error(app_name: &str, message: &str, error: &str, stack_trace LogSource::Runtime, message, error, - None, - None, - stack_trace, + (None, None, stack_trace), ); } diff --git a/src/auto_task/designer_ai.rs b/src/auto_task/designer_ai.rs index 72073176b..20c1f475d 100644 --- a/src/auto_task/designer_ai.rs +++ b/src/auto_task/designer_ai.rs @@ -1075,7 +1075,7 @@ Respond ONLY with valid JSON."# .llm_provider .generate(prompt, &llm_config, &model, &key) .await?; - return Ok(response); + Ok(response) } #[cfg(not(feature = "llm"))] diff --git a/src/auto_task/intent_classifier.rs b/src/auto_task/intent_classifier.rs index ed78c3b9e..3958eba99 100644 --- a/src/auto_task/intent_classifier.rs +++ b/src/auto_task/intent_classifier.rs @@ -1129,7 +1129,7 @@ END TRIGGER .llm_provider .generate(prompt, &llm_config, &model, &key) .await?; - return Ok(response); + Ok(response) } #[cfg(not(feature = "llm"))] diff --git a/src/auto_task/intent_compiler.rs b/src/auto_task/intent_compiler.rs index 37ebadb7a..9e736ab36 100644 --- a/src/auto_task/intent_compiler.rs +++ b/src/auto_task/intent_compiler.rs @@ -708,7 +708,7 @@ Respond ONLY with valid JSON."#, .llm_provider .generate(prompt, &llm_config, &model, &key) .await?; - return Ok(response); + Ok(response) } #[cfg(not(feature = "llm"))] diff --git 
a/src/auto_task/task_manifest.rs b/src/auto_task/task_manifest.rs index c4fb21236..a726083de 100644 --- a/src/auto_task/task_manifest.rs +++ b/src/auto_task/task_manifest.rs @@ -935,25 +935,28 @@ pub struct MonitorDefinition { pub target: String, } +pub struct ManifestData { + pub tables: Vec, + pub files: Vec, + pub pages: Vec, + pub tools: Vec, + pub schedulers: Vec, + pub monitors: Vec, +} + pub fn create_manifest_from_llm_response( app_name: &str, description: &str, - tables: Vec, - files: Vec, - pages: Vec, - tools: Vec, - schedulers: Vec, - monitors: Vec, + data: ManifestData, ) -> TaskManifest { - let estimated_time = estimate_generation_time(&tables, &files, &tools, &schedulers); + let estimated_time = estimate_generation_time(&data.tables, &data.files, &data.tools, &data.schedulers); ManifestBuilder::new(app_name, description) - .with_tables(tables) - .with_files(files) - .with_pages(pages) - .with_tools(tools) - .with_schedulers(schedulers) - .with_monitors(monitors) + .with_tables(data.tables) + .with_files(data.files) + .with_pages(data.pages) + .with_tools(data.tools) + .with_schedulers(data.schedulers) .with_estimated_time(estimated_time) .build() } diff --git a/src/basic/compiler/mod.rs b/src/basic/compiler/mod.rs index 84ce2c25a..12f01bcb8 100644 --- a/src/basic/compiler/mod.rs +++ b/src/basic/compiler/mod.rs @@ -8,6 +8,7 @@ use diesel::ExpressionMethods; use diesel::QueryDsl; use diesel::RunQueryDsl; use log::{trace, warn}; +use regex::Regex; pub mod goto_transform; use serde::{Deserialize, Serialize}; @@ -406,26 +407,32 @@ impl BasicCompiler { continue; } - if trimmed.starts_with("USE WEBSITE") { - let parts: Vec<&str> = normalized.split('"').collect(); - if parts.len() >= 2 { - let url = parts[1]; - let mut conn = self - .state - .conn - .get() - .map_err(|e| format!("Failed to get database connection: {}", e))?; - if let Err(e) = - crate::basic::keywords::use_website::execute_use_website_preprocessing( - &mut conn, url, bot_id, - ) - { - 
log::error!("Failed to register USE_WEBSITE during preprocessing: {}", e); - } else { - log::info!( - "Registered website {} for crawling during preprocessing", - url - ); + if trimmed.to_uppercase().starts_with("USE WEBSITE") { + let re = Regex::new(r#"(?i)USE\s+WEBSITE\s+"([^"]+)"(?:\s+REFRESH\s+"([^"]+)")?"#).unwrap(); + if let Some(caps) = re.captures(&normalized) { + if let Some(url_match) = caps.get(1) { + let url = url_match.as_str(); + let refresh = caps.get(2).map(|m| m.as_str()).unwrap_or("1m"); + let mut conn = self + .state + .conn + .get() + .map_err(|e| format!("Failed to get database connection: {}", e))?; + if let Err(e) = + crate::basic::keywords::use_website::execute_use_website_preprocessing_with_refresh( + &mut conn, url, bot_id, refresh, + ) + { + log::error!("Failed to register USE_WEBSITE during preprocessing: {}", e); + } else { + log::info!( + "Registered website {} for crawling during preprocessing (refresh: {})", + url, refresh + ); + } + + result.push_str(&format!("USE_WEBSITE(\"{}\", \"{}\");\n", url, refresh)); + continue; } } else { log::warn!("Malformed USE_WEBSITE line ignored: {}", normalized); diff --git a/src/basic/keywords/add_bot.rs b/src/basic/keywords/add_bot.rs index 4c21d2c75..ca71aafec 100644 --- a/src/basic/keywords/add_bot.rs +++ b/src/basic/keywords/add_bot.rs @@ -1,7 +1,6 @@ use crate::shared::models::UserSession; use crate::shared::state::AppState; use diesel::prelude::*; -use diesel::sql_query; use log::{info, trace}; use rhai::{Dynamic, Engine}; use serde::{Deserialize, Serialize}; @@ -593,26 +592,10 @@ fn add_bot_to_session( .map(|r| r.id) .map_err(|e| format!("Failed to get bot ID: {e}"))? 
} else { - let new_bot_id = Uuid::new_v4(); - let db_name = format!("bot_{}", bot_name.replace(['-', ' '], "_").to_lowercase()); - diesel::sql_query( - "INSERT INTO bots (id, name, description, is_active, database_name, created_at) - VALUES ($1, $2, $3, true, $4, NOW()) - ON CONFLICT (name) DO UPDATE SET is_active = true, database_name = COALESCE(bots.database_name, $4) - RETURNING id", - ) - .bind::(new_bot_id.to_string()) - .bind::(bot_name) - .bind::(format!("Bot agent: {bot_name}")) - .bind::(&db_name) - .execute(&mut *conn) - .map_err(|e| format!("Failed to create bot: {e}"))?; - - if let Err(e) = create_bot_database(&mut conn, &db_name) { - log::warn!("Failed to create database for bot {bot_name}: {e}"); - } - - new_bot_id.to_string() + return Err(format!( + "Bot '{}' does not exist in database. Please create it first using the import process.", + bot_name + )); }; let trigger_json = @@ -852,48 +835,3 @@ struct BotConfigRow { #[diesel(sql_type = diesel::sql_types::Nullable)] model_config: Option, } - -fn create_bot_database(conn: &mut PgConnection, db_name: &str) -> Result<(), String> { - let safe_db_name: String = db_name - .chars() - .filter(|c| c.is_alphanumeric() || *c == '_') - .collect(); - - if safe_db_name.is_empty() || safe_db_name.len() > 63 { - return Err("Invalid database name".into()); - } - - #[derive(QueryableByName)] - struct DbExists { - #[diesel(sql_type = diesel::sql_types::Bool)] - exists: bool, - } - - let check_query = format!( - "SELECT EXISTS (SELECT 1 FROM pg_database WHERE datname = '{}') as exists", - safe_db_name - ); - - let exists = sql_query(&check_query) - .get_result::(conn) - .map(|r| r.exists) - .unwrap_or(false); - - if exists { - info!("Database {} already exists", safe_db_name); - return Ok(()); - } - - let create_query = format!("CREATE DATABASE {}", safe_db_name); - if let Err(e) = sql_query(&create_query).execute(conn) { - let err_str = e.to_string(); - if err_str.contains("already exists") { - info!("Database {} 
already exists", safe_db_name); - return Ok(()); - } - return Err(format!("Failed to create database: {}", e)); - } - - info!("Created database: {}", safe_db_name); - Ok(()) -} diff --git a/src/basic/keywords/create_site.rs b/src/basic/keywords/create_site.rs index ebbdcb147..322d7245a 100644 --- a/src/basic/keywords/create_site.rs +++ b/src/basic/keywords/create_site.rs @@ -47,7 +47,7 @@ pub fn create_site_keyword(state: &AppState, user: UserSession, engine: &mut Eng } }; - let s3 = state_clone.s3_client.clone().map(std::sync::Arc::new); + let s3 = state_clone.drive.clone().map(std::sync::Arc::new); let bucket = state_clone.bucket_name.clone(); let bot_id = user_clone.bot_id.to_string(); diff --git a/src/basic/keywords/episodic_memory.rs b/src/basic/keywords/episodic_memory.rs index 4d7cfd9b8..8dccc9cf5 100644 --- a/src/basic/keywords/episodic_memory.rs +++ b/src/basic/keywords/episodic_memory.rs @@ -53,18 +53,15 @@ pub struct ActionItem { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "lowercase")] +#[derive(Default)] pub enum Priority { Low, + #[default] Medium, High, Critical, } -impl Default for Priority { - fn default() -> Self { - Self::Medium - } -} #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Sentiment { @@ -77,19 +74,16 @@ pub struct Sentiment { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "lowercase")] +#[derive(Default)] pub enum SentimentLabel { VeryNegative, Negative, + #[default] Neutral, Positive, VeryPositive, } -impl Default for SentimentLabel { - fn default() -> Self { - Self::Neutral - } -} impl Default for Sentiment { fn default() -> Self { @@ -103,19 +97,16 @@ impl Default for Sentiment { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "lowercase")] +#[derive(Default)] pub enum ResolutionStatus { Resolved, Unresolved, Escalated, Pending, + #[default] Unknown, } -impl Default for ResolutionStatus { - fn default() -> Self 
{ - Self::Unknown - } -} #[derive(Debug, Clone)] pub struct EpisodicMemoryConfig { diff --git a/src/basic/keywords/errors/on_error.rs b/src/basic/keywords/errors/on_error.rs index 920f9d60d..b69ec79c5 100644 --- a/src/basic/keywords/errors/on_error.rs +++ b/src/basic/keywords/errors/on_error.rs @@ -71,15 +71,20 @@ pub fn register_on_error_keywords(_state: Arc, _user: UserSession, eng engine .register_custom_syntax( - ["ON", "ERROR", "GOTO", "0"], + ["ON", "ERROR", "GOTO", "$ident$"], false, - move |_context, _inputs| { - trace!("ON ERROR GOTO 0 - Error handling disabled"); - set_error_resume_next(false); + move |context, inputs| { + let label = context.eval_expression_tree(&inputs[0])?.to_string(); + if label == "0" { + trace!("ON ERROR GOTO 0 - Error handling disabled"); + set_error_resume_next(false); + } else { + trace!("ON ERROR GOTO {} - Error handler set", label); + } Ok(Dynamic::UNIT) }, ) - .expect("Failed to register ON ERROR GOTO 0"); + .expect("Failed to register ON ERROR GOTO"); engine .register_custom_syntax(["CLEAR", "ERROR"], false, move |_context, _inputs| { diff --git a/src/basic/keywords/events.rs b/src/basic/keywords/events.rs index a124c4dca..92aa8310f 100644 --- a/src/basic/keywords/events.rs +++ b/src/basic/keywords/events.rs @@ -146,7 +146,7 @@ async fn publish_event( if let Some(redis_client) = &state.cache { if let Ok(mut redis_conn) = redis_client.get_multiplexed_async_connection().await { let channel = format!("events:{event_name}"); - let _: Result<(), _> = redis_conn.publish(&channel, &new_event.id.to_string()).await; + let _: Result<(), _> = redis_conn.publish(&channel, new_event.id.to_string()).await; } } diff --git a/src/basic/keywords/hear_talk.rs b/src/basic/keywords/hear_talk.rs index ac86faf20..a6274e5f3 100644 --- a/src/basic/keywords/hear_talk.rs +++ b/src/basic/keywords/hear_talk.rs @@ -8,6 +8,9 @@ use serde::{Deserialize, Serialize}; use std::sync::Arc; use uuid::Uuid; +// Import the send_message_to_recipient function from 
universal_messaging +use super::universal_messaging::send_message_to_recipient; + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum InputType { Any, @@ -1082,7 +1085,7 @@ pub async fn execute_talk( session_id: user_session.id.to_string(), channel: "web".to_string(), content: message, - message_type: MessageType::USER, + message_type: MessageType::BOT_RESPONSE, stream_token: None, is_complete: true, suggestions, @@ -1111,8 +1114,42 @@ pub async fn execute_talk( pub fn talk_keyword(state: Arc, user: UserSession, engine: &mut Engine) { let state_clone = Arc::clone(&state); - let user_clone = user; + let user_clone = user.clone(); + // Register TALK TO "recipient", "message" syntax FIRST (more specific pattern) + let state_clone2 = Arc::clone(&state); + let user_clone2 = user.clone(); + + engine + .register_custom_syntax( + ["TALK", "TO", "$expr$", ",", "$expr$"], + true, + move |context, inputs| { + let recipient = context.eval_expression_tree(&inputs[0])?.to_string(); + let message = context.eval_expression_tree(&inputs[1])?.to_string(); + + trace!("TALK TO: Sending message to {}", recipient); + + let state_for_send = Arc::clone(&state_clone2); + let user_for_send = user_clone2.clone(); + + tokio::spawn(async move { + if let Err(e) = send_message_to_recipient( + state_for_send, + &user_for_send, + &recipient, + &message, + ).await { + error!("Failed to send TALK TO message: {}", e); + } + }); + + Ok(Dynamic::UNIT) + }, + ) + .expect("valid syntax registration"); + + // Register simple TALK "message" syntax SECOND (fallback pattern) engine .register_custom_syntax(["TALK", "$expr$"], true, move |context, inputs| { let message = context.eval_expression_tree(&inputs[0])?.to_string(); diff --git a/src/basic/keywords/knowledge_graph.rs b/src/basic/keywords/knowledge_graph.rs index f43da2885..80810ba01 100644 --- a/src/basic/keywords/knowledge_graph.rs +++ b/src/basic/keywords/knowledge_graph.rs @@ -30,18 +30,15 @@ pub struct KgEntity { #[derive(Debug, 
Clone, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "lowercase")] +#[derive(Default)] pub enum EntitySource { + #[default] Manual, Extracted, Imported, Inferred, } -impl Default for EntitySource { - fn default() -> Self { - Self::Manual - } -} #[derive(Debug, Clone, Serialize, Deserialize)] pub struct KgRelationship { diff --git a/src/basic/keywords/model_routing.rs b/src/basic/keywords/model_routing.rs index 4baf492e6..8a5b5f6b4 100644 --- a/src/basic/keywords/model_routing.rs +++ b/src/basic/keywords/model_routing.rs @@ -19,18 +19,15 @@ pub struct ModelConfig { } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Default)] pub enum RoutingStrategy { + #[default] Manual, Auto, LoadBalanced, Fallback, } -impl Default for RoutingStrategy { - fn default() -> Self { - Self::Manual - } -} #[derive(Debug, Clone)] pub struct ModelRouter { diff --git a/src/basic/keywords/set_schedule.rs b/src/basic/keywords/set_schedule.rs index 535ae8702..1acfc6f04 100644 --- a/src/basic/keywords/set_schedule.rs +++ b/src/basic/keywords/set_schedule.rs @@ -8,7 +8,7 @@ pub fn parse_natural_schedule(input: &str) -> Result { let input = input.trim().to_lowercase(); let parts: Vec<&str> = input.split_whitespace().collect(); - if parts.len() == 5 && is_cron_expression(&parts) { + if (parts.len() == 5 || parts.len() == 6) && is_cron_expression(&parts) { return Ok(input); } @@ -16,9 +16,14 @@ pub fn parse_natural_schedule(input: &str) -> Result { } fn is_cron_expression(parts: &[&str]) -> bool { + if parts.len() != 5 && parts.len() != 6 { + return false; + } + parts.iter().all(|part| { - part.chars() - .all(|c| c.is_ascii_digit() || c == '*' || c == '/' || c == '-' || c == ',') + part.chars().all(|c| { + c.is_ascii_digit() || c == '*' || c == '/' || c == '-' || c == ',' || c.is_ascii_alphabetic() + }) }) } diff --git a/src/basic/keywords/universal_messaging.rs b/src/basic/keywords/universal_messaging.rs index a1d5a0fab..14e5f6320 100644 --- 
a/src/basic/keywords/universal_messaging.rs +++ b/src/basic/keywords/universal_messaging.rs @@ -10,12 +10,14 @@ use serde_json::json; use std::sync::Arc; pub fn register_universal_messaging(state: Arc, user: UserSession, engine: &mut Engine) { - register_talk_to(state.clone(), user.clone(), engine); register_send_file_to(state.clone(), user.clone(), engine); register_send_to(state.clone(), user.clone(), engine); register_broadcast(state, user, engine); } +// DEPRECATED: TALK TO functionality moved to hear_talk.rs talk_keyword function +// to avoid syntax conflicts between TALK and TALK TO +/* fn register_talk_to(state: Arc, user: UserSession, engine: &mut Engine) { let state_clone = Arc::clone(&state); @@ -50,6 +52,7 @@ fn register_talk_to(state: Arc, user: UserSession, engine: &mut Engine ) .expect("valid syntax registration"); } +*/ fn register_send_file_to(state: Arc, user: UserSession, engine: &mut Engine) { let state_clone = Arc::clone(&state); @@ -179,7 +182,7 @@ fn register_broadcast(state: Arc, user: UserSession, engine: &mut Engi .expect("valid syntax registration"); } -async fn send_message_to_recipient( +pub async fn send_message_to_recipient( state: Arc, user: &UserSession, recipient: &str, @@ -461,7 +464,7 @@ async fn send_instagram_file( let file_key = format!("temp/instagram/{}_{}.bin", user.id, uuid::Uuid::new_v4()); - if let Some(s3) = &state.s3_client { + if let Some(s3) = &state.drive { s3.put_object() .bucket("uploads") .key(&file_key) @@ -483,7 +486,7 @@ async fn send_instagram_file( tokio::spawn(async move { tokio::time::sleep(tokio::time::Duration::from_secs(3600)).await; - if let Some(s3) = &state.s3_client { + if let Some(s3) = &state.drive { let _ = s3 .delete_object() .bucket("uploads") diff --git a/src/basic/keywords/use_website.rs b/src/basic/keywords/use_website.rs index d5286f25c..b36f8f82f 100644 --- a/src/basic/keywords/use_website.rs +++ b/src/basic/keywords/use_website.rs @@ -48,7 +48,8 @@ pub fn use_website_keyword(state: Arc, 
user: UserSession, engine: &mut let state_clone = Arc::clone(&state); let user_clone = user.clone(); - // Register syntax for USE WEBSITE "url" REFRESH "interval" + // Register syntax for USE WEBSITE "url" REFRESH "interval" (case insensitive) + // Register both uppercase and lowercase variants engine .register_custom_syntax( ["USE", "WEBSITE", "$expr$", "REFRESH", "$expr$"], @@ -202,6 +203,240 @@ pub fn use_website_keyword(state: Arc, user: UserSession, engine: &mut .expect("valid syntax registration"); } +/// Register USE_WEBSITE as a regular function instead of custom syntax +/// This avoids conflicts with other USE keywords (USE MODEL, USE KB, etc.) +pub fn register_use_website_function(state: Arc, user: UserSession, engine: &mut Engine) { + let state_clone = Arc::clone(&state); + let user_clone = user.clone(); + + // Register USE_WEBSITE(url, refresh) with both parameters (uppercase) + engine.register_fn( + "USE_WEBSITE", + move |url: &str, refresh: &str| -> Dynamic { + trace!( + "USE_WEBSITE function called: {} REFRESH {} for session: {}", + url, + refresh, + user_clone.id + ); + + let is_valid = url.starts_with("http://") || url.starts_with("https://"); + if !is_valid { + return Dynamic::from(format!( + "ERROR: Invalid URL format: {}. 
Must start with http:// or https://", + url + )); + } + + let state_for_task = Arc::clone(&state_clone); + let user_for_task = user_clone.clone(); + let url_for_task = url.to_string(); + let refresh_for_task = refresh.to_string(); + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let _rt = match tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + { + Ok(_rt) => _rt, + Err(e) => { + let _ = tx.send(Err(format!("Failed to build tokio runtime: {}", e))); + return; + } + }; + let result = associate_website_with_session_refresh( + &state_for_task, + &user_for_task, + &url_for_task, + &refresh_for_task, + ); + let _ = tx.send(result); + }); + + match rx.recv_timeout(std::time::Duration::from_secs(10)) { + Ok(Ok(message)) => Dynamic::from(message), + Ok(Err(e)) => Dynamic::from(format!("ERROR: {}", e)), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Dynamic::from("ERROR: USE_WEBSITE timed out") + } + Err(e) => Dynamic::from(format!("ERROR: USE_WEBSITE failed: {}", e)), + } + }, + ); + + let state_clone2 = Arc::clone(&state); + let user_clone2 = user.clone(); + + // Register use_website(url, refresh) with both parameters (lowercase for preprocessor) + engine.register_fn( + "use_website", + move |url: &str, refresh: &str| -> Dynamic { + trace!( + "use_website function called: {} REFRESH {} for session: {}", + url, + refresh, + user_clone2.id + ); + + let is_valid = url.starts_with("http://") || url.starts_with("https://"); + if !is_valid { + return Dynamic::from(format!( + "ERROR: Invalid URL format: {}. 
Must start with http:// or https://", + url + )); + } + + let state_for_task = Arc::clone(&state_clone2); + let user_for_task = user_clone2.clone(); + let url_for_task = url.to_string(); + let refresh_for_task = refresh.to_string(); + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let _rt = match tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + { + Ok(_rt) => _rt, + Err(e) => { + let _ = tx.send(Err(format!("Failed to build tokio runtime: {}", e))); + return; + } + }; + let result = associate_website_with_session_refresh( + &state_for_task, + &user_for_task, + &url_for_task, + &refresh_for_task, + ); + let _ = tx.send(result); + }); + + match rx.recv_timeout(std::time::Duration::from_secs(10)) { + Ok(Ok(message)) => Dynamic::from(message), + Ok(Err(e)) => Dynamic::from(format!("ERROR: {}", e)), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Dynamic::from("ERROR: use_website timed out") + } + Err(e) => Dynamic::from(format!("ERROR: use_website failed: {}", e)), + } + }, + ); + + let state_clone3 = Arc::clone(&state); + let user_clone3 = user.clone(); + + // Register USE_WEBSITE(url) with just URL (default refresh) + engine.register_fn("USE_WEBSITE", move |url: &str| -> Dynamic { + trace!( + "USE_WEBSITE function called: {} for session: {}", + url, + user_clone3.id + ); + + let is_valid = url.starts_with("http://") || url.starts_with("https://"); + if !is_valid { + return Dynamic::from(format!( + "ERROR: Invalid URL format: {}. 
Must start with http:// or https://", + url + )); + } + + let state_for_task = Arc::clone(&state_clone3); + let user_for_task = user_clone3.clone(); + let url_for_task = url.to_string(); + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let _rt = match tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + { + Ok(_rt) => _rt, + Err(e) => { + let _ = tx.send(Err(format!("Failed to build tokio runtime: {}", e))); + return; + } + }; + let result = associate_website_with_session( + &state_for_task, + &user_for_task, + &url_for_task, + ); + let _ = tx.send(result); + }); + + match rx.recv_timeout(std::time::Duration::from_secs(10)) { + Ok(Ok(message)) => Dynamic::from(message), + Ok(Err(e)) => Dynamic::from(format!("ERROR: {}", e)), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Dynamic::from("ERROR: USE_WEBSITE timed out") + } + Err(e) => Dynamic::from(format!("ERROR: USE_WEBSITE failed: {}", e)), + } + }); + + let state_clone4 = Arc::clone(&state); + let user_clone4 = user; + + // Register use_website(url) with just URL (default refresh, lowercase) + engine.register_fn("use_website", move |url: &str| -> Dynamic { + trace!( + "use_website function called: {} for session: {}", + url, + user_clone4.id + ); + + let is_valid = url.starts_with("http://") || url.starts_with("https://"); + if !is_valid { + return Dynamic::from(format!( + "ERROR: Invalid URL format: {}. 
Must start with http:// or https://", + url + )); + } + + let state_for_task = Arc::clone(&state_clone4); + let user_for_task = user_clone4.clone(); + let url_for_task = url.to_string(); + let (tx, rx) = std::sync::mpsc::channel(); + + std::thread::spawn(move || { + let _rt = match tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + { + Ok(_rt) => _rt, + Err(e) => { + let _ = tx.send(Err(format!("Failed to build tokio runtime: {}", e))); + return; + } + }; + let result = associate_website_with_session( + &state_for_task, + &user_for_task, + &url_for_task, + ); + let _ = tx.send(result); + }); + + match rx.recv_timeout(std::time::Duration::from_secs(10)) { + Ok(Ok(message)) => Dynamic::from(message), + Ok(Err(e)) => Dynamic::from(format!("ERROR: {}", e)), + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { + Dynamic::from("ERROR: use_website timed out") + } + Err(e) => Dynamic::from(format!("ERROR: use_website failed: {}", e)), + } + }); + + info!("Registered USE_WEBSITE and use_website as function (preprocessed from USE WEBSITE)"); +} + fn associate_website_with_session( state: &AppState, user: &UserSession, @@ -220,7 +455,19 @@ fn associate_website_with_session_refresh( let mut conn = state.conn.get().map_err(|e| format!("DB error: {}", e))?; - let collection_name = format!("website_{}", sanitize_url_for_collection(url)); + // Get bot name for collection naming + #[derive(QueryableByName)] + struct BotName { + #[diesel(sql_type = diesel::sql_types::Text)] + name: String, + } + + let bot_name_result: BotName = diesel::sql_query("SELECT name FROM bots WHERE id = $1") + .bind::(&user.bot_id) + .get_result(&mut conn) + .map_err(|e| format!("Failed to get bot name: {}", e))?; + + let collection_name = format!("{}_website_{}", bot_name_result.name, sanitize_url_for_collection(url)); let website_status = check_website_crawl_status(&mut conn, &user.bot_id, url)?; @@ -276,8 +523,8 @@ fn check_website_crawl_status( ) -> Result { 
#[derive(QueryableByName)] struct CrawlStatus { - #[diesel(sql_type = diesel::sql_types::Nullable)] - crawl_status: Option, + #[diesel(sql_type = diesel::sql_types::Nullable)] + crawl_status: Option, } let query = diff --git a/src/basic/mod.rs b/src/basic/mod.rs index 4b77f55ae..412841269 100644 --- a/src/basic/mod.rs +++ b/src/basic/mod.rs @@ -49,7 +49,7 @@ use self::keywords::last::last_keyword; use self::keywords::on_form_submit::on_form_submit_keyword; use self::keywords::switch_case::preprocess_switch; use self::keywords::use_tool::use_tool_keyword; -use self::keywords::use_website::{clear_websites_keyword, use_website_keyword}; +use self::keywords::use_website::{clear_websites_keyword, register_use_website_function}; use self::keywords::web_data::register_web_data_keywords; #[cfg(feature = "automation")] use self::keywords::webhook::webhook_keyword; @@ -157,7 +157,6 @@ impl ScriptService { clear_suggestions_keyword(state.clone(), user.clone(), &mut engine); use_tool_keyword(state.clone(), user.clone(), &mut engine); clear_tools_keyword(state.clone(), user.clone(), &mut engine); - use_website_keyword(state.clone(), user.clone(), &mut engine); clear_websites_keyword(state.clone(), user.clone(), &mut engine); #[cfg(feature = "chat")] add_suggestion_keyword(state.clone(), user.clone(), &mut engine); @@ -165,7 +164,7 @@ impl ScriptService { add_member_keyword(state.clone(), user.clone(), &mut engine); #[cfg(feature = "chat")] register_bot_keywords(&state, &user, &mut engine); - + // ===== WORKFLOW ORCHESTRATION KEYWORDS ===== keywords::orchestration::register_orchestrate_workflow(state.clone(), user.clone(), &mut engine); keywords::orchestration::register_step_keyword(state.clone(), user.clone(), &mut engine); @@ -175,7 +174,7 @@ impl ScriptService { keywords::enhanced_memory::register_bot_share_memory(state.clone(), user.clone(), &mut engine); keywords::enhanced_memory::register_bot_sync_memory(state.clone(), user.clone(), &mut engine); 
keywords::enhanced_llm::register_enhanced_llm_keyword(state.clone(), user.clone(), &mut engine); - + keywords::universal_messaging::register_universal_messaging( state.clone(), user.clone(), @@ -224,6 +223,11 @@ impl ScriptService { save_from_unstructured_keyword(state.clone(), user.clone(), &mut engine); } + // Register USE WEBSITE after all other USE keywords to avoid conflicts + // USE WEBSITE is now preprocessed to USE_WEBSITE function call + // Register it as a regular function instead of custom syntax + register_use_website_function(state.clone(), user.clone(), &mut engine); + // ===== VECTORDB FEATURE KEYWORDS ===== #[cfg(feature = "vectordb")] { @@ -311,6 +315,10 @@ impl ScriptService { let _ = self; // silence unused self warning - kept for API consistency let script = preprocess_switch(script); + // Convert ALL multi-word keywords to underscore versions (e.g., "USE WEBSITE" → "USE_WEBSITE") + // This avoids Rhai custom syntax conflicts and makes the system more secure + let script = Self::convert_multiword_keywords(&script); + let script = Self::normalize_variables_to_lowercase(&script); let mut result = String::new(); @@ -787,6 +795,8 @@ impl ScriptService { "DESCRIPTION", "PARAM", "REQUIRED", + "WEBSITE", + "MODEL", ]; let _identifier_re = Regex::new(r"([a-zA-Z_][a-zA-Z0-9_]*)").expect("valid regex"); @@ -796,8 +806,6 @@ impl ScriptService { if trimmed.starts_with("REM") || trimmed.starts_with('\'') || trimmed.starts_with("//") { - result.push_str(line); - result.push('\n'); continue; } @@ -848,6 +856,161 @@ impl ScriptService { result } + /// Convert ALL multi-word keywords to underscore versions (function calls) + /// This avoids Rhai custom syntax conflicts and makes the system more secure + /// + /// Examples: + /// - "USE WEBSITE "url"" → "USE_WEBSITE("url")" + /// - "USE WEBSITE "url" REFRESH "interval"" → "USE_WEBSITE("url", "interval")" + /// - "SET BOT MEMORY key AS value" → "SET_BOT_MEMORY(key, value)" + /// - "CLEAR SUGGESTIONS" → 
"CLEAR_SUGGESTIONS()" + fn convert_multiword_keywords(script: &str) -> String { + use regex::Regex; + + // Known multi-word keywords with their conversion patterns + // Format: (keyword_pattern, min_params, max_params, param_names) + let multiword_patterns = vec![ + // USE family + (r#"USE\s+WEBSITE"#, 1, 2, vec!["url", "refresh"]), + (r#"USE\s+MODEL"#, 1, 1, vec!["model"]), + (r#"USE\s+KB"#, 1, 1, vec!["kb_name"]), + (r#"USE\s+TOOL"#, 1, 1, vec!["tool_path"]), + + // SET family + (r#"SET\s+BOT\s+MEMORY"#, 2, 2, vec!["key", "value"]), + (r#"SET\s+CONTEXT"#, 2, 2, vec!["key", "value"]), + (r#"SET\s+USER"#, 1, 1, vec!["user_id"]), + + // GET family + (r#"GET\s+BOT\s+MEMORY"#, 1, 1, vec!["key"]), + + // CLEAR family + (r#"CLEAR\s+SUGGESTIONS"#, 0, 0, vec![]), + (r#"CLEAR\s+TOOLS"#, 0, 0, vec![]), + (r#"CLEAR\s+WEBSITES"#, 0, 0, vec![]), + + // ADD family + (r#"ADD\s+SUGGESTION"#, 2, 2, vec!["title", "text"]), + (r#"ADD\s+MEMBER"#, 2, 2, vec!["name", "role"]), + + // CREATE family + (r#"CREATE\s+TASK"#, 1, 1, vec!["task"]), + (r#"CREATE\s+DRAFT"#, 4, 4, vec!["to", "subject", "body", "attachments"]), + (r#"CREATE\s+SITE"#, 1, 1, vec!["site"]), + + // ON family + (r#"ON\s+FORM\s+SUBMIT"#, 1, 1, vec!["form"]), + (r#"ON\s+EMAIL"#, 1, 1, vec!["filter"]), + (r#"ON\s+EVENT"#, 1, 1, vec!["event"]), + + // SEND family + (r#"SEND\s+MAIL"#, 4, 4, vec!["to", "subject", "body", "attachments"]), + + // BOOK (calendar) + (r#"BOOK"#, 1, 1, vec!["event"]), + ]; + + let mut result = String::new(); + + for line in script.lines() { + let trimmed = line.trim(); + let mut converted = false; + + // Try each pattern + for (pattern, min_params, max_params, _param_names) in &multiword_patterns { + // Build regex pattern: KEYWORD params... 
+ // Handle quoted strings and unquoted identifiers + let regex_str = format!( + r#"(?i)^\s*{}\s+(.*?)(?:\s*)$"#, + pattern + ); + + if let Ok(re) = Regex::new(®ex_str) { + if let Some(caps) = re.captures(trimmed) { + if let Some(params_str) = caps.get(1) { + let params = Self::parse_parameters(params_str.as_str()); + let param_count = params.len(); + + // Validate parameter count + if param_count >= *min_params && param_count <= *max_params { + // Convert keyword to underscores + let keyword = pattern.replace(r"\s+", "_"); + + // Build function call + let params_str = if params.is_empty() { + String::new() + } else { + params.join(", ") + }; + + result.push_str(&format!("{}({});", keyword, params_str)); + result.push('\n'); + converted = true; + break; + } + } + } + } + } + + // If not converted, keep original line + if !converted { + result.push_str(line); + result.push('\n'); + } + } + + result + } + + /// Parse parameters from a keyword line + /// Handles quoted strings, AS keyword, and comma-separated values + fn parse_parameters(params_str: &str) -> Vec { + let mut params = Vec::new(); + let mut current = String::new(); + let mut in_quotes = false; + let mut quote_char = '"'; + let mut chars = params_str.chars().peekable(); + + while let Some(c) = chars.next() { + match c { + '"' | '\'' if !in_quotes => { + in_quotes = true; + quote_char = c; + current.push(c); + } + '"' | '\'' if in_quotes && c == quote_char => { + in_quotes = false; + current.push(c); + } + ' ' | '\t' if !in_quotes => { + // End of parameter if we have content + if !current.is_empty() { + params.push(current.trim().to_string()); + current = String::new(); + } + } + ',' if !in_quotes => { + // Comma separator + if !current.is_empty() { + params.push(current.trim().to_string()); + current = String::new(); + } + } + _ => { + current.push(c); + } + } + } + + // Don't forget the last parameter + if !current.is_empty() { + params.push(current.trim().to_string()); + } + + params + } + fn 
normalize_word(word: &str, keywords: &[&str]) -> String { let upper = word.to_uppercase(); diff --git a/src/core/automation/mod.rs b/src/core/automation/mod.rs index 2ca0bd09d..0414228a4 100644 --- a/src/core/automation/mod.rs +++ b/src/core/automation/mod.rs @@ -4,11 +4,31 @@ use crate::shared::state::AppState; use chrono::Utc; use cron::Schedule; use diesel::prelude::*; -use log::error; +use log::{error, trace}; use std::str::FromStr; use std::sync::Arc; use tokio::time::{interval, Duration}; +/// Normalizes a cron schedule by converting 6-field (with seconds) to 5-field format. +/// If the schedule already has 5 fields or is in natural language format, it's returned as-is. +fn normalize_cron_schedule(schedule: &str) -> String { + let trimmed = schedule.trim(); + let parts: Vec<&str> = trimmed.split_whitespace().collect(); + + let result = match parts.len() { + // 6 fields: assume seconds format, remove seconds + 6 => parts[1..].join(" "), + // 4 fields: missing day-of-week, add "*" + 4 => format!("{} *", parts.join(" ")), + // 5 fields: standard format + 5 => parts.join(" "), + // Invalid: return as-is and let cron parser handle the error + _ => trimmed.to_string(), + }; + + result.trim().to_string() +} + #[cfg(feature = "vectordb")] pub use crate::vector_db::vectordb_indexer::{IndexingStats, IndexingStatus, VectorDBIndexer}; @@ -32,7 +52,9 @@ impl AutomationService { } } } - pub async fn check_scheduled_tasks(&self) -> Result<(), Box> { + pub async fn check_scheduled_tasks( + &self, + ) -> Result<(), Box> { use crate::shared::models::system_automations::dsl::{ id, is_active, kind, last_triggered as lt_column, system_automations, }; @@ -47,7 +69,9 @@ impl AutomationService { .load::(&mut conn)?; for automation in automations { if let Some(schedule_str) = &automation.schedule { - match Schedule::from_str(schedule_str.trim()) { + let normalized_schedule = normalize_cron_schedule(schedule_str); + trace!("Parsing schedule: original='{}', normalized='{}'", 
schedule_str, normalized_schedule); + match Schedule::from_str(&normalized_schedule) { Ok(parsed_schedule) => { let now = Utc::now(); let next_run = parsed_schedule.upcoming(Utc).next(); @@ -76,9 +100,9 @@ impl AutomationService { } } Err(e) => { - error!( - "Error parsing schedule for automation {} ({}): {}", - automation.id, schedule_str, e + trace!( + "Skipping automation {} with invalid schedule (original: {}, normalized: {}): {}", + automation.id, schedule_str, normalized_schedule, e ); } } @@ -120,7 +144,6 @@ impl AutomationService { }; let mut script_service = ScriptService::new(Arc::clone(&self.state), session); - script_service.load_bot_config_params(&self.state, automation.bot_id); match script_service.compile(&script_content) { diff --git a/src/core/bootstrap/mod.rs b/src/core/bootstrap/mod.rs index 9bc155fae..13d20bcc6 100644 --- a/src/core/bootstrap/mod.rs +++ b/src/core/bootstrap/mod.rs @@ -2019,6 +2019,29 @@ VAULT_CACHE_TTL=300 info!("Created database '{}' for bot '{}'", safe_db_name, bot_name); } + // Sync config.csv for this bot if it exists + let templates_dir = std::path::PathBuf::from("./bottemplates"); + let bot_template_dir = templates_dir.join(format!("{}.gbai", bot_name)); + let config_path = bot_template_dir.join(format!("{}.gbot/config.csv", bot_name)); + + if config_path.exists() { + match std::fs::read_to_string(&config_path) { + Ok(csv_content) => { + debug!("Syncing config.csv from {}", config_path.display()); + if let Err(e) = Self::sync_config_csv_to_db(conn, &bot_id, &csv_content) { + error!("Failed to sync config.csv for bot '{}': {}", bot_name, e); + } else { + info!("Synced config.csv for bot '{}'", bot_name); + } + } + Err(e) => { + warn!("Could not read config.csv for bot '{}': {}", bot_name, e); + } + } + } else { + debug!("No config.csv found at {}", config_path.display()); + } + Ok(bot_id) } @@ -2212,7 +2235,7 @@ VAULT_CACHE_TTL=300 match diesel::sql_query( "INSERT INTO bot_configuration (id, bot_id, config_key, 
config_value, config_type, created_at, updated_at) \ VALUES ($1, $2, $3, $4, 'string', NOW(), NOW()) \ - ON CONFLICT (bot_id, config_key) DO UPDATE SET config_value = EXCLUDED.config_value, updated_at = NOW()" + ON CONFLICT (bot_id, config_key) DO NOTHING" ) .bind::(new_id) .bind::(bot_id) diff --git a/src/core/bot/kb_context.rs b/src/core/bot/kb_context.rs index 6d9521abe..4ec63ad58 100644 --- a/src/core/bot/kb_context.rs +++ b/src/core/bot/kb_context.rs @@ -16,6 +16,13 @@ pub struct SessionKbAssociation { pub is_active: bool, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SessionWebsiteAssociation { + pub website_url: String, + pub collection_name: String, + pub is_active: bool, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct KbContext { pub kb_name: String, @@ -80,6 +87,38 @@ impl KbContextManager { .collect()) } + pub fn get_active_websites(&self, session_id: Uuid) -> Result> { + let mut conn = self.db_pool.get()?; + + let query = diesel::sql_query( + "SELECT website_url, collection_name, is_active + FROM session_website_associations + WHERE session_id = $1 AND is_active = true", + ) + .bind::(session_id); + + #[derive(QueryableByName)] + struct WebsiteAssocRow { + #[diesel(sql_type = diesel::sql_types::Text)] + website_url: String, + #[diesel(sql_type = diesel::sql_types::Text)] + collection_name: String, + #[diesel(sql_type = diesel::sql_types::Bool)] + is_active: bool, + } + + let rows: Vec = query.load(&mut conn)?; + + Ok(rows + .into_iter() + .map(|row| SessionWebsiteAssociation { + website_url: row.website_url, + collection_name: row.collection_name, + is_active: row.is_active, + }) + .collect()) + } + pub async fn search_active_kbs( &self, session_id: Uuid, @@ -140,6 +179,116 @@ impl KbContextManager { Ok(kb_contexts) } + pub async fn search_active_websites( + &self, + session_id: Uuid, + query: &str, + max_results_per_website: usize, + max_total_tokens: usize, + ) -> Result> { + let active_websites = 
self.get_active_websites(session_id)?; + + if active_websites.is_empty() { + debug!("No active websites for session {}", session_id); + return Ok(Vec::new()); + } + + info!( + "Searching {} active websites for session {}: {:?}", + active_websites.len(), + session_id, + active_websites.iter().map(|w| &w.website_url).collect::>() + ); + + let mut kb_contexts = Vec::new(); + let mut total_tokens_used = 0; + + for website_assoc in active_websites { + if total_tokens_used >= max_total_tokens { + warn!("Reached max token limit, skipping remaining websites"); + break; + } + + match self + .search_single_collection( + &website_assoc.collection_name, + &website_assoc.website_url, + query, + max_results_per_website, + max_total_tokens - total_tokens_used, + ) + .await + { + Ok(context) => { + total_tokens_used += context.total_tokens; + info!( + "Found {} results from website '{}' using {} tokens", + context.search_results.len(), + context.kb_name, + context.total_tokens + ); + kb_contexts.push(context); + } + Err(e) => { + error!("Failed to search website '{}': {}", website_assoc.website_url, e); + } + } + } + + Ok(kb_contexts) + } + + async fn search_single_collection( + &self, + collection_name: &str, + display_name: &str, + query: &str, + max_results: usize, + max_tokens: usize, + ) -> Result { + debug!("Searching collection '{}' with query: {}", collection_name, query); + + let search_results = self + .kb_manager + .search_collection(collection_name, query, max_results) + .await?; + + let mut kb_search_results = Vec::new(); + let mut total_tokens = 0; + + for result in search_results { + let tokens = estimate_tokens(&result.content); + + if total_tokens + tokens > max_tokens { + debug!( + "Skipping result due to token limit ({} + {} > {})", + total_tokens, tokens, max_tokens + ); + break; + } + + kb_search_results.push(KbSearchResult { + content: result.content, + document_path: result.document_path, + score: result.score, + chunk_tokens: tokens, + }); + + total_tokens 
+= tokens; + + if result.score < 0.6 { + debug!("Skipping low-relevance result (score: {})", result.score); + break; + } + } + + Ok(KbContext { + kb_name: display_name.to_string(), + search_results: kb_search_results, + total_tokens, + }) + } + async fn search_single_kb( &self, bot_name: &str, @@ -204,7 +353,7 @@ impl KbContextManager { } context_parts.push(format!( - "\n## From '{}' knowledge base:", + "\n## From '{}':", kb_context.kb_name )); @@ -223,7 +372,10 @@ impl KbContextManager { } context_parts.push("\n--- End Knowledge Base Context ---\n".to_string()); - context_parts.join("\n") + let full_context = context_parts.join("\n"); + + // Truncate KB context to fit within token limits (max 400 tokens for KB context) + crate::core::shared::utils::truncate_text_for_model(&full_context, "local", 400) } pub fn get_active_tools(&self, session_id: Uuid) -> Result> { @@ -260,25 +412,32 @@ pub async fn inject_kb_context( messages: &mut serde_json::Value, max_context_tokens: usize, ) -> Result<()> { - let context_manager = KbContextManager::new(kb_manager, db_pool); + let context_manager = KbContextManager::new(kb_manager.clone(), db_pool.clone()); let kb_contexts = context_manager - .search_active_kbs(session_id, bot_name, user_query, 5, max_context_tokens) + .search_active_kbs(session_id, bot_name, user_query, 5, max_context_tokens / 2) .await?; - if kb_contexts.is_empty() { - debug!("No KB context found for session {}", session_id); + let website_contexts = context_manager + .search_active_websites(session_id, user_query, 5, max_context_tokens / 2) + .await?; + + let mut all_contexts = kb_contexts; + all_contexts.extend(website_contexts); + + if all_contexts.is_empty() { + debug!("No KB or website context found for session {}", session_id); return Ok(()); } - let context_string = context_manager.build_context_string(&kb_contexts); + let context_string = context_manager.build_context_string(&all_contexts); if context_string.is_empty() { return Ok(()); } info!( - 
"Injecting {} characters of KB context into prompt for session {}", + "Injecting {} characters of KB/website context into prompt for session {}", context_string.len(), session_id ); diff --git a/src/core/bot/mod.rs b/src/core/bot/mod.rs index ed21b517e..fd1d0eb71 100644 --- a/src/core/bot/mod.rs +++ b/src/core/bot/mod.rs @@ -26,6 +26,7 @@ use diesel::ExpressionMethods; use diesel::PgConnection; use diesel::QueryDsl; use diesel::RunQueryDsl; +use diesel::TextExpressionMethods; use futures::{sink::SinkExt, stream::StreamExt}; #[cfg(feature = "llm")] use log::trace; @@ -36,6 +37,7 @@ use std::sync::Arc; use tokio::sync::mpsc; use tokio::sync::Mutex as AsyncMutex; use uuid::Uuid; +use serde::{Deserialize, Serialize}; pub mod channels; pub mod multimedia; @@ -86,6 +88,70 @@ pub struct BotOrchestrator { pub mounted_bots: Arc>>>, } +#[derive(Debug, Deserialize)] +pub struct BotConfigQuery { + pub bot_name: Option, +} + +#[derive(Debug, Serialize)] +pub struct BotConfigResponse { + pub public: bool, +} + +/// Get bot configuration endpoint +/// Returns bot's public setting and other configuration values +pub async fn get_bot_config( + Query(params): Query, + State(state): State>, +) -> Result, StatusCode> { + let bot_name = params.bot_name.unwrap_or_else(|| "default".to_string()); + + let mut conn = match state.conn.get() { + Ok(c) => c, + Err(e) => { + error!("Failed to get database connection: {}", e); + return Err(StatusCode::INTERNAL_SERVER_ERROR); + } + }; + + // Query bot_configuration table for this bot's public setting + use crate::shared::models::schema::bot_configuration::dsl::*; + + let mut is_public = false; + + match bot_configuration + .select((config_key, config_value)) + .filter(config_key.like(format!("{}%", bot_name))) + .filter(config_key.like("%public%")) + .load::<(String, String)>(&mut conn) + { + Ok(configs) => { + for (key, value) in configs { + let key: String = key; + let value: String = value; + // Check if this is the public setting + let 
clean_key = key.strip_prefix(&format!("{}.", bot_name)) + .or_else(|| key.strip_prefix(&format!("{}_", bot_name))) + .unwrap_or(&key); + + if clean_key.eq_ignore_ascii_case("public") { + is_public = value.eq_ignore_ascii_case("true") || value == "1"; + break; + } + } + info!("Retrieved public status for bot '{}': {}", bot_name, is_public); + } + Err(e) => { + warn!("Failed to load public status for bot '{}': {}", bot_name, e); + // Return default (not public) + } + } + + let config_response = BotConfigResponse { public: is_public }; + + Ok(Json(config_response)) +} + impl BotOrchestrator { pub fn new(state: Arc) -> Self { Self { diff --git a/src/core/config_reload.rs b/src/core/config_reload.rs new file mode 100644 index 000000000..9fd536a44 --- /dev/null +++ b/src/core/config_reload.rs @@ -0,0 +1,59 @@ +// Simple config reload endpoint +use axum::{extract::State, http::StatusCode, response::Json}; +use serde_json::{json, Value}; +use std::sync::Arc; +use crate::shared::state::AppState; +use crate::core::config::ConfigManager; + +pub async fn reload_config( + State(state): State>, +) -> Result, StatusCode> { + let config_manager = ConfigManager::new(state.conn.clone()); + + // Get default bot + let conn_arc = state.conn.clone(); + let (default_bot_id, _) = tokio::task::spawn_blocking(move || -> Result<(uuid::Uuid, String), String> { + let mut conn = conn_arc + .get() + .map_err(|e| format!("failed to get db connection: {e}"))?; + Ok(crate::bot::get_default_bot(&mut *conn)) + }) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)? 
+ .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + // Get LLM config + let llm_url = config_manager + .get_config(&default_bot_id, "llm-url", Some("http://localhost:8081")) + .unwrap_or_else(|_| "http://localhost:8081".to_string()); + + let llm_model = config_manager + .get_config(&default_bot_id, "llm-model", Some("local")) + .unwrap_or_else(|_| "local".to_string()); + + let llm_endpoint_path = config_manager + .get_config(&default_bot_id, "llm-endpoint-path", Some("/v1/chat/completions")) + .unwrap_or_else(|_| "/v1/chat/completions".to_string()); + + // Update LLM provider + if let Some(dynamic_llm) = &state.dynamic_llm_provider { + dynamic_llm + .update_from_config(&llm_url, Some(llm_model.clone()), Some(llm_endpoint_path.clone())) + .await; + + Ok(Json(json!({ + "status": "success", + "message": "LLM configuration reloaded", + "config": { + "llm_url": llm_url, + "llm_model": llm_model, + "llm_endpoint_path": llm_endpoint_path + } + }))) + } else { + Ok(Json(json!({ + "status": "error", + "message": "Dynamic LLM provider not available" + }))) + } +} diff --git a/src/core/directory/api.rs b/src/core/directory/api.rs index 03a8bef5f..a8f0ad492 100644 --- a/src/core/directory/api.rs +++ b/src/core/directory/api.rs @@ -76,7 +76,7 @@ pub async fn provision_user_handler( }); } - let s3_client = state.s3_client.clone().map(Arc::new); + let s3_client = state.drive.clone().map(Arc::new); let base_url = state .config .as_ref() @@ -109,7 +109,7 @@ pub async fn deprovision_user_handler( State(state): State>, Path(id): Path, ) -> impl IntoResponse { - let s3_client = state.s3_client.clone().map(Arc::new); + let s3_client = state.drive.clone().map(Arc::new); let base_url = state .config .as_ref() @@ -249,7 +249,7 @@ pub async fn check_services_status(State(state): State>) -> impl I status.database = state.conn.get().is_ok(); - if let Some(s3_client) = &state.s3_client { + if let Some(s3_client) = &state.drive { if let Ok(result) = s3_client.list_buckets().send().await { 
status.drive = result.buckets.is_some(); } diff --git a/src/core/kb/document_processor.rs b/src/core/kb/document_processor.rs index f2510b16c..380e6380d 100644 --- a/src/core/kb/document_processor.rs +++ b/src/core/kb/document_processor.rs @@ -1,5 +1,5 @@ use anyhow::Result; -use log::{debug, error, info, warn}; +use log::{debug, info, warn}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::Path; @@ -111,6 +111,10 @@ impl DocumentProcessor { self.chunk_overlap } + pub fn is_supported_file(&self, path: &Path) -> bool { + DocumentFormat::from_extension(path).is_some() + } + pub async fn process_document(&self, file_path: &Path) -> Result> { if !file_path.exists() { return Err(anyhow::anyhow!("File not found: {}", file_path.display())); @@ -161,12 +165,29 @@ impl DocumentProcessor { } async fn extract_text(&self, file_path: &Path, format: DocumentFormat) -> Result { + // Check file size before processing to prevent memory exhaustion + let metadata = tokio::fs::metadata(file_path).await?; + let file_size = metadata.len() as usize; + + if file_size > format.max_size() { + return Err(anyhow::anyhow!( + "File too large: {} bytes (max: {} bytes)", + file_size, + format.max_size() + )); + } + match format { DocumentFormat::TXT | DocumentFormat::MD => { - let mut file = tokio::fs::File::open(file_path).await?; - let mut contents = String::new(); - file.read_to_string(&mut contents).await?; - Ok(contents) + // Use streaming read for large text files + if file_size > 10 * 1024 * 1024 { // 10MB + self.extract_large_text_file(file_path).await + } else { + let mut file = tokio::fs::File::open(file_path).await?; + let mut contents = String::with_capacity(std::cmp::min(file_size, 1024 * 1024)); + file.read_to_string(&mut contents).await?; + Ok(contents) + } } DocumentFormat::PDF => self.extract_pdf_text(file_path).await, DocumentFormat::DOCX => self.extract_docx_text(file_path).await, @@ -183,6 +204,34 @@ impl DocumentProcessor { } } + async fn 
extract_large_text_file(&self, file_path: &Path) -> Result { + use tokio::io::AsyncBufReadExt; + + let file = tokio::fs::File::open(file_path).await?; + let reader = tokio::io::BufReader::new(file); + let mut lines = reader.lines(); + let mut content = String::new(); + let mut line_count = 0; + const MAX_LINES: usize = 100_000; // Limit lines to prevent memory exhaustion + + while let Some(line) = lines.next_line().await? { + if line_count >= MAX_LINES { + warn!("Truncating large file at {} lines: {}", MAX_LINES, file_path.display()); + break; + } + content.push_str(&line); + content.push('\n'); + line_count += 1; + + // Yield control periodically + if line_count % 1000 == 0 { + tokio::task::yield_now().await; + } + } + + Ok(content) + } + async fn extract_pdf_text(&self, file_path: &Path) -> Result { let file_path_str = file_path.to_string_lossy().to_string(); let cmd_result = SafeCommand::new("pdftotext") @@ -369,7 +418,17 @@ impl DocumentProcessor { fn create_chunks(&self, text: &str, file_path: &Path) -> Vec { let mut chunks = Vec::new(); - let chars: Vec = text.chars().collect(); + + // For very large texts, limit processing to prevent memory exhaustion + const MAX_TEXT_SIZE: usize = 10 * 1024 * 1024; // 10MB + let text_to_process = if text.len() > MAX_TEXT_SIZE { + warn!("Truncating large text to {} chars for chunking: {}", MAX_TEXT_SIZE, file_path.display()); + &text[..MAX_TEXT_SIZE] + } else { + text + }; + + let chars: Vec = text_to_process.chars().collect(); let total_chars = chars.len(); if total_chars == 0 { @@ -386,12 +445,18 @@ impl DocumentProcessor { 1 }; - while start < total_chars { + // Limit maximum number of chunks to prevent memory exhaustion + const MAX_CHUNKS: usize = 1000; + let max_chunks_to_create = std::cmp::min(total_chunks, MAX_CHUNKS); + + while start < total_chars && chunk_index < max_chunks_to_create { let end = std::cmp::min(start + self.chunk_size, total_chars); let mut chunk_end = end; if end < total_chars { - for i in 
(start..end).rev() { + // Find word boundary within reasonable distance + let search_start = std::cmp::max(start, end.saturating_sub(100)); + for i in (search_start..end).rev() { if chars[i].is_whitespace() { chunk_end = i + 1; break; @@ -401,6 +466,12 @@ impl DocumentProcessor { let chunk_content: String = chars[start..chunk_end].iter().collect(); + // Skip empty or very small chunks + if chunk_content.trim().len() < 10 { + start = chunk_end; + continue; + } + chunks.push(TextChunk { content: chunk_content, metadata: ChunkMetadata { @@ -410,7 +481,7 @@ impl DocumentProcessor { .and_then(|s| s.to_str()) .map(|s| s.to_string()), chunk_index, - total_chunks, + total_chunks: max_chunks_to_create, start_char: start, end_char: chunk_end, page_number: None, @@ -430,6 +501,10 @@ impl DocumentProcessor { } } + if chunk_index >= MAX_CHUNKS { + warn!("Truncated chunking at {} chunks for: {}", MAX_CHUNKS, file_path.display()); + } + chunks } @@ -437,8 +512,6 @@ impl DocumentProcessor { &self, kb_path: &Path, ) -> Result>> { - let mut results = HashMap::new(); - if !kb_path.exists() { return Err(anyhow::anyhow!( "Knowledge base folder not found: {}", @@ -448,42 +521,83 @@ impl DocumentProcessor { info!("Processing knowledge base folder: {}", kb_path.display()); - self.process_directory_recursive(kb_path, &mut results) - .await?; - - info!("Processed {} documents in knowledge base", results.len()); + // Process files in small batches to prevent memory exhaustion + let mut results = HashMap::new(); + const BATCH_SIZE: usize = 10; // Much smaller batch size + + let files = self.collect_supported_files(kb_path).await?; + info!("Found {} supported files to process", files.len()); + + for batch in files.chunks(BATCH_SIZE) { + let mut batch_results = HashMap::new(); + + for file_path in batch { + match self.process_document(file_path).await { + Ok(chunks) => { + if !chunks.is_empty() { + batch_results.insert(file_path.to_string_lossy().to_string(), chunks); + } + } + Err(e) => { + 
warn!("Failed to process document {}: {}", file_path.display(), e); + } + } + + // Yield control after each file + tokio::task::yield_now().await; + } + + // Merge batch results and clear batch memory + results.extend(batch_results); + + // Force memory cleanup between batches + if results.len() % (BATCH_SIZE * 2) == 0 { + results.shrink_to_fit(); + } + + info!("Processed batch, total documents: {}", results.len()); + } + info!("Completed processing {} documents in knowledge base", results.len()); Ok(results) } - fn process_directory_recursive<'a>( - &'a self, - dir: &'a Path, - results: &'a mut HashMap>, - ) -> std::pin::Pin> + Send + 'a>> { - Box::pin(async move { - let mut entries = tokio::fs::read_dir(dir).await?; + async fn collect_supported_files(&self, dir: &Path) -> Result> { + let mut files = Vec::new(); + self.collect_files_recursive(dir, &mut files, 0).await?; + Ok(files) + } - while let Some(entry) = entries.next_entry().await? { - let path = entry.path(); - let metadata = entry.metadata().await?; + async fn collect_files_recursive( + &self, + dir: &Path, + files: &mut Vec, + depth: usize, + ) -> Result<()> { + // Prevent excessive recursion + if depth > 10 { + warn!("Skipping deep directory to prevent stack overflow: {}", dir.display()); + return Ok(()); + } - if metadata.is_dir() { - self.process_directory_recursive(&path, results).await?; - } else if metadata.is_file() && DocumentFormat::from_extension(&path).is_some() { - match self.process_document(&path).await { - Ok(chunks) => { - let key = path.to_string_lossy().to_string(); - results.insert(key, chunks); - } - Err(e) => { - error!("Failed to process document {}: {}", path.display(), e); - } - } + let mut entries = tokio::fs::read_dir(dir).await?; + + while let Some(entry) = entries.next_entry().await? 
{ + let path = entry.path(); + let metadata = entry.metadata().await?; + + if metadata.is_dir() { + Box::pin(self.collect_files_recursive(&path, files, depth + 1)).await?; + } else if self.is_supported_file(&path) { + // Skip very large files + if metadata.len() > 50 * 1024 * 1024 { + warn!("Skipping large file: {} ({})", path.display(), metadata.len()); + continue; } + files.push(path); } + } - Ok(()) - }) + Ok(()) } } diff --git a/src/core/kb/embedding_generator.rs b/src/core/kb/embedding_generator.rs index a2222aab8..8b2a6fbd9 100644 --- a/src/core/kb/embedding_generator.rs +++ b/src/core/kb/embedding_generator.rs @@ -42,7 +42,7 @@ impl Default for EmbeddingConfig { dimensions: 384, batch_size: 16, timeout_seconds: 60, - max_concurrent_requests: 2, + max_concurrent_requests: 1, connect_timeout_seconds: 10, } } @@ -60,7 +60,7 @@ impl EmbeddingConfig { dimensions, batch_size: 16, timeout_seconds: 60, - max_concurrent_requests: 2, + max_concurrent_requests: 1, connect_timeout_seconds: 10, } } @@ -84,18 +84,48 @@ struct EmbeddingRequest { model: String, } +// OpenAI/Claude/OpenAI-compatible format #[derive(Debug, Deserialize)] -struct EmbeddingResponse { - data: Vec, +struct OpenAIEmbeddingResponse { + data: Vec, model: String, usage: Option, } #[derive(Debug, Deserialize)] -struct EmbeddingData { +struct OpenAIEmbeddingData { embedding: Vec, } +// llama.cpp format +#[derive(Debug, Deserialize)] +struct LlamaCppEmbeddingItem { + embedding: Vec>, +} + +// Hugging Face/SentenceTransformers format (simple array) +type HuggingFaceEmbeddingResponse = Vec>; + +// Generic embedding service format (object with embeddings key) +#[derive(Debug, Deserialize)] +struct GenericEmbeddingResponse { + embeddings: Vec>, + #[serde(default)] + model: Option, + #[serde(default)] + usage: Option, +} + +// Universal response wrapper - tries formats in order of likelihood +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum EmbeddingResponse { + OpenAI(OpenAIEmbeddingResponse), // Most 
common: OpenAI, Claude, etc. + LlamaCpp(Vec), // llama.cpp server + HuggingFace(HuggingFaceEmbeddingResponse), // Simple array format + Generic(GenericEmbeddingResponse), // Generic services +} + #[derive(Debug, Deserialize)] struct EmbeddingUsage { #[serde(default)] @@ -232,7 +262,7 @@ impl KbEmbeddingGenerator { chunks.len(), MemoryStats::format_bytes(start_mem.rss_bytes)); let mut results = Vec::with_capacity(chunks.len()); - let total_batches = (chunks.len() + self.config.batch_size - 1) / self.config.batch_size; + let total_batches = chunks.len().div_ceil(self.config.batch_size); for (batch_num, batch) in chunks.chunks(self.config.batch_size).enumerate() { let batch_start = MemoryStats::current(); @@ -304,11 +334,7 @@ impl KbEmbeddingGenerator { info!("[EMBEDDING] generate_batch_embeddings: {} texts, {} total chars", texts.len(), total_chars); - let truncated_texts: Vec = texts.into_iter() - .map(|t| if t.len() > 8192 { t[..8192].to_string() } else { t }) - .collect(); - - match self.generate_local_embeddings(&truncated_texts).await { + match self.generate_local_embeddings(&texts).await { Ok(embeddings) => { info!("[EMBEDDING] Local embeddings succeeded: {} vectors", embeddings.len()); Ok(embeddings) @@ -321,8 +347,13 @@ impl KbEmbeddingGenerator { } async fn generate_local_embeddings(&self, texts: &[String]) -> Result> { + // Apply token-aware truncation to each text before creating request + let truncated_texts: Vec = texts.iter() + .map(|text| crate::core::shared::utils::truncate_text_for_model(text, &self.config.embedding_model, 600)) + .collect(); + let request = EmbeddingRequest { - input: texts.to_vec(), + input: truncated_texts, model: self.config.embedding_model.clone(), }; @@ -334,7 +365,7 @@ impl KbEmbeddingGenerator { let response = self .client - .post(format!("{}/embeddings", self.config.embedding_url)) + .post(format!("{}/embedding", self.config.embedding_url)) .json(&request) .send() .await @@ -364,19 +395,67 @@ impl KbEmbeddingGenerator { } 
let embedding_response: EmbeddingResponse = serde_json::from_slice(&response_bytes) - .context("Failed to parse embedding response")?; + .with_context(|| { + let preview = std::str::from_utf8(&response_bytes) + .map(|s| if s.len() > 200 { &s[..200] } else { s }) + .unwrap_or(""); + format!("Failed to parse embedding response. Preview: {}", preview) + })?; drop(response_bytes); - let mut embeddings = Vec::with_capacity(embedding_response.data.len()); - for data in embedding_response.data { - embeddings.push(Embedding { - vector: data.embedding, - dimensions: self.config.dimensions, - model: embedding_response.model.clone(), - tokens_used: embedding_response.usage.as_ref().map(|u| u.total_tokens), - }); - } + let embeddings = match embedding_response { + EmbeddingResponse::OpenAI(openai_response) => { + let mut embeddings = Vec::with_capacity(openai_response.data.len()); + for data in openai_response.data { + embeddings.push(Embedding { + vector: data.embedding, + dimensions: self.config.dimensions, + model: openai_response.model.clone(), + tokens_used: openai_response.usage.as_ref().map(|u| u.total_tokens), + }); + } + embeddings + } + EmbeddingResponse::LlamaCpp(llama_response) => { + let mut embeddings = Vec::new(); + for item in llama_response { + for embedding_vec in item.embedding { + embeddings.push(Embedding { + vector: embedding_vec, + dimensions: self.config.dimensions, + model: self.config.embedding_model.clone(), + tokens_used: None, + }); + } + } + embeddings + } + EmbeddingResponse::HuggingFace(hf_response) => { + let mut embeddings = Vec::with_capacity(hf_response.len()); + for embedding_vec in hf_response { + embeddings.push(Embedding { + vector: embedding_vec, + dimensions: self.config.dimensions, + model: self.config.embedding_model.clone(), + tokens_used: None, + }); + } + embeddings + } + EmbeddingResponse::Generic(generic_response) => { + let mut embeddings = Vec::with_capacity(generic_response.embeddings.len()); + for embedding_vec in 
generic_response.embeddings { + embeddings.push(Embedding { + vector: embedding_vec, + dimensions: self.config.dimensions, + model: generic_response.model.clone().unwrap_or_else(|| self.config.embedding_model.clone()), + tokens_used: generic_response.usage.as_ref().map(|u| u.total_tokens), + }); + } + embeddings + } + }; Ok(embeddings) } diff --git a/src/core/kb/kb_indexer.rs b/src/core/kb/kb_indexer.rs index d13025c2f..d3d94ddb5 100644 --- a/src/core/kb/kb_indexer.rs +++ b/src/core/kb/kb_indexer.rs @@ -168,58 +168,53 @@ impl KbIndexer { let mut total_chunks = 0; let mut indexed_documents = 0; + const BATCH_SIZE: usize = 5; // Smaller batch size to prevent memory exhaustion + let mut batch_docs = Vec::with_capacity(BATCH_SIZE); - for (doc_path, chunks) in documents { + // Process documents in iterator to avoid keeping all in memory + let mut doc_iter = documents.into_iter(); + + while let Some((doc_path, chunks)) = doc_iter.next() { if chunks.is_empty() { debug!("[KB_INDEXER] Skipping document with no chunks: {}", doc_path); continue; } - let before_embed = MemoryStats::current(); - trace!( - "[KB_INDEXER] Processing document: {} ({} chunks) RSS={}", - doc_path, - chunks.len(), - MemoryStats::format_bytes(before_embed.rss_bytes) - ); + batch_docs.push((doc_path, chunks)); - // Re-validate embedding server is still available before generating embeddings - // This prevents memory from being held if server went down during document processing - if !is_embedding_server_ready() { - warn!("[KB_INDEXER] Embedding server became unavailable during indexing, aborting"); - return Err(anyhow::anyhow!( - "Embedding server became unavailable during KB indexing. 
Processed {} documents before failure.", - indexed_documents - )); - } - - trace!("[KB_INDEXER] Calling generate_embeddings for {} chunks...", chunks.len()); - let embeddings = match self - .embedding_generator - .generate_embeddings(&chunks) - .await - { - Ok(emb) => emb, - Err(e) => { - warn!("[KB_INDEXER] Embedding generation failed for {}: {}", doc_path, e); - // Continue with next document instead of failing entire batch - continue; + // Process batch when full + if batch_docs.len() >= BATCH_SIZE { + let (processed, chunks_count) = self.process_document_batch(&collection_name, &mut batch_docs).await?; + indexed_documents += processed; + total_chunks += chunks_count; + + // Clear batch and force memory cleanup + batch_docs.clear(); + batch_docs.shrink_to_fit(); + + // Yield control to prevent blocking + tokio::task::yield_now().await; + + // Memory pressure check - more aggressive + let current_mem = MemoryStats::current(); + if current_mem.rss_bytes > 1_500_000_000 { // 1.5GB threshold (reduced) + warn!("[KB_INDEXER] High memory usage detected: {}, forcing cleanup", + MemoryStats::format_bytes(current_mem.rss_bytes)); + + // Force garbage collection hint + std::hint::black_box(&batch_docs); + + // Add delay to allow memory cleanup + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; } - }; + } + } - let after_embed = MemoryStats::current(); - trace!("[KB_INDEXER] After generate_embeddings: {} embeddings, RSS={} (delta={})", - embeddings.len(), - MemoryStats::format_bytes(after_embed.rss_bytes), - MemoryStats::format_bytes(after_embed.rss_bytes.saturating_sub(before_embed.rss_bytes))); - log_jemalloc_stats(); - - let points = Self::create_qdrant_points(&doc_path, embeddings)?; - - self.upsert_points(&collection_name, points).await?; - - total_chunks += chunks.len(); - indexed_documents += 1; + // Process remaining documents in final batch + if !batch_docs.is_empty() { + let (processed, chunks_count) = 
self.process_document_batch(&collection_name, &mut batch_docs).await?; + indexed_documents += processed; + total_chunks += chunks_count; } self.update_collection_metadata(&collection_name, bot_name, kb_name, total_chunks)?; @@ -238,6 +233,74 @@ impl KbIndexer { }) } + async fn process_document_batch( + &self, + collection_name: &str, + batch_docs: &mut Vec<(String, Vec)>, + ) -> Result<(usize, usize)> { + let mut processed_count = 0; + let mut total_chunks = 0; + + // Process documents one by one to minimize memory usage + while let Some((doc_path, chunks)) = batch_docs.pop() { + let before_embed = MemoryStats::current(); + trace!( + "[KB_INDEXER] Processing document: {} ({} chunks) RSS={}", + doc_path, + chunks.len(), + MemoryStats::format_bytes(before_embed.rss_bytes) + ); + + // Re-validate embedding server is still available + if !is_embedding_server_ready() { + warn!("[KB_INDEXER] Embedding server became unavailable during indexing, aborting batch"); + return Err(anyhow::anyhow!( + "Embedding server became unavailable during KB indexing. 
Processed {} documents before failure.", + processed_count + )); + } + + // Process chunks in smaller sub-batches to prevent memory exhaustion + const CHUNK_BATCH_SIZE: usize = 20; // Process 20 chunks at a time + let mut chunk_batches = chunks.chunks(CHUNK_BATCH_SIZE); + + while let Some(chunk_batch) = chunk_batches.next() { + trace!("[KB_INDEXER] Processing chunk batch of {} chunks", chunk_batch.len()); + + let embeddings = match self + .embedding_generator + .generate_embeddings(chunk_batch) + .await + { + Ok(emb) => emb, + Err(e) => { + warn!("[KB_INDEXER] Embedding generation failed for {}: {}", doc_path, e); + break; // Skip to next document + } + }; + + let points = Self::create_qdrant_points(&doc_path, embeddings)?; + self.upsert_points(collection_name, points).await?; + + // Yield control between chunk batches + tokio::task::yield_now().await; + } + + let after_embed = MemoryStats::current(); + trace!("[KB_INDEXER] After processing document: RSS={} (delta={})", + MemoryStats::format_bytes(after_embed.rss_bytes), + MemoryStats::format_bytes(after_embed.rss_bytes.saturating_sub(before_embed.rss_bytes))); + + total_chunks += chunks.len(); + processed_count += 1; + + // Force memory cleanup after each document + std::hint::black_box(&chunks); + } + + Ok((processed_count, total_chunks)) + } + async fn ensure_collection_exists(&self, collection_name: &str) -> Result<()> { let check_url = format!("{}/collections/{}", self.qdrant_config.url, collection_name); diff --git a/src/core/kb/mod.rs b/src/core/kb/mod.rs index 26cce9e6e..9d12ff7e2 100644 --- a/src/core/kb/mod.rs +++ b/src/core/kb/mod.rs @@ -83,6 +83,15 @@ impl KnowledgeBaseManager { self.indexer.search(&collection_name, query, limit).await } + pub async fn search_collection( + &self, + collection_name: &str, + query: &str, + limit: usize, + ) -> Result> { + self.indexer.search(collection_name, query, limit).await + } + pub async fn process_document(&self, file_path: &Path) -> Result> { 
self.processor.process_document(file_path).await } diff --git a/src/core/kb/web_crawler.rs b/src/core/kb/web_crawler.rs index 3200f6af2..669dfbcd1 100644 --- a/src/core/kb/web_crawler.rs +++ b/src/core/kb/web_crawler.rs @@ -12,6 +12,7 @@ pub struct WebsiteCrawlConfig { pub max_pages: usize, pub crawl_delay_ms: u64, pub expires_policy: String, + pub refresh_policy: Option, pub last_crawled: Option>, pub next_crawl: Option>, } @@ -21,7 +22,10 @@ impl WebsiteCrawlConfig { let now = chrono::Utc::now(); self.last_crawled = Some(now); - let duration = match self.expires_policy.as_str() { + // Use refresh_policy if available, otherwise fall back to expires_policy + let policy = self.refresh_policy.as_ref().unwrap_or(&self.expires_policy); + + let duration = match policy.as_str() { "1h" => chrono::Duration::hours(1), "6h" => chrono::Duration::hours(6), "12h" => chrono::Duration::hours(12), @@ -117,6 +121,10 @@ impl WebCrawler { pub async fn crawl(&mut self) -> Result> { info!("Starting crawl of website: {}", self.config.url); + // Pre-allocate with reasonable capacity + self.pages.reserve(self.config.max_pages.min(1000)); + self.visited_urls.reserve(self.config.max_pages * 2); + self.crawl_recursive(&self.config.url.clone(), 0).await?; info!( @@ -125,21 +133,23 @@ impl WebCrawler { self.config.url ); - Ok(self.pages.clone()) + // Move pages out to avoid cloning + let pages = std::mem::take(&mut self.pages); + + // Clear collections to free memory immediately + self.visited_urls.clear(); + self.visited_urls.shrink_to_fit(); + + Ok(pages) } async fn crawl_recursive(&mut self, url: &str, depth: usize) -> Result<()> { + // Hard limits to prevent memory leaks if depth > self.config.max_depth { - trace!( - "Reached max depth {} for URL: {}", - self.config.max_depth, - url - ); return Ok(()); } if self.pages.len() >= self.config.max_pages { - trace!("Reached max pages limit: {}", self.config.max_pages); return Ok(()); } @@ -147,18 +157,31 @@ impl WebCrawler { return Ok(()); } + 
// Strict memory limit - prevent unbounded growth + if self.visited_urls.len() >= 1000 { + warn!("Visited URLs limit reached, stopping crawl to prevent memory leak"); + return Ok(()); + } + self.visited_urls.insert(url.to_string()); if !self.visited_urls.is_empty() { sleep(Duration::from_millis(self.config.crawl_delay_ms)).await; } - let response = match self.client.get(url).send().await { - Ok(resp) => resp, - Err(e) => { + let response = match tokio::time::timeout( + Duration::from_secs(30), // Add timeout to prevent hanging + self.client.get(url).send() + ).await { + Ok(Ok(resp)) => resp, + Ok(Err(e)) => { warn!("Failed to fetch {}: {}", url, e); return Ok(()); } + Err(_) => { + warn!("Timeout fetching {}", url); + return Ok(()); + } }; let content_type = response @@ -172,22 +195,54 @@ impl WebCrawler { return Ok(()); } - let html_text = match response.text().await { - Ok(text) => text, - Err(e) => { + let html_text = match tokio::time::timeout( + Duration::from_secs(15), // Reduced timeout + response.text() + ).await { + Ok(Ok(text)) => { + // Strict size limit to prevent memory issues + if text.len() > 500_000 { // 500KB limit + text.chars().take(500_000).collect() + } else { + text + } + } + Ok(Err(e)) => { warn!("Failed to read response from {}: {}", url, e); return Ok(()); } + Err(_) => { + warn!("Timeout reading response from {}", url); + return Ok(()); + } }; let page = Self::extract_page_content(&html_text, url); self.pages.push(page); + // Aggressive memory cleanup every 10 pages + if self.pages.len() % 10 == 0 { + self.pages.shrink_to_fit(); + self.visited_urls.shrink_to_fit(); + } + if depth < self.config.max_depth { let links = Self::extract_links(&html_text, url); - for link in links { + let max_links = std::cmp::min(20, links.len()); // Strict limit + + for link in links.into_iter().take(max_links) { if Self::is_same_domain(url, &link) { - Box::pin(self.crawl_recursive(&link, depth + 1)).await?; + // Use Box::pin to prevent stack overflow + if let 
Err(e) = Box::pin(self.crawl_recursive(&link, depth + 1)).await { + warn!("Error crawling {}: {}", link, e); + continue; + } + + // Check limits frequently to prevent runaway crawling + if self.pages.len() >= self.config.max_pages || + self.visited_urls.len() >= 1000 { + break; + } } } } @@ -196,35 +251,41 @@ impl WebCrawler { } fn extract_page_content(html: &str, url: &str) -> WebPage { - let mut text = html.to_string(); + // Use capacity hint to reduce allocations + let mut text = String::with_capacity(html.len() / 2); + text.push_str(html); + // Remove scripts more efficiently while let Some(start) = text.find("") { - text.replace_range(start..=end + 8, " "); + if let Some(end) = text[start..].find("") { + text.drain(start..start + end + 9); } else { break; } } + // Remove styles more efficiently while let Some(start) = text.find("") { - text.replace_range(start..=end + 7, " "); + if let Some(end) = text[start..].find("") { + text.drain(start..start + end + 8); } else { break; } } let title = if let Some(title_start) = text.find("") { - text.find("") - .map(|title_end| text[title_start + 7..title_end].to_string()) + text[title_start + 7..] 
+ .find("") + .map(|title_end| text[title_start + 7..title_start + 7 + title_end].to_string()) } else { None }; + // Remove HTML tags more efficiently while let Some(start) = text.find('<') { - if let Some(end) = text.find('>') { - if end > start { - text.replace_range(start..=end, " "); + if let Some(end) = text[start..].find('>') { + if end > 0 { + text.drain(start..start + end + 1); } else { break; } @@ -233,7 +294,12 @@ impl WebCrawler { } } - let content = text.split_whitespace().collect::>().join(" "); + // Clean whitespace without excessive allocations + let content = text + .split_whitespace() + .filter(|s| !s.is_empty()) + .collect::>() + .join(" "); WebPage { url: url.to_string(), diff --git a/src/core/mod.rs b/src/core/mod.rs index 6880a07e8..d3557f48b 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -4,6 +4,7 @@ pub mod bootstrap; pub mod bot; pub mod bot_database; pub mod config; +pub mod config_reload; #[cfg(feature = "directory")] pub mod directory; pub mod dns; diff --git a/src/core/organization_invitations.rs b/src/core/organization_invitations.rs index c4db43cbe..09b2b47f5 100644 --- a/src/core/organization_invitations.rs +++ b/src/core/organization_invitations.rs @@ -879,19 +879,19 @@ mod tests { let org_id = Uuid::new_v4(); let invited_by = Uuid::new_v4(); - let result = service - .create_invitation( - org_id, - "Test Org", - "test@example.com", - InvitationRole::Member, - vec![], - invited_by, - "Admin", - None, - 7, - ) - .await; + let params = crate::core::organization_invitations::CreateInvitationParams { + organization_id: org_id, + organization_name: "Test Org", + email: "test@example.com".to_string(), + role: "Member".to_string(), + groups: vec![], + invited_by: invited_by, + invited_by_name: Some("Admin".to_string()), + message: None, + expires_in_days: Some(7), + }; + + let result = service.create_invitation(params).await; assert!(result.is_ok()); let invitation = result.unwrap(); @@ -905,35 +905,29 @@ mod tests { let org_id = 
Uuid::new_v4(); let invited_by = Uuid::new_v4(); - let first = service - .create_invitation( - org_id, - "Test Org", - "test@example.com", - InvitationRole::Member, - vec![], - invited_by, - "Admin", - None, - 7, - ) - .await; - assert!(first.is_ok()); + let params = crate::core::organization_invitations::CreateInvitationParams { + organization_id: org_id, + organization_name: "Test Org", + email: "test@example.com".to_string(), + role: "Member".to_string(), + groups: vec![], + invited_by: invited_by, + invited_by_name: Some("Admin".to_string()), + message: None, + expires_in_days: Some(7), + }; - let second = service - .create_invitation( - org_id, - "Test Org", - "test@example.com", - InvitationRole::Member, - vec![], - invited_by, - "Admin", - None, - 7, - ) - .await; - assert!(second.is_err()); + let first_result = service.create_invitation(params.clone()).await; + + assert!(first_result.is_ok()); + + let second_result = service.create_invitation(params).await; + + assert!(second_result.is_err()); + assert_eq!( + second_result.unwrap_err(), + "An invitation already exists for this email" + ); } #[tokio::test] @@ -943,20 +937,18 @@ mod tests { let invited_by = Uuid::new_v4(); let user_id = Uuid::new_v4(); - let invitation = service - .create_invitation( - org_id, - "Test Org", - "accept@example.com", - InvitationRole::Member, - vec![], - invited_by, - "Admin", - None, - 7, - ) - .await - .unwrap(); + let params = crate::core::organization_invitations::CreateInvitationParams { + organization_id: org_id, + organization_name: "Test Org", + email: "test@example.com".to_string(), + role: "Member".to_string(), + groups: vec![], + invited_by, + invited_by_name: Some("Admin".to_string()), + message: None, + expires_in_days: Some(7), + }; + let invitation = service.create_invitation(params).await.unwrap(); let result = service.accept_invitation(&invitation.token, user_id).await; assert!(result.is_ok()); diff --git a/src/core/package_manager/cache.rs 
b/src/core/package_manager/cache.rs index 70c79dd82..9b752f121 100644 --- a/src/core/package_manager/cache.rs +++ b/src/core/package_manager/cache.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use std::fs; use std::path::{Path, PathBuf}; -const DEFAULT_CACHE_DIR: &str = "botserver-installers"; +const DEFAULT_CACHE_DIR: &str = "./botserver-installers"; const CONFIG_FILE: &str = "3rdparty.toml"; diff --git a/src/core/package_manager/installer.rs b/src/core/package_manager/installer.rs index 1c17a3425..5423ff11c 100644 --- a/src/core/package_manager/installer.rs +++ b/src/core/package_manager/installer.rs @@ -77,7 +77,7 @@ fn get_llama_cpp_url() -> Option { } info!("Using standard Ubuntu x64 build (CPU)"); - return get_component_url("llm"); + get_component_url("llm") } #[cfg(target_arch = "s390x")] @@ -363,7 +363,7 @@ impl PackageManager { "https://huggingface.co/CompendiumLabs/bge-small-en-v1.5-gguf/resolve/main/bge-small-en-v1.5-f32.gguf".to_string(), ], - exec_cmd: "nohup {{BIN_PATH}}/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embedding > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(), + exec_cmd: "nohup {{BIN_PATH}}/build/bin/llama-server --port 8081 --ssl-key-file {{CONF_PATH}}/system/certificates/llm/server.key --ssl-cert-file {{CONF_PATH}}/system/certificates/llm/server.crt -m {{DATA_PATH}}/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf --ubatch-size 512 > {{LOGS_PATH}}/llm.log 2>&1 & nohup {{BIN_PATH}}/build/bin/llama-server --port 8082 --ssl-key-file {{CONF_PATH}}/system/certificates/embedding/server.key --ssl-cert-file 
{{CONF_PATH}}/system/certificates/embedding/server.crt -m {{DATA_PATH}}/bge-small-en-v1.5-f32.gguf --embedding --ubatch-size 512 > {{LOGS_PATH}}/embedding.log 2>&1 &".to_string(), check_cmd: "curl -f -k --connect-timeout 2 -m 5 https://localhost:8081/health >/dev/null 2>&1 && curl -f -k --connect-timeout 2 -m 5 https://localhost:8082/health >/dev/null 2>&1".to_string(), }, ); diff --git a/src/core/shared/schema/core.rs b/src/core/shared/schema/core.rs index 39fe16fd1..14bd2d620 100644 --- a/src/core/shared/schema/core.rs +++ b/src/core/shared/schema/core.rs @@ -7,6 +7,24 @@ diesel::table! { } } +diesel::table! { + organization_invitations (id) { + id -> Uuid, + org_id -> Uuid, + email -> Varchar, + role -> Varchar, + status -> Varchar, + message -> Nullable, + invited_by -> Uuid, + token -> Nullable, + created_at -> Timestamptz, + updated_at -> Nullable, + expires_at -> Nullable, + accepted_at -> Nullable, + accepted_by -> Nullable, + } +} + diesel::table! { bots (id) { id -> Uuid, @@ -261,6 +279,8 @@ diesel::joinable!(rbac_user_groups -> users (user_id)); diesel::joinable!(rbac_user_groups -> rbac_groups (group_id)); diesel::joinable!(rbac_group_roles -> rbac_groups (group_id)); diesel::joinable!(rbac_group_roles -> rbac_roles (role_id)); +diesel::joinable!(website_crawls -> bots (bot_id)); +diesel::joinable!(organization_invitations -> organizations (org_id)); diesel::table! { user_preferences (id) { @@ -287,6 +307,25 @@ diesel::table! { } } +diesel::table! 
{ + website_crawls (id) { + id -> Uuid, + bot_id -> Uuid, + url -> Text, + last_crawled -> Nullable, + next_crawl -> Nullable, + expires_policy -> Varchar, + max_depth -> Nullable, + max_pages -> Nullable, + crawl_status -> Nullable, + pages_crawled -> Nullable, + error_message -> Nullable, + created_at -> Nullable, + updated_at -> Nullable, + refresh_policy -> Nullable, + } +} + diesel::allow_tables_to_appear_in_same_query!( rbac_roles, rbac_groups, @@ -296,4 +335,8 @@ diesel::allow_tables_to_appear_in_same_query!( rbac_user_groups, rbac_group_roles, users, + website_crawls, + bots, + organizations, + organization_invitations, ); diff --git a/src/core/shared/state.rs b/src/core/shared/state.rs index 7708df96a..c4b011882 100644 --- a/src/core/shared/state.rs +++ b/src/core/shared/state.rs @@ -6,21 +6,21 @@ use crate::core::config::AppConfig; use crate::core::kb::KnowledgeBaseManager; use crate::core::session::SessionManager; use crate::core::shared::analytics::MetricsCollector; -#[cfg(feature = "project")] -use crate::project::ProjectService; -#[cfg(feature = "compliance")] -use crate::legal::LegalService; -use crate::security::auth_provider::AuthProviderRegistry; -use crate::security::jwt::JwtManager; -use crate::security::rbac_middleware::RbacManager; #[cfg(all(test, feature = "directory"))] use crate::core::shared::test_utils::create_mock_auth_service; #[cfg(all(test, feature = "llm"))] use crate::core::shared::test_utils::MockLLMProvider; #[cfg(feature = "directory")] use crate::directory::AuthService; +#[cfg(feature = "compliance")] +use crate::legal::LegalService; #[cfg(feature = "llm")] -use crate::llm::LLMProvider; +use crate::llm::{DynamicLLMProvider, LLMProvider}; +#[cfg(feature = "project")] +use crate::project::ProjectService; +use crate::security::auth_provider::AuthProviderRegistry; +use crate::security::jwt::JwtManager; +use crate::security::rbac_middleware::RbacManager; use crate::shared::models::BotResponse; use crate::shared::utils::DbPool; 
#[cfg(feature = "tasks")] @@ -174,7 +174,11 @@ pub struct TaskProgressEvent { } impl TaskProgressEvent { - pub fn new(task_id: impl Into, step: impl Into, message: impl Into) -> Self { + pub fn new( + task_id: impl Into, + step: impl Into, + message: impl Into, + ) -> Self { Self { event_type: "task_progress".to_string(), task_id: task_id.into(), @@ -212,7 +216,11 @@ impl TaskProgressEvent { pub fn with_progress(mut self, current: u8, total: u8) -> Self { self.current_step = current; self.total_steps = total; - self.progress = if total > 0 { ((current as u16 * 100) / total as u16) as u8 } else { 0 }; + self.progress = if total > 0 { + ((current as u16 * 100) / total as u16) as u8 + } else { + 0 + }; self } @@ -248,7 +256,11 @@ impl TaskProgressEvent { self } - pub fn started(task_id: impl Into, message: impl Into, total_steps: u8) -> Self { + pub fn started( + task_id: impl Into, + message: impl Into, + total_steps: u8, + ) -> Self { Self { event_type: "task_started".to_string(), task_id: task_id.into(), @@ -345,8 +357,6 @@ pub struct BillingAlertNotification { pub struct AppState { #[cfg(feature = "drive")] pub drive: Option, - #[cfg(feature = "drive")] - pub s3_client: Option, #[cfg(feature = "cache")] pub cache: Option>, pub bucket_name: String, @@ -360,6 +370,8 @@ pub struct AppState { pub task_scheduler: Option>, #[cfg(feature = "llm")] pub llm_provider: Arc, + #[cfg(feature = "llm")] + pub dynamic_llm_provider: Option>, #[cfg(feature = "directory")] pub auth_service: Arc>, pub channels: Arc>>>, @@ -389,8 +401,6 @@ impl Clone for AppState { Self { #[cfg(feature = "drive")] drive: self.drive.clone(), - #[cfg(feature = "drive")] - s3_client: self.s3_client.clone(), bucket_name: self.bucket_name.clone(), config: self.config.clone(), conn: self.conn.clone(), @@ -404,6 +414,8 @@ impl Clone for AppState { task_scheduler: self.task_scheduler.clone(), #[cfg(feature = "llm")] llm_provider: Arc::clone(&self.llm_provider), + #[cfg(feature = "llm")] + 
dynamic_llm_provider: self.dynamic_llm_provider.clone(), #[cfg(feature = "directory")] auth_service: Arc::clone(&self.auth_service), #[cfg(any(feature = "research", feature = "llm"))] @@ -437,9 +449,6 @@ impl std::fmt::Debug for AppState { #[cfg(feature = "drive")] debug.field("drive", &self.drive.is_some()); - #[cfg(feature = "drive")] - debug.field("s3_client", &self.s3_client.is_some()); - #[cfg(feature = "cache")] debug.field("cache", &self.cache.is_some()); @@ -455,7 +464,6 @@ impl std::fmt::Debug for AppState { #[cfg(any(feature = "research", feature = "llm"))] debug.field("kb_manager", &self.kb_manager.is_some()); - #[cfg(feature = "tasks")] debug.field("task_scheduler", &self.task_scheduler.is_some()); @@ -480,9 +488,15 @@ impl std::fmt::Debug for AppState { debug .field("extensions", &self.extensions) .field("attendant_broadcast", &self.attendant_broadcast.is_some()) - .field("task_progress_broadcast", &self.task_progress_broadcast.is_some()) + .field( + "task_progress_broadcast", + &self.task_progress_broadcast.is_some(), + ) .field("jwt_manager", &self.jwt_manager.is_some()) - .field("auth_provider_registry", &self.auth_provider_registry.is_some()) + .field( + "auth_provider_registry", + &self.auth_provider_registry.is_some(), + ) .field("rbac_manager", &self.rbac_manager.is_some()) .finish() } @@ -498,7 +512,10 @@ impl AppState { ); if let Some(tx) = &self.task_progress_broadcast { let receiver_count = tx.receiver_count(); - log::info!("[TASK_PROGRESS] Broadcast channel has {} receivers", receiver_count); + log::info!( + "[TASK_PROGRESS] Broadcast channel has {} receivers", + receiver_count + ); match tx.send(event) { Ok(_) => { log::info!("[TASK_PROGRESS] Event sent successfully"); @@ -512,16 +529,8 @@ impl AppState { } } - pub fn emit_progress( - &self, - task_id: &str, - step: &str, - message: &str, - current: u8, - total: u8, - ) { - let event = TaskProgressEvent::new(task_id, step, message) - .with_progress(current, total); + pub fn 
emit_progress(&self, task_id: &str, step: &str, message: &str, current: u8, total: u8) { + let event = TaskProgressEvent::new(task_id, step, message).with_progress(current, total); self.broadcast_task_progress(event); } @@ -566,8 +575,7 @@ impl AppState { } pub fn emit_task_error(&self, task_id: &str, step: &str, error: &str) { - let event = TaskProgressEvent::new(task_id, step, "Task failed") - .with_error(error); + let event = TaskProgressEvent::new(task_id, step, "Task failed").with_error(error); self.broadcast_task_progress(event); } @@ -605,8 +613,6 @@ impl Default for AppState { Self { #[cfg(feature = "drive")] drive: None, - #[cfg(feature = "drive")] - s3_client: None, #[cfg(feature = "cache")] cache: None, bucket_name: "test-bucket".to_string(), @@ -620,6 +626,8 @@ impl Default for AppState { task_scheduler: None, #[cfg(all(test, feature = "llm"))] llm_provider: Arc::new(MockLLMProvider::new()), + #[cfg(feature = "llm")] + dynamic_llm_provider: None, #[cfg(feature = "directory")] auth_service: Arc::new(tokio::sync::Mutex::new(create_mock_auth_service())), channels: Arc::new(tokio::sync::Mutex::new(HashMap::new())), diff --git a/src/core/shared/test_utils.rs b/src/core/shared/test_utils.rs index 5e82e501c..38624c231 100644 --- a/src/core/shared/test_utils.rs +++ b/src/core/shared/test_utils.rs @@ -67,7 +67,7 @@ impl LLMProvider for MockLLMProvider { &self, _prompt: &str, _config: &Value, - tx: mpsc::Sender, + tx: tokio::sync::mpsc::Sender, _model: &str, _key: &str, ) -> Result<(), Box> { @@ -210,6 +210,8 @@ impl TestAppStateBuilder { task_scheduler: None, #[cfg(feature = "llm")] llm_provider: Arc::new(MockLLMProvider::new()), + #[cfg(feature = "llm")] + dynamic_llm_provider: None, #[cfg(feature = "directory")] auth_service: Arc::new(tokio::sync::Mutex::new(create_mock_auth_service())), channels: Arc::new(tokio::sync::Mutex::new(HashMap::new())), @@ -226,7 +228,9 @@ impl TestAppStateBuilder { billing_alert_broadcast: None, task_manifests: 
Arc::new(std::sync::RwLock::new(HashMap::new())), #[cfg(feature = "project")] - project_service: Arc::new(tokio::sync::RwLock::new(crate::project::ProjectService::new())), + project_service: Arc::new(tokio::sync::RwLock::new( + crate::project::ProjectService::new(), + )), #[cfg(feature = "compliance")] legal_service: Arc::new(tokio::sync::RwLock::new(crate::legal::LegalService::new())), jwt_manager: None, diff --git a/src/core/shared/utils.rs b/src/core/shared/utils.rs index 3c7b529fe..1d9a430cf 100644 --- a/src/core/shared/utils.rs +++ b/src/core/shared/utils.rs @@ -337,313 +337,15 @@ pub fn run_migrations_on_conn( ) -> Result<(), Box> { use diesel_migrations::{embed_migrations, EmbeddedMigrations, MigrationHarness}; - // Core migrations (Always run) - const CORE_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/core"); - conn.run_pending_migrations(CORE_MIGRATIONS).map_err(|e| { + // Flat migrations with version-ordinal-feature naming + const FLAT_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations"); + conn.run_pending_migrations(FLAT_MIGRATIONS).map_err(|e| { Box::new(std::io::Error::other(format!( - "Core migration error: {}", + "Migration error: {}", e ))) as Box })?; - // Calendar - #[cfg(feature = "calendar")] - { - const CALENDAR_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/calendar"); - conn.run_pending_migrations(CALENDAR_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Calendar migration error: {}", - e - ))) as Box - })?; - } - - // People (CRM) - #[cfg(feature = "people")] - { - const PEOPLE_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/people"); - conn.run_pending_migrations(PEOPLE_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "People migration error: {}", - e - ))) as Box - })?; - } - - // Mail - #[cfg(feature = "mail")] - { - const MAIL_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/mail"); - 
conn.run_pending_migrations(MAIL_MIGRATIONS).map_err(|e| { - Box::new(std::io::Error::other(format!( - "Mail migration error: {}", - e - ))) as Box - })?; - } - - // Tasks - #[cfg(feature = "tasks")] - { - const TASKS_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/tasks"); - conn.run_pending_migrations(TASKS_MIGRATIONS).map_err(|e| { - Box::new(std::io::Error::other(format!( - "Tasks migration error: {}", - e - ))) as Box - })?; - } - - // Drive - #[cfg(feature = "drive")] - { - const DRIVE_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/drive"); - conn.run_pending_migrations(DRIVE_MIGRATIONS).map_err(|e| { - Box::new(std::io::Error::other(format!( - "Drive migration error: {}", - e - ))) as Box - })?; - } - - // Automation - #[cfg(feature = "automation")] - { - const AUTOMATION_MIGRATIONS: EmbeddedMigrations = - embed_migrations!("migrations/automation"); - conn.run_pending_migrations(AUTOMATION_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Automation migration error: {}", - e - ))) as Box - })?; - } - - // Paper - #[cfg(feature = "paper")] - { - const PAPER_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/paper"); - conn.run_pending_migrations(PAPER_MIGRATIONS).map_err(|e| { - Box::new(std::io::Error::other(format!( - "Paper migration error: {}", - e - ))) as Box - })?; - } - - // Designer - #[cfg(feature = "designer")] - { - const DESIGNER_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/designer"); - conn.run_pending_migrations(DESIGNER_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Designer migration error: {}", - e - ))) as Box - })?; - } - - // Learn - #[cfg(feature = "learn")] - { - const LEARN_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/learn"); - conn.run_pending_migrations(LEARN_MIGRATIONS).map_err(|e| { - Box::new(std::io::Error::other(format!( - "Learn migration error: {}", - e - ))) as Box - })?; - } - - // Video - 
#[cfg(feature = "video")] - { - const VIDEO_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/video"); - conn.run_pending_migrations(VIDEO_MIGRATIONS).map_err(|e| { - Box::new(std::io::Error::other(format!( - "Video migration error: {}", - e - ))) as Box - })?; - } - - // LLM - #[cfg(feature = "llm")] - { - const LLM_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/llm"); - conn.run_pending_migrations(LLM_MIGRATIONS).map_err(|e| { - Box::new(std::io::Error::other(format!("LLM migration error: {}", e))) - as Box - })?; - } - - // Products - #[cfg(feature = "billing")] - { - const PRODUCTS_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/products"); - conn.run_pending_migrations(PRODUCTS_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Products migration error: {}", - e - ))) as Box - })?; - } - - // Billing - const BILLING_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/billing"); - conn.run_pending_migrations(BILLING_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Billing migration error: {}", - e - ))) as Box - })?; - - // Attendant - #[cfg(feature = "attendant")] - { - const ATTENDANT_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/attendant"); - conn.run_pending_migrations(ATTENDANT_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Attendant migration error: {}", - e - ))) as Box - })?; - } - - // Analytics - #[cfg(feature = "analytics")] - { - const ANALYTICS_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/analytics"); - conn.run_pending_migrations(ANALYTICS_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Analytics migration error: {}", - e - ))) as Box - })?; - } - - // Dashboards - #[cfg(feature = "dashboards")] - { - const DASHBOARDS_MIGRATIONS: EmbeddedMigrations = - embed_migrations!("migrations/dashboards"); - conn.run_pending_migrations(DASHBOARDS_MIGRATIONS) - 
.map_err(|e| { - Box::new(std::io::Error::other(format!( - "Dashboards migration error: {}", - e - ))) as Box - })?; - } - - // Meet - #[cfg(feature = "meet")] - { - const MEET_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/meet"); - conn.run_pending_migrations(MEET_MIGRATIONS).map_err(|e| { - Box::new(std::io::Error::other(format!( - "Meet migration error: {}", - e - ))) as Box - })?; - } - - // Tickets (Feedback) - const TICKETS_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/tickets"); - conn.run_pending_migrations(TICKETS_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Tickets migration error: {}", - e - ))) as Box - })?; - - // Compliance - #[cfg(feature = "compliance")] - { - const COMPLIANCE_MIGRATIONS: EmbeddedMigrations = - embed_migrations!("migrations/compliance"); - conn.run_pending_migrations(COMPLIANCE_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Compliance migration error: {}", - e - ))) as Box - })?; - } - - // Canvas - #[cfg(feature = "canvas")] - { - const CANVAS_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/canvas"); - conn.run_pending_migrations(CANVAS_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Canvas migration error: {}", - e - ))) as Box - })?; - } - - // Social - #[cfg(feature = "social")] - { - const SOCIAL_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/social"); - conn.run_pending_migrations(SOCIAL_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Social migration error: {}", - e - ))) as Box - })?; - } - - // Workspaces - #[cfg(feature = "workspaces")] - { - const WORKSPACE_MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations/workspaces"); - conn.run_pending_migrations(WORKSPACE_MIGRATIONS) - .map_err(|e| { - Box::new(std::io::Error::other(format!( - "Workspace migration error: {}", - e - ))) as Box - })?; - } - - // Goals - #[cfg(feature = "goals")] - { - 
/// Truncates `text` so it fits an estimated budget of `max_tokens` tokens for `model`.
///
/// The token budget is converted to a byte budget with `estimate_chars_per_token`
/// (the comparison uses `str::len`, i.e. bytes, so multi-byte text is truncated
/// conservatively). Text that already fits is returned unchanged. Otherwise the
/// text is cut at the last space inside the budget when there is one, and at the
/// budget itself when there is none.
///
/// The cut is always moved back to a UTF-8 character boundary first, so non-ASCII
/// input never panics, and the budget multiplication saturates instead of
/// overflowing for very large `max_tokens`.
pub fn truncate_text_for_model(text: &str, model: &str, max_tokens: usize) -> String {
    let chars_per_token = estimate_chars_per_token(model);
    let max_bytes = max_tokens.saturating_mul(chars_per_token);

    if text.len() <= max_bytes {
        return text.to_string();
    }

    // Here `max_bytes < text.len()`. Slicing at an arbitrary byte offset panics when
    // it falls inside a multi-byte character, so step back to the nearest boundary.
    // Offset 0 is always a boundary, which guarantees termination.
    let mut cut = max_bytes;
    while !text.is_char_boundary(cut) {
        cut -= 1;
    }

    let truncated = &text[..cut];
    // Prefer ending on a word boundary so the last word is not chopped in half.
    match truncated.rfind(' ') {
        Some(last_space) => truncated[..last_space].to_string(),
        None => truncated.to_string(),
    }
}

/// Returns a rough characters-per-token ratio for `model`.
///
/// Names containing `llama` or `mistral` use 3; everything else (including names
/// containing `gpt`, `claude`, `bert` or `mpnet`) uses 4. A `gpt`/`claude` match
/// takes precedence over a `llama`/`mistral` match. Matching is a case-sensitive
/// substring test on the name as given.
fn estimate_chars_per_token(model: &str) -> usize {
    let gpt_like = model.contains("gpt") || model.contains("claude");
    let llama_like = model.contains("llama") || model.contains("mistral");

    if llama_like && !gpt_like {
        3
    } else {
        4
    }
}
/// Canonicalises a configuration value for comparison.
///
/// Surrounding whitespace is removed. A value that is then empty, or that equals
/// `none` ignoring ASCII case, becomes the empty string; any other value is
/// returned trimmed.
fn normalize_config_value(value: &str) -> String {
    match value.trim() {
        "" => String::new(),
        placeholder if placeholder.eq_ignore_ascii_case("none") => String::new(),
        meaningful => meaningful.to_owned(),
    }
}
serde_json::from_str::>(&content) { + Ok(states) => { + let mut file_states = self.file_states.write().await; + let count = states.len(); + *file_states = states; + info!( + "[DRIVE_MONITOR] Loaded {} file states from disk for bot {}", + count, + self.bot_id + ); + } + Err(e) => { + warn!( + "[DRIVE_MONITOR] Failed to parse file states from {}: {}. Starting with empty state.", + path.display(), + e + ); + } + } + } + Err(e) => { + warn!( + "[DRIVE_MONITOR] Failed to read file states from {}: {}. Starting with empty state.", + path.display(), + e + ); + } + } + } else { + debug!( + "[DRIVE_MONITOR] No existing file states found at {} for bot {}. Starting fresh.", + path.display(), + self.bot_id + ); + } + Ok(()) + } + + /// Save file states to disk after updates + async fn save_file_states(&self) -> Result<(), Box> { + let path = self.file_state_path(); + + if let Some(parent) = path.parent() { + if let Err(e) = tokio_fs::create_dir_all(parent).await { + warn!( + "[DRIVE_MONITOR] Failed to create directory for file states: {} - {}", + parent.display(), + e + ); + } + } + + let file_states = self.file_states.read().await; + match serde_json::to_string_pretty(&*file_states) { + Ok(content) => { + if let Err(e) = tokio_fs::write(&path, content).await { + warn!( + "[DRIVE_MONITOR] Failed to save file states to {}: {}", + path.display(), + e + ); + } else { + debug!( + "[DRIVE_MONITOR] Saved {} file states to disk for bot {}", + file_states.len(), + self.bot_id + ); + } + } + Err(e) => { + warn!( + "[DRIVE_MONITOR] Failed to serialize file states: {}", + e + ); + } + } + Ok(()) + } + async fn check_drive_health(&self) -> bool { let Some(client) = &self.state.drive else { return false; @@ -100,12 +203,31 @@ impl DriveMonitor { pub async fn start_monitoring(&self) -> Result<(), Box> { trace!("start_monitoring ENTER"); let start_mem = MemoryStats::current(); - trace!("[DRIVE_MONITOR] Starting DriveMonitor for bot {}, RSS={}", - self.bot_id, 
MemoryStats::format_bytes(start_mem.rss_bytes)); + trace!( + "[DRIVE_MONITOR] Starting DriveMonitor for bot {}, RSS={}", + self.bot_id, + MemoryStats::format_bytes(start_mem.rss_bytes) + ); + + // Check if already processing to prevent duplicate monitoring + if self.is_processing.load(std::sync::atomic::Ordering::Acquire) { + warn!("[DRIVE_MONITOR] Already processing for bot {}, skipping", self.bot_id); + return Ok(()); + } + + // Load file states from disk to avoid reprocessing unchanged files + if let Err(e) = self.load_file_states().await { + warn!( + "[DRIVE_MONITOR] Failed to load file states for bot {}: {}", + self.bot_id, e + ); + } if !self.check_drive_health().await { - warn!("[DRIVE_MONITOR] S3/MinIO not available for bucket {}, will retry with backoff", - self.bucket_name); + warn!( + "[DRIVE_MONITOR] S3/MinIO not available for bucket {}, will retry with backoff", + self.bucket_name + ); } self.is_processing @@ -114,33 +236,57 @@ impl DriveMonitor { trace!("start_monitoring: calling check_for_changes..."); info!("[DRIVE_MONITOR] Calling initial check_for_changes..."); - match self.check_for_changes().await { - Ok(_) => { + match tokio::time::timeout(Duration::from_secs(300), self.check_for_changes()).await { + Ok(Ok(_)) => { self.consecutive_failures.store(0, Ordering::Relaxed); } - Err(e) => { + Ok(Err(e)) => { warn!("[DRIVE_MONITOR] Initial check failed (will retry): {}", e); self.consecutive_failures.fetch_add(1, Ordering::Relaxed); } + Err(_) => { + error!("[DRIVE_MONITOR] Initial check timed out after 5 minutes"); + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + } } trace!("start_monitoring: check_for_changes returned"); let after_initial = MemoryStats::current(); - trace!("[DRIVE_MONITOR] After initial check, RSS={} (delta={})", - MemoryStats::format_bytes(after_initial.rss_bytes), - MemoryStats::format_bytes(after_initial.rss_bytes.saturating_sub(start_mem.rss_bytes))); + trace!( + "[DRIVE_MONITOR] After initial check, RSS={} 
(delta={})", + MemoryStats::format_bytes(after_initial.rss_bytes), + MemoryStats::format_bytes(after_initial.rss_bytes.saturating_sub(start_mem.rss_bytes)) + ); - let self_clone = Arc::new(self.clone()); + // Force enable periodic monitoring regardless of initial check result + self.is_processing.store(true, std::sync::atomic::Ordering::SeqCst); + info!("[DRIVE_MONITOR] Forced is_processing to true for periodic monitoring"); + + let self_clone = self.clone(); // Don't wrap in Arc::new - that creates a copy tokio::spawn(async move { + let mut consecutive_processing_failures = 0; + info!("[DRIVE_MONITOR] Starting periodic monitoring loop for bot {}", self_clone.bot_id); + + let is_processing_state = self_clone.is_processing.load(std::sync::atomic::Ordering::SeqCst); + info!("[DRIVE_MONITOR] is_processing state at loop start: {} for bot {}", is_processing_state, self_clone.bot_id); + while self_clone .is_processing .load(std::sync::atomic::Ordering::SeqCst) { - let backoff = self_clone.calculate_backoff(); - tokio::time::sleep(backoff).await; + debug!("[DRIVE_MONITOR] Inside monitoring loop for bot {}", self_clone.bot_id); + debug!("[DRIVE_MONITOR] Periodic check starting for bot {}", self_clone.bot_id); + // Use fixed 10 second interval instead of backoff calculation + tokio::time::sleep(Duration::from_secs(10)).await; - if !self_clone.check_drive_health().await { - let failures = self_clone.consecutive_failures.fetch_add(1, Ordering::Relaxed) + 1; + debug!("[DRIVE_MONITOR] Checking drive health for bot {}", self_clone.bot_id); + // Skip drive health check - just proceed with monitoring + // if !self_clone.check_drive_health().await { + if false { + let failures = self_clone + .consecutive_failures + .fetch_add(1, Ordering::Relaxed) + + 1; if failures % 10 == 1 { warn!("[DRIVE_MONITOR] S3/MinIO unavailable for bucket {} (failures: {}), backing off to {:?}", self_clone.bucket_name, failures, self_clone.calculate_backoff()); @@ -148,20 +294,47 @@ impl DriveMonitor { 
continue; } - match self_clone.check_for_changes().await { - Ok(_) => { - let prev_failures = self_clone.consecutive_failures.swap(0, Ordering::Relaxed); + debug!("[DRIVE_MONITOR] About to call check_for_changes for bot {}", self_clone.bot_id); + // Add timeout to prevent hanging + match tokio::time::timeout(Duration::from_secs(300), self_clone.check_for_changes()).await { + Ok(Ok(_)) => { + let prev_failures = + self_clone.consecutive_failures.swap(0, Ordering::Relaxed); + consecutive_processing_failures = 0; if prev_failures > 0 { info!("[DRIVE_MONITOR] S3/MinIO recovered for bucket {} after {} failures", self_clone.bucket_name, prev_failures); } } - Err(e) => { - self_clone.consecutive_failures.fetch_add(1, Ordering::Relaxed); + Ok(Err(e)) => { + self_clone + .consecutive_failures + .fetch_add(1, Ordering::Relaxed); + consecutive_processing_failures += 1; error!("Error during sync for bot {}: {}", self_clone.bot_id, e); + + // If too many consecutive failures, stop processing temporarily + if consecutive_processing_failures > 10 { + error!("[DRIVE_MONITOR] Too many consecutive failures ({}), stopping processing for bot {}", + consecutive_processing_failures, self_clone.bot_id); + self_clone.is_processing.store(false, std::sync::atomic::Ordering::SeqCst); + break; + } + } + Err(_) => { + error!("[DRIVE_MONITOR] check_for_changes timed out for bot {}", self_clone.bot_id); + consecutive_processing_failures += 1; + + if consecutive_processing_failures > 5 { + error!("[DRIVE_MONITOR] Too many timeouts, stopping processing for bot {}", self_clone.bot_id); + self_clone.is_processing.store(false, std::sync::atomic::Ordering::SeqCst); + break; + } } } } + + info!("[DRIVE_MONITOR] Monitoring loop ended for bot {}", self_clone.bot_id); }); info!("DriveMonitor started for bot {}", self.bot_id); @@ -229,11 +402,13 @@ impl DriveMonitor { async fn check_for_changes(&self) -> Result<(), Box> { trace!("check_for_changes ENTER"); let start_mem = MemoryStats::current(); - 
trace!("[DRIVE_MONITOR] check_for_changes START, RSS={}", - MemoryStats::format_bytes(start_mem.rss_bytes)); + trace!( + "[DRIVE_MONITOR] check_for_changes START, RSS={}", + MemoryStats::format_bytes(start_mem.rss_bytes) + ); let Some(client) = &self.state.drive else { - trace!("check_for_changes: no drive client, returning"); + warn!("[DRIVE_MONITOR] No drive client available for bot {}, skipping file monitoring", self.bot_id); return Ok(()); }; @@ -242,34 +417,42 @@ impl DriveMonitor { self.check_gbdialog_changes(client).await?; trace!("check_for_changes: check_gbdialog_changes done"); let after_dialog = MemoryStats::current(); - trace!("[DRIVE_MONITOR] After gbdialog, RSS={} (delta={})", - MemoryStats::format_bytes(after_dialog.rss_bytes), - MemoryStats::format_bytes(after_dialog.rss_bytes.saturating_sub(start_mem.rss_bytes))); + trace!( + "[DRIVE_MONITOR] After gbdialog, RSS={} (delta={})", + MemoryStats::format_bytes(after_dialog.rss_bytes), + MemoryStats::format_bytes(after_dialog.rss_bytes.saturating_sub(start_mem.rss_bytes)) + ); trace!("check_for_changes: calling check_gbot..."); trace!("[DRIVE_MONITOR] Checking gbot..."); self.check_gbot(client).await?; trace!("check_for_changes: check_gbot done"); let after_gbot = MemoryStats::current(); - trace!("[DRIVE_MONITOR] After gbot, RSS={} (delta={})", - MemoryStats::format_bytes(after_gbot.rss_bytes), - MemoryStats::format_bytes(after_gbot.rss_bytes.saturating_sub(after_dialog.rss_bytes))); + trace!( + "[DRIVE_MONITOR] After gbot, RSS={} (delta={})", + MemoryStats::format_bytes(after_gbot.rss_bytes), + MemoryStats::format_bytes(after_gbot.rss_bytes.saturating_sub(after_dialog.rss_bytes)) + ); trace!("check_for_changes: calling check_gbkb_changes..."); trace!("[DRIVE_MONITOR] Checking gbkb..."); self.check_gbkb_changes(client).await?; trace!("check_for_changes: check_gbkb_changes done"); let after_gbkb = MemoryStats::current(); - trace!("[DRIVE_MONITOR] After gbkb, RSS={} (delta={})", - 
MemoryStats::format_bytes(after_gbkb.rss_bytes), - MemoryStats::format_bytes(after_gbkb.rss_bytes.saturating_sub(after_gbot.rss_bytes))); + trace!( + "[DRIVE_MONITOR] After gbkb, RSS={} (delta={})", + MemoryStats::format_bytes(after_gbkb.rss_bytes), + MemoryStats::format_bytes(after_gbkb.rss_bytes.saturating_sub(after_gbot.rss_bytes)) + ); log_jemalloc_stats(); let total_delta = after_gbkb.rss_bytes.saturating_sub(start_mem.rss_bytes); if total_delta > 50 * 1024 * 1024 { - warn!("[DRIVE_MONITOR] check_for_changes grew by {} - potential leak!", - MemoryStats::format_bytes(total_delta)); + warn!( + "[DRIVE_MONITOR] check_for_changes grew by {} - potential leak!", + MemoryStats::format_bytes(total_delta) + ); } trace!("check_for_changes EXIT"); @@ -344,12 +527,22 @@ impl DriveMonitor { for (path, state) in current_files { file_states.insert(path, state); } + // Save file states to disk in background to avoid blocking + let self_clone = Arc::new(self.clone()); + tokio::spawn(async move { + if let Err(e) = self_clone.save_file_states().await { + warn!("[DRIVE_MONITOR] Failed to save file states: {}", e); + } + }); Ok(()) } async fn check_gbot(&self, client: &Client) -> Result<(), Box> { trace!("check_gbot ENTER"); let config_manager = ConfigManager::new(self.state.conn.clone()); - debug!("check_gbot: Checking bucket {} for config.csv changes", self.bucket_name); + debug!( + "check_gbot: Checking bucket {} for config.csv changes", + self.bucket_name + ); let mut continuation_token = None; loop { let list_objects = match tokio::time::timeout( @@ -364,7 +557,10 @@ impl DriveMonitor { { Ok(Ok(list)) => list, Ok(Err(e)) => { - error!("check_gbot: Failed to list objects in bucket {}: {}", self.bucket_name, e); + error!( + "check_gbot: Failed to list objects in bucket {}: {}", + self.bucket_name, e + ); return Err(e.into()); } Err(_) => { @@ -380,6 +576,8 @@ impl DriveMonitor { || path_lower.ends_with("/config.csv") || path_lower.contains(".gbot/config.csv"); + 
debug!("check_gbot: Checking path: {} (is_config_csv: {})", path, is_config_csv); + if !is_config_csv { continue; } @@ -427,24 +625,20 @@ impl DriveMonitor { if key == "llm-model" { new_llm_model = new_value.to_string(); } - match config_manager.get_config(&self.bot_id, key, None) { - Ok(old_value) => { - if old_value != new_value { - info!( - "Detected change in {} (old: {}, new: {})", - key, old_value, new_value - ); - restart_needed = true; - if key == "llm-url" || key == "llm-model" { - llm_url_changed = true; - } - } - } - Err(_) => { - restart_needed = true; - if key == "llm-url" || key == "llm-model" { - llm_url_changed = true; - } + let normalized_old_value = match config_manager.get_config(&self.bot_id, key, None) { + Ok(val) => Self::normalize_config_value(&val), + Err(_) => String::new(), + }; + let normalized_new_value = Self::normalize_config_value(new_value); + + if normalized_old_value != normalized_new_value { + info!( + "Detected change in {} (old: {}, new: {})", + key, normalized_old_value, normalized_new_value + ); + restart_needed = true; + if key == "llm-url" || key == "llm-model" { + llm_url_changed = true; } } } @@ -464,15 +658,47 @@ impl DriveMonitor { let effective_url = if !new_llm_url.is_empty() { new_llm_url } else { - config_manager.get_config(&self.bot_id, "llm-url", None).unwrap_or_default() + config_manager + .get_config(&self.bot_id, "llm-url", None) + .unwrap_or_default() }; let effective_model = if !new_llm_model.is_empty() { new_llm_model } else { - config_manager.get_config(&self.bot_id, "llm-model", None).unwrap_or_default() + config_manager + .get_config(&self.bot_id, "llm-model", None) + .unwrap_or_default() }; - info!("LLM configuration changed to: URL={}, Model={}", effective_url, effective_model); + info!( + "LLM configuration changed to: URL={}, Model={}", + effective_url, effective_model + ); + + // Read the llm-endpoint-path config + let effective_endpoint_path = config_manager + .get_config( + &self.bot_id, + 
"llm-endpoint-path", + Some("/v1/chat/completions"), + ) + .unwrap_or_else(|_| "/v1/chat/completions".to_string()); + + // Update the DynamicLLMProvider with the new configuration + #[cfg(feature = "llm")] + if let Some(dynamic_llm) = &self.state.dynamic_llm_provider + { + dynamic_llm + .update_from_config( + &effective_url, + Some(effective_model), + Some(effective_endpoint_path), + ) + .await; + info!("Dynamic LLM provider updated with new configuration"); + } else { + warn!("Dynamic LLM provider not available - config change ignored"); + } } } } @@ -481,7 +707,6 @@ impl DriveMonitor { { let _ = config_manager.sync_gbot_config(&self.bot_id, &csv_content); } - } if csv_content.lines().any(|line| line.starts_with("theme-")) { self.broadcast_theme_change(&csv_content).await?; @@ -625,6 +850,170 @@ impl DriveMonitor { Ok::<(), Box>(()) }) .await??; + + // Check for USE WEBSITE commands and trigger immediate crawling + if source_content.contains("USE WEBSITE") { + self.trigger_immediate_website_crawl(&source_content).await?; + } + + Ok(()) + } + + async fn trigger_immediate_website_crawl( + &self, + source_content: &str, + ) -> Result<(), Box> { + use regex::Regex; + use std::collections::HashSet; + use diesel::prelude::*; + + #[derive(QueryableByName)] + struct CountResult { + #[diesel(sql_type = diesel::sql_types::BigInt)] + count: i64, + } + + let re = Regex::new(r#"USE\s+WEBSITE\s+"([^"]+)"(?:\s+REFRESH\s+"([^"]+)")?"#)?; + let mut processed_urls = HashSet::new(); + + for cap in re.captures_iter(source_content) { + if let Some(url) = cap.get(1) { + let url_str = url.as_str(); + + // Prevent duplicate processing of same URL in single batch + if processed_urls.contains(url_str) { + trace!("Skipping duplicate URL in batch: {}", url_str); + continue; + } + processed_urls.insert(url_str.to_string()); + + let refresh_str = cap.get(2).map(|m| m.as_str()).unwrap_or("1m"); + + info!("Found USE WEBSITE command for {}, checking if crawl needed", url_str); + + // Check if 
crawl is already in progress or recently completed + let mut conn = self.state.conn.get() + .map_err(|e| format!("Failed to get database connection: {}", e))?; + + // Check if crawl is already running or recently completed (within last 5 minutes) + let recent_crawl: Result = diesel::sql_query( + "SELECT COUNT(*) as count FROM website_crawls + WHERE bot_id = $1 AND url = $2 + AND (crawl_status = 2 OR (last_crawled > NOW() - INTERVAL '5 minutes'))" + ) + .bind::(&self.bot_id) + .bind::(url_str) + .get_result::(&mut conn) + .map(|r| r.count); + + if recent_crawl.unwrap_or(0) > 0 { + trace!("Skipping crawl for {} - already in progress or recently completed", url_str); + continue; + } + + crate::basic::keywords::use_website::register_website_for_crawling_with_refresh( + &mut conn, &self.bot_id, url_str, refresh_str + )?; + + // Use a semaphore to limit concurrent crawls + static CRAWL_SEMAPHORE: tokio::sync::Semaphore = tokio::sync::Semaphore::const_new(1); // Reduced to 1 + + let kb_manager = self.state.kb_manager.clone(); + let db_pool = self.state.conn.clone(); + let bot_id = self.bot_id; + let url_owned = url_str.to_string(); + + // Don't spawn if semaphore is full + if let Ok(_permit) = CRAWL_SEMAPHORE.try_acquire() { + tokio::spawn(async move { + if let Err(e) = Self::crawl_website_immediately(url_owned, bot_id, kb_manager, db_pool).await { + error!("Failed to immediately crawl website: {}", e); + } + // Permit is automatically dropped here + }); + } else { + warn!("Crawl semaphore full, skipping immediate crawl for {}", url_str); + } + } + } + + Ok(()) + } + + async fn crawl_website_immediately( + url: String, + _bot_id: uuid::Uuid, + _kb_manager: Option>, + _db_pool: crate::shared::DbPool, + ) -> Result<(), Box> { + #[cfg(feature = "crawler")] + { + use crate::core::kb::website_crawler_service::WebsiteCrawlerService; + use diesel::prelude::*; + + let kb_manager = match _kb_manager { + Some(kb) => kb, + None => { + warn!("Knowledge base manager not available, 
skipping website crawl"); + return Ok(()); + } + }; + + let mut conn = _db_pool.get()?; + + // Get the website record + #[derive(diesel::QueryableByName)] + struct WebsiteRecord { + #[diesel(sql_type = diesel::sql_types::Uuid)] + id: uuid::Uuid, + #[diesel(sql_type = diesel::sql_types::Uuid)] + bot_id: uuid::Uuid, + #[diesel(sql_type = diesel::sql_types::Text)] + url: String, + #[diesel(sql_type = diesel::sql_types::Text)] + expires_policy: String, + #[diesel(sql_type = diesel::sql_types::Text)] + refresh_policy: String, + #[diesel(sql_type = diesel::sql_types::Integer)] + max_depth: i32, + #[diesel(sql_type = diesel::sql_types::Integer)] + max_pages: i32, + } + + let website: WebsiteRecord = diesel::sql_query( + "SELECT id, bot_id, url, expires_policy, refresh_policy, max_depth, max_pages + FROM website_crawls + WHERE bot_id = $1 AND url = $2" + ) + .bind::(&_bot_id) + .bind::(&url) + .get_result(&mut conn)?; + + // Convert to WebsiteCrawlRecord format expected by crawl_website + let website_record = crate::core::kb::website_crawler_service::WebsiteCrawlRecord { + id: website.id, + bot_id: website.bot_id, + url: website.url, + expires_policy: website.expires_policy, + refresh_policy: Some(website.refresh_policy), + max_depth: website.max_depth, + max_pages: website.max_pages, + next_crawl: None, + crawl_status: Some(0), + }; + + // Create a temporary crawler service to use its crawl_website method + let crawler_service = WebsiteCrawlerService::new(_db_pool.clone(), kb_manager); + match crawler_service.crawl_single_website(website_record).await { + Ok(_) => {}, + Err(e) => return Err(format!("Website crawl failed: {}", e).into()), + } + } + #[cfg(not(feature = "crawler"))] + { + warn!("Crawler feature not enabled, skipping website crawl for {}", url); + } + Ok(()) } @@ -746,7 +1135,10 @@ impl DriveMonitor { .unwrap_or(false); if kb_indexing_disabled { - debug!("KB indexing disabled via DISABLE_KB_INDEXING, skipping {}", kb_folder_path.display()); + debug!( + "KB 
indexing disabled via DISABLE_KB_INDEXING, skipping {}", + kb_folder_path.display() + ); continue; } @@ -790,8 +1182,9 @@ impl DriveMonitor { let result = tokio::time::timeout( Duration::from_secs(KB_INDEXING_TIMEOUT_SECS), - kb_manager.handle_gbkb_change(&bot_name_owned, &kb_folder_owned) - ).await; + kb_manager.handle_gbkb_change(&bot_name_owned, &kb_folder_owned), + ) + .await; // Always remove from tracking set when done, regardless of outcome { @@ -829,7 +1222,9 @@ impl DriveMonitor { #[cfg(not(any(feature = "research", feature = "llm")))] { let _ = kb_folder_path; - debug!("KB indexing disabled because research/llm features are not enabled"); + debug!( + "KB indexing disabled because research/llm features are not enabled" + ); } } } @@ -853,11 +1248,18 @@ impl DriveMonitor { files_processed, pdf_files_found ); } - for (path, state) in current_files { file_states.insert(path, state); } + // Save file states to disk in background to avoid blocking + let self_clone = Arc::new(self.clone()); + tokio::spawn(async move { + if let Err(e) = self_clone.save_file_states().await { + warn!("[DRIVE_MONITOR] Failed to save file states: {}", e); + } + }); + for path in paths_to_remove { info!("Detected deletion in .gbkb: {}", path); file_states.remove(&path); diff --git a/src/drive/mod.rs b/src/drive/mod.rs index 757a15ea5..2e6a278fb 100644 --- a/src/drive/mod.rs +++ b/src/drive/mod.rs @@ -1141,7 +1141,7 @@ pub async fn list_versions( let bucket = params.bucket.unwrap_or_else(|| "default".to_string()); let path = params.path; - let s3_client = state.s3_client.as_ref().ok_or_else(|| { + let s3_client = state.drive.as_ref().ok_or_else(|| { ( StatusCode::SERVICE_UNAVAILABLE, Json(serde_json::json!({ "error": "S3 storage not configured" })), @@ -1194,7 +1194,7 @@ pub async fn restore_version( let path = payload.path; let version_id = payload.version_id; - let s3_client = state.s3_client.as_ref().ok_or_else(|| { + let s3_client = state.drive.as_ref().ok_or_else(|| { ( 
StatusCode::SERVICE_UNAVAILABLE, Json(serde_json::json!({ "error": "S3 storage not configured" })), diff --git a/src/email/vectordb.rs b/src/email/vectordb.rs index 1f52db0dc..327008dbd 100644 --- a/src/email/vectordb.rs +++ b/src/email/vectordb.rs @@ -481,9 +481,12 @@ impl EmailEmbeddingGenerator { async fn generate_local_embedding(&self, text: &str, embedding_url: &str) -> Result> { use serde_json::json; + // Truncate text to fit within token limit (600 tokens for safety under 768 limit) + let truncated_text = crate::core::shared::utils::truncate_text_for_model(text, "sentence-transformers/all-MiniLM-L6-v2", 600); + let client = reqwest::Client::new(); let body = json!({ - "text": text, + "text": truncated_text, "model": "sentence-transformers/all-MiniLM-L6-v2" }); diff --git a/src/embedded_ui.rs b/src/embedded_ui.rs index 11cf130ff..b290711cf 100644 --- a/src/embedded_ui.rs +++ b/src/embedded_ui.rs @@ -3,10 +3,10 @@ use axum::{ http::{header, Request, Response, StatusCode}, Router, }; -use rust_embed::Embed; +use rust_embed::RustEmbed; use std::path::Path; -#[derive(Embed)] +#[derive(RustEmbed)] #[folder = "../botui/ui/suite/"] #[prefix = ""] struct EmbeddedUi; @@ -56,6 +56,10 @@ async fn serve_embedded_file(req: Request) -> Response { path }; + let file_path = file_path.strip_prefix("suite/").unwrap_or(file_path); + + log::trace!("Serving embedded file: {}", file_path); + let try_paths = [ file_path.to_string(), format!("{}/index.html", file_path.trim_end_matches('/')), @@ -66,6 +70,8 @@ async fn serve_embedded_file(req: Request) -> Response { if let Some(content) = EmbeddedUi::get(try_path) { let mime = get_mime_type(try_path); + log::trace!("Found embedded file: {} with MIME type: {}", try_path, mime); + return Response::builder() .status(StatusCode::OK) .header(header::CONTENT_TYPE, mime) @@ -80,6 +86,8 @@ async fn serve_embedded_file(req: Request) -> Response { } } + log::warn!("Embedded file not found: {} (tried paths: {:?})", file_path, try_paths); + 
Response::builder() .status(StatusCode::NOT_FOUND) .header(header::CONTENT_TYPE, "text/html; charset=utf-8") @@ -103,9 +111,17 @@ pub fn embedded_ui_router() -> Router { } pub fn has_embedded_ui() -> bool { - EmbeddedUi::get("index.html").is_some() + let has_index = EmbeddedUi::get("index.html").is_some(); + if has_index { + log::info!("Embedded UI detected - index.html found"); + } else { + log::warn!("No embedded UI found - index.html not embedded"); + } + has_index } pub fn list_embedded_files() -> Vec { - EmbeddedUi::iter().map(|f| f.to_string()).collect() + let files: Vec = EmbeddedUi::iter().map(|f| f.to_string()).collect(); + log::debug!("Embedded UI contains {} files", files.len()); + files } diff --git a/src/llm/cache.rs b/src/llm/cache.rs index 751e7e57d..f9d005cf8 100644 --- a/src/llm/cache.rs +++ b/src/llm/cache.rs @@ -344,7 +344,7 @@ impl CachedLLMProvider { .await; if similarity >= self.config.similarity_threshold - && best_match.as_ref().map_or(true, |(_, s)| *s < similarity) + && best_match.as_ref().is_none_or(|(_, s)| *s < similarity) { best_match = Some((cached.clone(), similarity)); } @@ -623,7 +623,7 @@ impl EmbeddingService for LocalEmbeddingService { ) -> Result, Box> { let client = reqwest::Client::new(); let response = client - .post(format!("{}/embeddings", self.embedding_url)) + .post(format!("{}/embedding", self.embedding_url)) .json(&serde_json::json!({ "input": text, "model": self.model, diff --git a/src/llm/claude.rs b/src/llm/claude.rs index 7d46a3f3f..0b3f99c71 100644 --- a/src/llm/claude.rs +++ b/src/llm/claude.rs @@ -552,8 +552,7 @@ impl ClaudeClient { if !sse_buffer.is_empty() { for line in sse_buffer.lines() { let line = line.trim(); - if line.starts_with("data: ") { - let data = &line[6..]; + if let Some(data) = line.strip_prefix("data: ") { if data != "[DONE]" { if let Some(text) = self.process_sse_data(data, model_name) { let _ = tx.send(text).await; diff --git a/src/llm/local.rs b/src/llm/local.rs index dd5a8c136..a2d2c9af2 
100644 --- a/src/llm/local.rs +++ b/src/llm/local.rs @@ -16,8 +16,10 @@ pub async fn ensure_llama_servers_running( ) -> Result<(), Box> { trace!("ensure_llama_servers_running ENTER"); let start_mem = MemoryStats::current(); - trace!("[LLM_LOCAL] ensure_llama_servers_running START, RSS={}", - MemoryStats::format_bytes(start_mem.rss_bytes)); + trace!( + "[LLM_LOCAL] ensure_llama_servers_running START, RSS={}", + MemoryStats::format_bytes(start_mem.rss_bytes) + ); log_jemalloc_stats(); if std::env::var("SKIP_LLM_SERVER").is_ok() { @@ -28,33 +30,32 @@ pub async fn ensure_llama_servers_running( let config_values = { let conn_arc = app_state.conn.clone(); - let default_bot_id = tokio::task::spawn_blocking(move || -> Result { - let mut conn = conn_arc.get().map_err(|e| format!("failed to get db connection: {e}"))?; - let bot_id = bots.filter(name.eq("default")) - .select(id) - .first::(&mut *conn) - .unwrap_or_else(|_| uuid::Uuid::nil()); - Ok(bot_id) + let (default_bot_id, _default_bot_name) = tokio::task::spawn_blocking(move || -> Result<(uuid::Uuid, String), String> { + let mut conn = conn_arc + .get() + .map_err(|e| format!("failed to get db connection: {e}"))?; + Ok(crate::bot::get_default_bot(&mut *conn)) }) .await??; let config_manager = ConfigManager::new(app_state.conn.clone()); + info!("Reading config for bot_id: {}", default_bot_id); + let embedding_model_result = config_manager.get_config(&default_bot_id, "embedding-model", None); + info!("embedding-model config result: {:?}", embedding_model_result); ( default_bot_id, config_manager .get_config(&default_bot_id, "llm-server", Some("true")) .unwrap_or_else(|_| "true".to_string()), config_manager - .get_config(&default_bot_id, "llm-url", None) - .unwrap_or_default(), + .get_config(&default_bot_id, "llm-url", Some("http://localhost:8081")) + .unwrap_or_else(|_| "http://localhost:8081".to_string()), config_manager .get_config(&default_bot_id, "llm-model", None) .unwrap_or_default(), config_manager - 
.get_config(&default_bot_id, "embedding-url", None) - .unwrap_or_default(), - config_manager - .get_config(&default_bot_id, "embedding-model", None) - .unwrap_or_default(), + .get_config(&default_bot_id, "embedding-url", Some("http://localhost:8082")) + .unwrap_or_else(|_| "http://localhost:8082".to_string()), + embedding_model_result.unwrap_or_default(), config_manager .get_config(&default_bot_id, "llm-server-path", None) .unwrap_or_default(), @@ -87,7 +88,10 @@ pub async fn ensure_llama_servers_running( info!("Restarting any existing llama-server processes..."); trace!("About to pkill llama-server..."); let before_pkill = MemoryStats::current(); - trace!("[LLM_LOCAL] Before pkill, RSS={}", MemoryStats::format_bytes(before_pkill.rss_bytes)); + trace!( + "[LLM_LOCAL] Before pkill, RSS={}", + MemoryStats::format_bytes(before_pkill.rss_bytes) + ); let pkill_result = SafeCommand::new("sh") .and_then(|c| c.arg("-c")) @@ -107,9 +111,11 @@ pub async fn ensure_llama_servers_running( trace!("pkill done"); let after_pkill = MemoryStats::current(); - trace!("[LLM_LOCAL] After pkill, RSS={} (delta={})", - MemoryStats::format_bytes(after_pkill.rss_bytes), - MemoryStats::format_bytes(after_pkill.rss_bytes.saturating_sub(before_pkill.rss_bytes))); + trace!( + "[LLM_LOCAL] After pkill, RSS={} (delta={})", + MemoryStats::format_bytes(after_pkill.rss_bytes), + MemoryStats::format_bytes(after_pkill.rss_bytes.saturating_sub(before_pkill.rss_bytes)) + ); let llm_running = if llm_url.starts_with("https://") { info!("Using external HTTPS LLM server, skipping local startup"); @@ -162,7 +168,10 @@ pub async fn ensure_llama_servers_running( info!("Waiting for servers to become ready..."); trace!("Starting wait loop for servers..."); let before_wait = MemoryStats::current(); - trace!("[LLM_LOCAL] Before wait loop, RSS={}", MemoryStats::format_bytes(before_wait.rss_bytes)); + trace!( + "[LLM_LOCAL] Before wait loop, RSS={}", + MemoryStats::format_bytes(before_wait.rss_bytes) + ); let mut 
llm_ready = llm_running || llm_model.is_empty(); let mut embedding_ready = embedding_running || embedding_model.is_empty(); @@ -174,10 +183,12 @@ pub async fn ensure_llama_servers_running( if attempts % 5 == 0 { let loop_mem = MemoryStats::current(); - trace!("[LLM_LOCAL] Wait loop attempt {}, RSS={} (delta from start={})", - attempts, - MemoryStats::format_bytes(loop_mem.rss_bytes), - MemoryStats::format_bytes(loop_mem.rss_bytes.saturating_sub(before_wait.rss_bytes))); + trace!( + "[LLM_LOCAL] Wait loop attempt {}, RSS={} (delta from start={})", + attempts, + MemoryStats::format_bytes(loop_mem.rss_bytes), + MemoryStats::format_bytes(loop_mem.rss_bytes.saturating_sub(before_wait.rss_bytes)) + ); log_jemalloc_stats(); } @@ -231,20 +242,25 @@ pub async fn ensure_llama_servers_running( trace!("Servers ready!"); let after_ready = MemoryStats::current(); - trace!("[LLM_LOCAL] Servers ready, RSS={} (delta from start={})", - MemoryStats::format_bytes(after_ready.rss_bytes), - MemoryStats::format_bytes(after_ready.rss_bytes.saturating_sub(start_mem.rss_bytes))); + trace!( + "[LLM_LOCAL] Servers ready, RSS={} (delta from start={})", + MemoryStats::format_bytes(after_ready.rss_bytes), + MemoryStats::format_bytes(after_ready.rss_bytes.saturating_sub(start_mem.rss_bytes)) + ); log_jemalloc_stats(); let _llm_provider1 = Arc::new(crate::llm::OpenAIClient::new( llm_model.clone(), Some(llm_url.clone()), + None, )); let end_mem = MemoryStats::current(); - trace!("[LLM_LOCAL] ensure_llama_servers_running END, RSS={} (total delta={})", - MemoryStats::format_bytes(end_mem.rss_bytes), - MemoryStats::format_bytes(end_mem.rss_bytes.saturating_sub(start_mem.rss_bytes))); + trace!( + "[LLM_LOCAL] ensure_llama_servers_running END, RSS={} (total delta={})", + MemoryStats::format_bytes(end_mem.rss_bytes), + MemoryStats::format_bytes(end_mem.rss_bytes.saturating_sub(start_mem.rss_bytes)) + ); log_jemalloc_stats(); trace!("ensure_llama_servers_running EXIT OK"); @@ -298,8 +314,11 @@ pub fn 
start_llm_server( std::env::set_var("OMP_PROC_BIND", "close"); let conn = app_state.conn.clone(); let config_manager = ConfigManager::new(conn.clone()); - let mut conn = conn.get() - .map_err(|e| Box::new(std::io::Error::new(std::io::ErrorKind::Other, format!("failed to get db connection: {e}"))) as Box)?; + let mut conn = conn.get().map_err(|e| { + Box::new(std::io::Error::other( + format!("failed to get db connection: {e}"), + )) as Box + })?; let default_bot_id = bots .filter(name.eq("default")) .select(id) @@ -332,10 +351,10 @@ pub fn start_llm_server( let n_ctx_size = config_manager .get_config(&default_bot_id, "llm-server-ctx-size", None) - .unwrap_or_else(|_| "4096".to_string()); + .unwrap_or_else(|_| "32000".to_string()); let mut args = format!( - "-m {model_path} --host 0.0.0.0 --port {port} --top_p 0.95 --temp 0.6 --repeat-penalty 1.2 --n-gpu-layers {gpu_layers}" + "-m {model_path} --host 0.0.0.0 --port {port} --top_p 0.95 --temp 0.6 --repeat-penalty 1.2 --n-gpu-layers {gpu_layers} --ubatch-size 2048" ); if !reasoning_format.is_empty() { let _ = write!(args, " --reasoning-format {reasoning_format}"); @@ -369,8 +388,16 @@ pub fn start_llm_server( let cmd = SafeCommand::new("cmd") .and_then(|c| c.arg("/C")) .and_then(|c| c.trusted_shell_script_arg(&cmd_arg)) - .map_err(|e| Box::new(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())) as Box)?; - cmd.execute().map_err(|e| Box::new(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())) as Box)?; + .map_err(|e| { + Box::new(std::io::Error::other( + e.to_string(), + )) as Box + })?; + cmd.execute().map_err(|e| { + Box::new(std::io::Error::other( + e.to_string(), + )) as Box + })?; } else { let cmd_arg = format!( "cd {llama_cpp_path} && ./llama-server {args} --verbose >llm-stdout.log 2>&1 &" @@ -381,8 +408,16 @@ pub fn start_llm_server( let cmd = SafeCommand::new("sh") .and_then(|c| c.arg("-c")) .and_then(|c| c.trusted_shell_script_arg(&cmd_arg)) - .map_err(|e| 
Box::new(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())) as Box)?; - cmd.execute().map_err(|e| Box::new(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())) as Box)?; + .map_err(|e| { + Box::new(std::io::Error::other( + e.to_string(), + )) as Box + })?; + cmd.execute().map_err(|e| { + Box::new(std::io::Error::other( + e.to_string(), + )) as Box + })?; } Ok(()) } @@ -413,11 +448,19 @@ pub async fn start_embedding_server( let cmd = SafeCommand::new("cmd") .and_then(|c| c.arg("/c")) .and_then(|c| c.trusted_shell_script_arg(&cmd_arg)) - .map_err(|e| Box::new(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())) as Box)?; - cmd.execute().map_err(|e| Box::new(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())) as Box)?; + .map_err(|e| { + Box::new(std::io::Error::other( + e.to_string(), + )) as Box + })?; + cmd.execute().map_err(|e| { + Box::new(std::io::Error::other( + e.to_string(), + )) as Box + })?; } else { let cmd_arg = format!( - "cd {llama_cpp_path} && ./llama-server -m {model_path} --verbose --host 0.0.0.0 --port {port} --embedding --n-gpu-layers 99 >llmembd-stdout.log 2>&1 &" + "cd {llama_cpp_path} && ./llama-server -m {model_path} --verbose --host 0.0.0.0 --port {port} --embedding --n-gpu-layers 99 --ubatch-size 2048 >llmembd-stdout.log 2>&1 &" ); info!( "Executing embedding server command: cd {llama_cpp_path} && ./llama-server -m {model_path} --host 0.0.0.0 --port {port} --embedding" @@ -425,8 +468,16 @@ pub async fn start_embedding_server( let cmd = SafeCommand::new("sh") .and_then(|c| c.arg("-c")) .and_then(|c| c.trusted_shell_script_arg(&cmd_arg)) - .map_err(|e| Box::new(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())) as Box)?; - cmd.execute().map_err(|e| Box::new(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())) as Box)?; + .map_err(|e| { + Box::new(std::io::Error::other( + e.to_string(), + )) as Box + })?; + cmd.execute().map_err(|e| { + Box::new(std::io::Error::other( + 
e.to_string(), + )) as Box + })?; } tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; diff --git a/src/llm/mod.rs.orig b/src/llm/mod.rs.orig new file mode 100644 index 000000000..69b5791ea --- /dev/null +++ b/src/llm/mod.rs.orig @@ -0,0 +1,720 @@ +use async_trait::async_trait; +use futures::StreamExt; +use log::{error, info}; +use serde_json::Value; +use std::sync::Arc; +use tokio::sync::{mpsc, RwLock}; + +pub mod cache; +pub mod claude; +pub mod episodic_memory; +pub mod llm_models; +pub mod local; +pub mod smart_router; + +pub use claude::ClaudeClient; +pub use llm_models::get_handler; + +#[async_trait] +pub trait LLMProvider: Send + Sync { + async fn generate( + &self, + prompt: &str, + config: &Value, + model: &str, + key: &str, + ) -> Result>; + + async fn generate_stream( + &self, + prompt: &str, + config: &Value, + tx: mpsc::Sender, + model: &str, + key: &str, + ) -> Result<(), Box>; + + async fn cancel_job( + &self, + session_id: &str, + ) -> Result<(), Box>; +} + +#[derive(Debug)] +pub struct OpenAIClient { + client: reqwest::Client, + base_url: String, + endpoint_path: String, +} + +impl OpenAIClient { + /// Estimates token count for a text string (roughly 4 characters per token for English) + fn estimate_tokens(text: &str) -> usize { + // Rough estimate: ~4 characters per token for English text + // This is a heuristic and may not be accurate for all languages + text.len().div_ceil(4) + } + + /// Estimates total tokens for a messages array + fn estimate_messages_tokens(messages: &Value) -> usize { + if let Some(msg_array) = messages.as_array() { + msg_array + .iter() + .map(|msg| { + if let Some(content) = msg.get("content").and_then(|c| c.as_str()) { + Self::estimate_tokens(content) + } else { + 0 + } + }) + .sum() + } else { + 0 + } + } + + /// Truncates messages to fit within the max_tokens limit + /// Keeps system messages and the most recent user/assistant messages + fn truncate_messages(messages: &Value, max_tokens: usize) -> Value { + 
let mut result = Vec::new(); + let mut token_count = 0; + + if let Some(msg_array) = messages.as_array() { + // First pass: keep all system messages + for msg in msg_array { + if let Some(role) = msg.get("role").and_then(|r| r.as_str()) { + if role == "system" { + if let Some(content) = msg.get("content").and_then(|c| c.as_str()) { + let msg_tokens = Self::estimate_tokens(content); + if token_count + msg_tokens <= max_tokens { + result.push(msg.clone()); + token_count += msg_tokens; + } + } + } + } + } + + // Second pass: add user/assistant messages from newest to oldest + let mut recent_messages: Vec<&Value> = msg_array + .iter() + .filter(|msg| msg.get("role").and_then(|r| r.as_str()) != Some("system")) + .collect(); + + // Reverse to get newest first + recent_messages.reverse(); + + for msg in recent_messages { + if let Some(content) = msg.get("content").and_then(|c| c.as_str()) { + let msg_tokens = Self::estimate_tokens(content); + if token_count + msg_tokens <= max_tokens { + result.push(msg.clone()); + token_count += msg_tokens; + } else { + break; + } + } + } + + // Reverse back to chronological order for non-system messages + // But keep system messages at the beginning + let system_count = result.len() + - result + .iter() + .filter(|m| m.get("role").and_then(|r| r.as_str()) != Some("system")) + .count(); + let mut user_messages: Vec = result.drain(system_count..).collect(); + user_messages.reverse(); + result.extend(user_messages); + } + + serde_json::Value::Array(result) + } + + /// Ensures messages fit within model's context limit + fn ensure_token_limit(messages: &Value, model_context_limit: usize) -> Value { + let estimated_tokens = Self::estimate_messages_tokens(messages); + + // Use 90% of context limit to leave room for response + let safe_limit = (model_context_limit as f64 * 0.9) as usize; + + if estimated_tokens > safe_limit { + log::warn!( + "Messages exceed token limit ({} > {}), truncating...", + estimated_tokens, + safe_limit + ); + 
Self::truncate_messages(messages, safe_limit) + } else { + messages.clone() + } + } + pub fn new(_api_key: String, base_url: Option, endpoint_path: Option) -> Self { + Self { + client: reqwest::Client::new(), + base_url: base_url.unwrap_or_else(|| "https://api.openai.com".to_string()), + endpoint_path: endpoint_path.unwrap_or_else(|| "/v1/chat/completions".to_string()), + } + } + + pub fn build_messages( + system_prompt: &str, + context_data: &str, + history: &[(String, String)], + ) -> Value { + let mut messages = Vec::new(); + if !system_prompt.is_empty() { + messages.push(serde_json::json!({ + "role": "system", + "content": system_prompt + })); + } + if !context_data.is_empty() { + messages.push(serde_json::json!({ + "role": "system", + "content": context_data + })); + } + for (role, content) in history { + messages.push(serde_json::json!({ + "role": role, + "content": content + })); + } + serde_json::Value::Array(messages) + } +} + +#[async_trait] +impl LLMProvider for OpenAIClient { + async fn generate( + &self, + prompt: &str, + messages: &Value, + model: &str, + key: &str, + ) -> Result> { + let default_messages = serde_json::json!([{"role": "user", "content": prompt}]); + + // Get the messages to use + let raw_messages = + if messages.is_array() && !messages.as_array().unwrap_or(&vec![]).is_empty() { + messages + } else { + &default_messages + }; + + // Ensure messages fit within model's context limit + // GLM-4.7 has 202750 tokens, other models vary + let context_limit = if model.contains("glm-4") || model.contains("GLM-4") { + 202750 + } else if model.contains("gpt-4") { + 128000 + } else if model.contains("gpt-3.5") { + 16385 + } else { + model.starts_with("http://localhost:808") ? 
768 : 4096 // Local llama.cpp or default limit + }; + + let messages = OpenAIClient::ensure_token_limit(raw_messages, context_limit); + + let response = self + .client + .post(format!("{}{}", self.base_url, self.endpoint_path)) + .header("Authorization", format!("Bearer {}", key)) + .json(&serde_json::json!({ + "model": model, + "messages": messages + })) + .send() + .await?; + + let status = response.status(); + if status != reqwest::StatusCode::OK { + let error_text = response.text().await.unwrap_or_default(); + error!("LLM generate error: {}", error_text); + return Err(format!("LLM request failed with status: {}", status).into()); + } + + let result: Value = response.json().await?; + let raw_content = result["choices"][0]["message"]["content"] + .as_str() + .unwrap_or(""); + + let handler = get_handler(model); + let content = handler.process_content(raw_content); + + Ok(content) + } + + async fn generate_stream( + &self, + prompt: &str, + messages: &Value, + tx: mpsc::Sender, + model: &str, + key: &str, + ) -> Result<(), Box> { + let default_messages = serde_json::json!([{"role": "user", "content": prompt}]); + + // Get the messages to use + let raw_messages = + if messages.is_array() && !messages.as_array().unwrap_or(&vec![]).is_empty() { + info!("Using provided messages: {:?}", messages); + messages + } else { + &default_messages + }; + + // Ensure messages fit within model's context limit + // GLM-4.7 has 202750 tokens, other models vary + let context_limit = if model.contains("glm-4") || model.contains("GLM-4") { + 202750 + } else if model.contains("gpt-4") { + 128000 + } else if model.contains("gpt-3.5") { + 16385 + } else { + model.starts_with("http://localhost:808") ? 
768 : 4096 // Local llama.cpp or default limit + }; + + let messages = OpenAIClient::ensure_token_limit(raw_messages, context_limit); + + let response = self + .client + .post(format!("{}{}", self.base_url, self.endpoint_path)) + .header("Authorization", format!("Bearer {}", key)) + .json(&serde_json::json!({ + "model": model, + "messages": messages, + "stream": true + })) + .send() + .await?; + + let status = response.status(); + if status != reqwest::StatusCode::OK { + let error_text = response.text().await.unwrap_or_default(); + error!("LLM generate_stream error: {}", error_text); + return Err(format!("LLM request failed with status: {}", status).into()); + } + + let handler = get_handler(model); + let mut stream = response.bytes_stream(); + + while let Some(chunk_result) = stream.next().await { + let chunk = chunk_result?; + let chunk_str = String::from_utf8_lossy(&chunk); + for line in chunk_str.lines() { + if line.starts_with("data: ") && !line.contains("[DONE]") { + if let Ok(data) = serde_json::from_str::(&line[6..]) { + if let Some(content) = data["choices"][0]["delta"]["content"].as_str() { + let processed = handler.process_content(content); + if !processed.is_empty() { + let _ = tx.send(processed).await; + } + } + } + } + } + } + + Ok(()) + } + + async fn cancel_job( + &self, + _session_id: &str, + ) -> Result<(), Box> { + Ok(()) + } +} + +pub fn start_llm_services(state: &std::sync::Arc) { + episodic_memory::start_episodic_memory_scheduler(std::sync::Arc::clone(state)); + info!("LLM services started (episodic memory scheduler)"); +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LLMProviderType { + OpenAI, + Claude, + AzureClaude, +} + +impl From<&str> for LLMProviderType { + fn from(s: &str) -> Self { + let lower = s.to_lowercase(); + if lower.contains("claude") || lower.contains("anthropic") { + if lower.contains("azure") { + Self::AzureClaude + } else { + Self::Claude + } + } else { + Self::OpenAI + } + } +} + +pub fn create_llm_provider( + 
provider_type: LLMProviderType, + base_url: String, + deployment_name: Option, + endpoint_path: Option, +) -> std::sync::Arc { + match provider_type { + LLMProviderType::OpenAI => { + info!("Creating OpenAI LLM provider with URL: {}", base_url); + std::sync::Arc::new(OpenAIClient::new( + "empty".to_string(), + Some(base_url), + endpoint_path, + )) + } + LLMProviderType::Claude => { + info!("Creating Claude LLM provider with URL: {}", base_url); + std::sync::Arc::new(ClaudeClient::new(base_url, deployment_name)) + } + LLMProviderType::AzureClaude => { + let deployment = deployment_name.unwrap_or_else(|| "claude-opus-4-5".to_string()); + info!( + "Creating Azure Claude LLM provider with URL: {}, deployment: {}", + base_url, deployment + ); + std::sync::Arc::new(ClaudeClient::azure(base_url, deployment)) + } + } +} + +pub fn create_llm_provider_from_url( + url: &str, + model: Option, + endpoint_path: Option, +) -> std::sync::Arc { + let provider_type = LLMProviderType::from(url); + create_llm_provider(provider_type, url.to_string(), model, endpoint_path) +} + +pub struct DynamicLLMProvider { + inner: RwLock>, +} + +impl DynamicLLMProvider { + pub fn new(provider: Arc) -> Self { + Self { + inner: RwLock::new(provider), + } + } + + pub async fn update_provider(&self, new_provider: Arc) { + let mut guard = self.inner.write().await; + *guard = new_provider; + info!("LLM provider updated dynamically"); + } + + pub async fn update_from_config( + &self, + url: &str, + model: Option, + endpoint_path: Option, + ) { + let new_provider = create_llm_provider_from_url(url, model, endpoint_path); + self.update_provider(new_provider).await; + } + + async fn get_provider(&self) -> Arc { + self.inner.read().await.clone() + } +} + +#[async_trait] +impl LLMProvider for DynamicLLMProvider { + async fn generate( + &self, + prompt: &str, + config: &Value, + model: &str, + key: &str, + ) -> Result> { + self.get_provider() + .await + .generate(prompt, config, model, key) + .await + } + + 
async fn generate_stream( + &self, + prompt: &str, + config: &Value, + tx: mpsc::Sender, + model: &str, + key: &str, + ) -> Result<(), Box> { + self.get_provider() + .await + .generate_stream(prompt, config, tx, model, key) + .await + } + + async fn cancel_job( + &self, + session_id: &str, + ) -> Result<(), Box> { + self.get_provider().await.cancel_job(session_id).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde::{Deserialize, Serialize}; + + #[derive(Debug, Clone, Serialize, Deserialize)] + pub struct ToolCall { + pub id: String, + #[serde(rename = "type")] + pub r#type: String, + pub function: ToolFunction, + } + + #[derive(Debug, Clone, Serialize, Deserialize)] + pub struct ToolFunction { + pub name: String, + pub arguments: String, + } + + #[derive(Debug, Clone, Serialize, Deserialize)] + struct ChatMessage { + role: String, + content: Option, + #[serde(skip_serializing_if = "Option::is_none")] + tool_calls: Option>, + } + + #[derive(Debug, Clone, Serialize, Deserialize)] + struct ChatCompletionResponse { + id: String, + object: String, + created: i64, + model: String, + choices: Vec, + usage: Usage, + } + + #[derive(Debug, Clone, Serialize, Deserialize)] + struct ChatChoice { + index: i32, + message: ChatMessage, + finish_reason: String, + } + + #[derive(Debug, Clone, Serialize, Deserialize)] + struct Usage { + #[serde(rename = "prompt_tokens")] + prompt: i32, + #[serde(rename = "completion_tokens")] + completion: i32, + #[serde(rename = "total_tokens")] + total: i32, + } + + #[derive(Debug, Clone, Serialize, Deserialize)] + struct ErrorResponse { + error: ErrorDetail, + } + + #[derive(Debug, Clone, Serialize, Deserialize)] + struct ErrorDetail { + message: String, + #[serde(rename = "type")] + r#type: String, + code: String, + } + + #[test] + fn test_tool_call_serialization() { + let tool_call = ToolCall { + id: "call_123".to_string(), + r#type: "function".to_string(), + function: ToolFunction { + name: "get_weather".to_string(), + 
arguments: r#"{"location": "NYC"}"#.to_string(), + }, + }; + + let json = serde_json::to_string(&tool_call).unwrap(); + assert!(json.contains("get_weather")); + assert!(json.contains("call_123")); + } + + #[test] + fn test_chat_completion_response_serialization() { + let response = ChatCompletionResponse { + id: "test-id".to_string(), + object: "chat.completion".to_string(), + created: 1_234_567_890, + model: "gpt-4".to_string(), + choices: vec![ChatChoice { + index: 0, + message: ChatMessage { + role: "assistant".to_string(), + content: Some("Hello!".to_string()), + tool_calls: None, + }, + finish_reason: "stop".to_string(), + }], + usage: Usage { + prompt: 10, + completion: 5, + total: 15, + }, + }; + + let json = serde_json::to_string(&response).unwrap(); + assert!(json.contains("chat.completion")); + assert!(json.contains("Hello!")); + assert!(json.contains("gpt-4")); + } + + #[test] + fn test_error_response_serialization() { + let error = ErrorResponse { + error: ErrorDetail { + message: "Test error".to_string(), + r#type: "test_error".to_string(), + code: "test_code".to_string(), + }, + }; + + let json = serde_json::to_string(&error).unwrap(); + assert!(json.contains("Test error")); + assert!(json.contains("test_code")); + } + + #[test] + fn test_build_messages_empty() { + let messages = OpenAIClient::build_messages("", "", &[]); + assert!(messages.is_array()); + assert!(messages.as_array().unwrap().is_empty()); + } + + #[test] + fn test_build_messages_with_system_prompt() { + let messages = OpenAIClient::build_messages("You are a helpful assistant.", "", &[]); + let arr = messages.as_array().unwrap(); + assert_eq!(arr.len(), 1); + assert_eq!(arr[0]["role"], "system"); + assert_eq!(arr[0]["content"], "You are a helpful assistant."); + } + + #[test] + fn test_build_messages_with_context() { + let messages = OpenAIClient::build_messages("System prompt", "Context data", &[]); + let arr = messages.as_array().unwrap(); + assert_eq!(arr.len(), 2); + 
assert_eq!(arr[0]["content"], "System prompt"); + assert_eq!(arr[1]["content"], "Context data"); + } + + #[test] + fn test_build_messages_with_history() { + let history = vec![ + ("user".to_string(), "Hello".to_string()), + ("assistant".to_string(), "Hi there!".to_string()), + ]; + let messages = OpenAIClient::build_messages("", "", &history); + let arr = messages.as_array().unwrap(); + assert_eq!(arr.len(), 2); + assert_eq!(arr[0]["role"], "user"); + assert_eq!(arr[0]["content"], "Hello"); + assert_eq!(arr[1]["role"], "assistant"); + assert_eq!(arr[1]["content"], "Hi there!"); + } + + #[test] + fn test_build_messages_full() { + let history = vec![("user".to_string(), "What is the weather?".to_string())]; + let messages = OpenAIClient::build_messages( + "You are a weather bot.", + "Current location: NYC", + &history, + ); + let arr = messages.as_array().unwrap(); + assert_eq!(arr.len(), 3); + assert_eq!(arr[0]["role"], "system"); + assert_eq!(arr[1]["role"], "system"); + assert_eq!(arr[2]["role"], "user"); + } + + #[test] + fn test_openai_client_new_default_url() { + let client = OpenAIClient::new("test_key".to_string(), None, None); + assert_eq!(client.base_url, "https://api.openai.com"); + } + + #[test] + fn test_openai_client_new_custom_url() { + let client = OpenAIClient::new( + "test_key".to_string(), + Some("http://localhost:8080".to_string()), + None, + ); + assert_eq!(client.base_url, "http://localhost:8080"); + } + + #[test] + fn test_chat_message_with_tool_calls() { + let message = ChatMessage { + role: "assistant".to_string(), + content: None, + tool_calls: Some(vec![ToolCall { + id: "call_1".to_string(), + r#type: "function".to_string(), + function: ToolFunction { + name: "search".to_string(), + arguments: r#"{"query": "test"}"#.to_string(), + }, + }]), + }; + + let json = serde_json::to_string(&message).unwrap(); + assert!(json.contains("tool_calls")); + assert!(json.contains("search")); + } + + #[test] + fn test_usage_calculation() { + let usage = 
Usage { + prompt: 100, + completion: 50, + total: 150, + }; + assert_eq!(usage.prompt + usage.completion, usage.total); + } + + #[test] + fn test_chat_choice_finish_reasons() { + let stop_choice = ChatChoice { + index: 0, + message: ChatMessage { + role: "assistant".to_string(), + content: Some("Done".to_string()), + tool_calls: None, + }, + finish_reason: "stop".to_string(), + }; + assert_eq!(stop_choice.finish_reason, "stop"); + + let tool_choice = ChatChoice { + index: 0, + message: ChatMessage { + role: "assistant".to_string(), + content: None, + tool_calls: Some(vec![]), + }, + finish_reason: "tool_calls".to_string(), + }; + assert_eq!(tool_choice.finish_reason, "tool_calls"); + } +} diff --git a/src/llm/smart_router.rs b/src/llm/smart_router.rs index acc706376..ddafbb819 100644 --- a/src/llm/smart_router.rs +++ b/src/llm/smart_router.rs @@ -35,14 +35,14 @@ impl OptimizationGoal { pub struct SmartLLMRouter { performance_cache: Arc>>, - app_state: Arc, + _app_state: Arc, } impl SmartLLMRouter { pub fn new(app_state: Arc) -> Self { Self { performance_cache: Arc::new(tokio::sync::RwLock::new(HashMap::new())), - app_state, + _app_state: app_state, } } diff --git a/src/main.rs b/src/main.rs index 7b1b5a7dd..5117c8db4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -266,6 +266,48 @@ async fn health_check_simple() -> (StatusCode, Json) { ) } +#[derive(serde::Deserialize)] +struct ClientErrorsRequest { + errors: Vec, +} + +#[derive(serde::Deserialize)] +struct ClientErrorData { + #[serde(default)] + r#type: String, + #[serde(default)] + message: String, + #[serde(default)] + stack: Option, + #[serde(default)] + url: String, + #[serde(default)] + timestamp: String, +} + +async fn receive_client_errors( + Json(payload): Json, +) -> (StatusCode, Json) { + for error in &payload.errors { + log::error!( + "[CLIENT ERROR] {} | {} | {} | URL: {} | Stack: {}", + error.timestamp, + error.r#type, + error.message, + error.url, + error.stack.as_deref().unwrap_or("") + ); + } + + ( + 
StatusCode::OK, + Json(serde_json::json!({ + "status": "received", + "count": payload.errors.len() + })), + ) +} + fn print_shutdown_message() { println!(); println!("Thank you for using General Bots!"); @@ -344,9 +386,12 @@ async fn run_axum_server( .add_anonymous_path("/api/product") .add_anonymous_path("/api/manifest") .add_anonymous_path("/api/i18n") + .add_anonymous_path("/api/auth") .add_anonymous_path("/api/auth/login") .add_anonymous_path("/api/auth/refresh") .add_anonymous_path("/api/auth/bootstrap") + .add_anonymous_path("/api/bot/config") + .add_anonymous_path("/api/client-errors") .add_anonymous_path("/ws") .add_anonymous_path("/auth") .add_public_path("/static") @@ -439,8 +484,11 @@ async fn run_axum_server( let mut api_router = Router::new() .route("/health", get(health_check_simple)) .route(ApiUrls::HEALTH, get(health_check)) + .route("/api/config/reload", post(crate::core::config_reload::reload_config)) .route("/api/product", get(get_product_config)) .route("/api/manifest", get(get_workspace_manifest)) + .route("/api/client-errors", post(receive_client_errors)) + .route("/api/bot/config", get(crate::core::bot::get_bot_config)) .route(ApiUrls::SESSIONS, post(create_session)) .route(ApiUrls::SESSIONS, get(get_sessions)) .route(ApiUrls::SESSION_HISTORY, get(get_session_history)) @@ -1351,6 +1399,14 @@ async fn main() -> std::io::Result<()> { .get_config(&default_bot_id, "llm-key", Some("")) .unwrap_or_default(); + let llm_endpoint_path = config_manager + .get_config( + &default_bot_id, + "llm-endpoint-path", + Some("/v1/chat/completions"), + ) + .unwrap_or_else(|_| "/v1/chat/completions".to_string()); + #[cfg(feature = "llm")] let base_llm_provider = crate::llm::create_llm_provider_from_url( &llm_url, @@ -1359,11 +1415,29 @@ async fn main() -> std::io::Result<()> { } else { Some(llm_model.clone()) }, + Some(llm_endpoint_path.clone()), ); #[cfg(feature = "llm")] let dynamic_llm_provider = Arc::new(crate::llm::DynamicLLMProvider::new(base_llm_provider)); 
+ #[cfg(feature = "llm")] + { + // Ensure the DynamicLLMProvider is initialized with the correct config from database + // This makes the system robust: even if the URL was set before server startup, + // the provider will use the correct configuration + info!("Initializing DynamicLLMProvider with config: URL={}, Model={}, Endpoint={}", + llm_url, + if llm_model.is_empty() { "(default)" } else { &llm_model }, + llm_endpoint_path.clone()); + dynamic_llm_provider.update_from_config( + &llm_url, + if llm_model.is_empty() { None } else { Some(llm_model.clone()) }, + Some(llm_endpoint_path), + ).await; + info!("DynamicLLMProvider initialized successfully"); + } + #[cfg(feature = "llm")] let llm_provider: Arc = if let Some(ref cache) = redis_client { let embedding_url = config_manager @@ -1445,9 +1519,7 @@ async fn main() -> std::io::Result<()> { let app_state = Arc::new(AppState { #[cfg(feature = "drive")] - drive: Some(drive.clone()), - #[cfg(feature = "drive")] - s3_client: Some(drive), + drive: Some(drive), config: Some(cfg.clone()), conn: pool.clone(), database_url: database_url.clone(), @@ -1461,6 +1533,8 @@ async fn main() -> std::io::Result<()> { task_scheduler, #[cfg(feature = "llm")] llm_provider: llm_provider.clone(), + #[cfg(feature = "llm")] + dynamic_llm_provider: Some(dynamic_llm_provider.clone()), #[cfg(feature = "directory")] auth_service: auth_service.clone(), channels: Arc::new(tokio::sync::Mutex::new({ @@ -1541,22 +1615,70 @@ async fn main() -> std::io::Result<()> { error!("Failed to mount bots: {}", e); } + #[cfg(feature = "llm")] + { + let app_state_for_llm = app_state.clone(); + trace!("ensure_llama_servers_running starting..."); + if let Err(e) = ensure_llama_servers_running(app_state_for_llm).await { + error!("Failed to start LLM servers: {}", e); + } + trace!("ensure_llama_servers_running completed"); + } + #[cfg(feature = "drive")] { let drive_monitor_state = app_state.clone(); - let bucket_name = "default.gbai".to_string(); - let 
monitor_bot_id = default_bot_id; + let pool_clone = pool.clone(); + tokio::spawn(async move { register_thread("drive-monitor", "drive"); - trace!("DriveMonitor::new starting..."); - let monitor = - crate::DriveMonitor::new(drive_monitor_state, bucket_name.clone(), monitor_bot_id); - trace!("DriveMonitor::new done, calling start_monitoring..."); - info!("Starting DriveMonitor for bucket: {}", bucket_name); - if let Err(e) = monitor.start_monitoring().await { - error!("DriveMonitor failed: {}", e); + + let bots_to_monitor = tokio::task::spawn_blocking(move || { + use uuid::Uuid; + let mut conn = match pool_clone.get() { + Ok(conn) => conn, + Err(_) => return Vec::new(), + }; + use crate::shared::models::schema::bots::dsl::*; + use diesel::prelude::*; + bots.filter(is_active.eq(true)) + .select((id, name)) + .load::<(Uuid, String)>(&mut conn) + .unwrap_or_default() + }) + .await + .unwrap_or_default(); + + info!("Found {} active bots to monitor", bots_to_monitor.len()); + + for (bot_id, bot_name) in bots_to_monitor { + let bucket_name = format!("{}.gbai", bot_name); + let monitor_state = drive_monitor_state.clone(); + let bot_id_clone = bot_id; + let bucket_name_clone = bucket_name.clone(); + + tokio::spawn(async move { + register_thread(&format!("drive-monitor-{}", bot_name), "drive"); + trace!("DriveMonitor::new starting for bot: {}", bot_name); + let monitor = + crate::DriveMonitor::new(monitor_state, bucket_name_clone, bot_id_clone); + trace!( + "DriveMonitor::new done for bot: {}, calling start_monitoring...", + bot_name + ); + info!( + "Starting DriveMonitor for bot: {} (bucket: {})", + bot_name, bucket_name + ); + if let Err(e) = monitor.start_monitoring().await { + error!("DriveMonitor failed for bot {}: {}", bot_name, e); + } + trace!( + "DriveMonitor start_monitoring returned for bot: {}", + bot_name + ); + }); } - trace!("DriveMonitor start_monitoring returned"); }); } @@ -1580,19 +1702,6 @@ async fn main() -> std::io::Result<()> { }); } - #[cfg(feature = 
"llm")] - { - let app_state_for_llm = app_state.clone(); - tokio::spawn(async move { - register_thread("llm-server-init", "llm"); - trace!("ensure_llama_servers_running starting..."); - if let Err(e) = ensure_llama_servers_running(app_state_for_llm).await { - error!("Failed to start LLM servers: {}", e); - } - trace!("ensure_llama_servers_running completed"); - record_thread_activity("llm-server-init"); - }); - } trace!("Initial data setup task spawned"); trace!("All system threads started, starting HTTP server..."); diff --git a/src/security/auth.rs b/src/security/auth.rs index de673c1d6..2394e9457 100644 --- a/src/security/auth.rs +++ b/src/security/auth.rs @@ -36,8 +36,7 @@ pub enum Permission { ManageIntegrations, } -#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(Default)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] pub enum Role { #[default] Anonymous, @@ -189,7 +188,6 @@ impl Role { } } - #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct BotAccess { pub bot_id: Uuid, @@ -738,9 +736,7 @@ pub fn extract_user_from_request( } } - if let Some(session_id) = - extract_session_from_cookies(request, &config.session_cookie_name) - { + if let Some(session_id) = extract_session_from_cookies(request, &config.session_cookie_name) { let mut user = validate_session_sync(&session_id)?; if let Some(bot_id) = extract_bot_id_from_request(request, config) { @@ -784,8 +780,7 @@ fn extract_session_from_cookies(request: &Request, cookie_name: &str) -> O .and_then(|cookies| { cookies.split(';').find_map(|cookie| { let (name, value) = cookie.trim().split_once('=')?; - - + if name == cookie_name { Some(value.to_string()) } else { @@ -831,45 +826,49 @@ fn validate_session_sync(session_id: &str) -> Result Role::Admin, - "superadmin" | "super_admin" => Role::SuperAdmin, - "moderator" => Role::Moderator, - "bot_owner" => Role::BotOwner, - "bot_operator" => Role::BotOperator, - "bot_viewer" => 
Role::BotViewer, - "service" => Role::Service, - _ => Role::User, - }; - user = user.with_role(role); - } + // Add roles from cached user data + for role_str in &user_data.roles { + let role = match role_str.to_lowercase().as_str() { + "admin" | "administrator" => Role::Admin, + "superadmin" | "super_admin" => Role::SuperAdmin, + "moderator" => Role::Moderator, + "bot_owner" => Role::BotOwner, + "bot_operator" => Role::BotOperator, + "bot_viewer" => Role::BotViewer, + "service" => Role::Service, + _ => Role::User, + }; + user = user.with_role(role); + } - // If no roles were added, default to User role - if user_data.roles.is_empty() { - user = user.with_role(Role::User); - } + // If no roles were added, default to User role + if user_data.roles.is_empty() { + user = user.with_role(Role::User); + } - debug!("Session validated from cache, user has {} roles", user_data.roles.len()); + debug!( + "Session validated from cache, user has {} roles", + user_data.roles.len() + ); return Ok(user); } } @@ -910,14 +909,13 @@ pub async fn auth_middleware_with_providers( next: Next, state: AuthMiddlewareState, ) -> Response { - let path = request.uri().path().to_string(); let method = request.method().to_string(); - info!("[AUTH] Processing {} {}", method, path); + info!("Processing {} {}", method, path); if state.config.is_public_path(&path) || state.config.is_anonymous_allowed(&path) { - info!("[AUTH] Path is public/anonymous, skipping auth"); + info!("Path is public/anonymous, skipping auth"); request .extensions_mut() .insert(AuthenticatedUser::anonymous()); @@ -930,28 +928,39 @@ pub async fn auth_middleware_with_providers( .and_then(|v| v.to_str().ok()) .map(|s| s.to_string()); - info!("[AUTH] Authorization header: {:?}", auth_header.as_ref().map(|h| { - if h.len() > 30 { format!("{}...", &h[..30]) } else { h.clone() } - })); + info!( + "Authorization header: {:?}", + auth_header.as_ref().map(|h| { + if h.len() > 30 { + format!("{}...", &h[..30]) + } else { + h.clone() + } 
+ }) + ); let extracted = ExtractedAuthData::from_request(&request, &state.config); - let user = authenticate_with_extracted_data(extracted, &state.config, &state.provider_registry).await; + let user = + authenticate_with_extracted_data(extracted, &state.config, &state.provider_registry).await; match user { Ok(authenticated_user) => { - info!("[AUTH] Success: user={} roles={:?}", authenticated_user.username, authenticated_user.roles); + info!( + "Success: user={} roles={:?}", + authenticated_user.username, authenticated_user.roles + ); request.extensions_mut().insert(authenticated_user); next.run(request).await } Err(e) => { if !state.config.require_auth { - warn!("[AUTH] Failed but not required, allowing anonymous: {:?}", e); + warn!("Failed but not required, allowing anonymous: {:?}", e); request .extensions_mut() .insert(AuthenticatedUser::anonymous()); return next.run(request).await; } - info!("[AUTH] Failed: {:?}", e); + info!("Failed: {:?}", e); e.into_response() } } @@ -980,9 +989,15 @@ impl ExtractedAuthData { .and_then(|v| v.to_str().ok()); if let Some(auth) = raw_auth { - debug!("Raw Authorization header: {}", &auth[..std::cmp::min(50, auth.len())]); + debug!( + "Raw Authorization header: {}", + &auth[..std::cmp::min(50, auth.len())] + ); } else { - warn!("No Authorization header found in request to {}", request.uri().path()); + warn!( + "No Authorization header found in request to {}", + request.uri().path() + ); } let bearer_token = raw_auth @@ -1043,7 +1058,10 @@ async fn authenticate_with_extracted_data( return Ok(user); } Err(e) => { - debug!("JWT authentication failed: {:?}, falling back to session validation", e); + debug!( + "JWT authentication failed: {:?}, falling back to session validation", + e + ); } } } else { @@ -1363,8 +1381,6 @@ mod tests { assert!(Role::SuperAdmin.has_permission(&Permission::ManageSecrets)); } - - #[test] fn test_role_hierarchy() { assert!(Role::SuperAdmin.is_at_least(&Role::Admin)); @@ -1388,8 +1404,8 @@ mod tests { 
#[test] fn test_user_permissions() { - let admin = AuthenticatedUser::new(Uuid::new_v4(), "admin".to_string()) - .with_role(Role::Admin); + let admin = + AuthenticatedUser::new(Uuid::new_v4(), "admin".to_string()).with_role(Role::Admin); assert!(admin.has_permission(&Permission::ManageUsers)); assert!(admin.has_permission(&Permission::Delete)); @@ -1440,10 +1456,22 @@ mod tests { #[test] fn test_auth_error_responses() { - assert_eq!(AuthError::MissingToken.status_code(), StatusCode::UNAUTHORIZED); - assert_eq!(AuthError::InsufficientPermissions.status_code(), StatusCode::FORBIDDEN); - assert_eq!(AuthError::RateLimited.status_code(), StatusCode::TOO_MANY_REQUESTS); - assert_eq!(AuthError::BotAccessDenied.status_code(), StatusCode::FORBIDDEN); + assert_eq!( + AuthError::MissingToken.status_code(), + StatusCode::UNAUTHORIZED + ); + assert_eq!( + AuthError::InsufficientPermissions.status_code(), + StatusCode::FORBIDDEN + ); + assert_eq!( + AuthError::RateLimited.status_code(), + StatusCode::TOO_MANY_REQUESTS + ); + assert_eq!( + AuthError::BotAccessDenied.status_code(), + StatusCode::FORBIDDEN + ); } #[test] @@ -1460,8 +1488,8 @@ mod tests { assert!(!user.can_manage_bot(&bot_id)); assert!(!user.can_access_bot(&other_bot_id)); - let admin = AuthenticatedUser::new(Uuid::new_v4(), "admin".to_string()) - .with_role(Role::Admin); + let admin = + AuthenticatedUser::new(Uuid::new_v4(), "admin".to_string()).with_role(Role::Admin); assert!(admin.can_access_bot(&bot_id)); assert!(admin.can_access_bot(&other_bot_id)); @@ -1544,8 +1572,8 @@ mod tests { let org_id = Uuid::new_v4(); let other_org_id = Uuid::new_v4(); - let user = AuthenticatedUser::new(Uuid::new_v4(), "user".to_string()) - .with_organization(org_id); + let user = + AuthenticatedUser::new(Uuid::new_v4(), "user".to_string()).with_organization(org_id); assert!(user.can_access_organization(&org_id)); assert!(!user.can_access_organization(&other_org_id)); @@ -1561,10 +1589,14 @@ mod tests { #[test] fn 
test_has_all_permissions() { - let admin = AuthenticatedUser::new(Uuid::new_v4(), "admin".to_string()) - .with_role(Role::Admin); + let admin = + AuthenticatedUser::new(Uuid::new_v4(), "admin".to_string()).with_role(Role::Admin); - assert!(admin.has_all_permissions(&[Permission::Read, Permission::Write, Permission::Delete])); + assert!(admin.has_all_permissions(&[ + Permission::Read, + Permission::Write, + Permission::Delete + ])); assert!(!admin.has_all_permissions(&[Permission::ManageSecrets])); } diff --git a/src/security/protection/installer.rs b/src/security/protection/installer.rs index d4bb72bb3..1116c350a 100644 --- a/src/security/protection/installer.rs +++ b/src/security/protection/installer.rs @@ -604,6 +604,7 @@ impl Default for ProtectionInstaller { } } +#[derive(Default)] pub struct InstallResult { pub success: bool, pub packages_installed: Vec, @@ -613,18 +614,6 @@ pub struct InstallResult { pub warnings: Vec, } -impl Default for InstallResult { - fn default() -> Self { - Self { - success: false, - packages_installed: Vec::new(), - sudoers_created: false, - databases_updated: false, - errors: Vec::new(), - warnings: Vec::new(), - } - } -} impl InstallResult { pub fn print(&self) { @@ -677,6 +666,7 @@ impl InstallResult { } } +#[derive(Default)] pub struct UninstallResult { pub success: bool, pub sudoers_removed: bool, @@ -684,16 +674,6 @@ pub struct UninstallResult { pub errors: Vec, } -impl Default for UninstallResult { - fn default() -> Self { - Self { - success: false, - sudoers_removed: false, - message: String::new(), - errors: Vec::new(), - } - } -} impl UninstallResult { pub fn print(&self) { @@ -729,6 +709,7 @@ impl UninstallResult { } } +#[derive(Default)] pub struct VerifyResult { pub all_installed: bool, pub all_configured: bool, @@ -736,16 +717,6 @@ pub struct VerifyResult { pub tools: Vec, } -impl Default for VerifyResult { - fn default() -> Self { - Self { - all_installed: false, - all_configured: false, - sudoers_exists: false, - 
tools: Vec::new(), - } - } -} pub struct ToolVerification { pub name: String, diff --git a/src/security/rbac_middleware.rs b/src/security/rbac_middleware.rs index 2ce7f8cbf..56e762961 100644 --- a/src/security/rbac_middleware.rs +++ b/src/security/rbac_middleware.rs @@ -946,12 +946,16 @@ pub fn build_default_route_permissions() -> Vec { RoutePermission::new("/api/health", "GET", "").with_anonymous(true), RoutePermission::new("/api/version", "GET", "").with_anonymous(true), RoutePermission::new("/api/product", "GET", "").with_anonymous(true), + RoutePermission::new("/api/bot/config", "GET", "").with_anonymous(true), RoutePermission::new("/api/i18n/**", "GET", "").with_anonymous(true), // Auth routes - login must be anonymous RoutePermission::new("/api/auth", "GET", "").with_anonymous(true), RoutePermission::new("/api/auth/login", "POST", "").with_anonymous(true), + + // Client error reporting - anonymous to catch all JS errors + RoutePermission::new("/api/client-errors", "POST", "").with_anonymous(true), RoutePermission::new("/api/auth/bootstrap", "POST", "").with_anonymous(true), RoutePermission::new("/api/auth/refresh", "POST", "").with_anonymous(true), RoutePermission::new("/api/auth/logout", "POST", ""), diff --git a/src/tasks/scheduler.rs b/src/tasks/scheduler.rs index 8e12b4974..d2b1a1054 100644 --- a/src/tasks/scheduler.rs +++ b/src/tasks/scheduler.rs @@ -151,7 +151,7 @@ impl TaskScheduler { let _ = cmd.execute(); } - if let Some(s3) = state.s3_client.as_ref() { + if let Some(s3) = state.drive.as_ref() { let body = tokio::fs::read(&backup_file).await?; s3.put_object() .bucket("backups") @@ -239,7 +239,7 @@ impl TaskScheduler { health["cache"] = serde_json::json!(cache_ok); } - if let Some(s3) = &state.s3_client { + if let Some(s3) = &state.drive { let s3_ok = s3.list_buckets().send().await.is_ok(); health["storage"] = serde_json::json!(s3_ok); } diff --git a/src/vector-db/embedding.rs b/src/vector-db/embedding.rs index 93f886fcd..bb93734e3 100644 --- 
a/src/vector-db/embedding.rs +++ b/src/vector-db/embedding.rs @@ -71,9 +71,12 @@ impl EmbeddingGenerator { async fn generate_local_embedding(&self, text: &str, embedding_url: &str) -> Result> { use serde_json::json; + // Truncate text to fit within token limit (600 tokens for safety under 768 limit) + let truncated_text = crate::core::shared::utils::truncate_text_for_model(text, "sentence-transformers/all-MiniLM-L6-v2", 600); + let client = reqwest::Client::new(); let body = json!({ - "text": text, + "text": truncated_text, "model": "sentence-transformers/all-MiniLM-L6-v2" });