botserver/src/basic/keywords/knowledge_graph.rs

750 lines
23 KiB
Rust

use chrono::{DateTime, Utc};
use rhai::{Array, Dynamic, Engine, Map};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tracing::info;
use uuid::Uuid;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KgEntity {
pub id: Uuid,
pub bot_id: Uuid,
pub entity_type: String,
pub entity_name: String,
pub aliases: Vec<String>,
pub properties: serde_json::Value,
pub confidence: f64,
pub source: EntitySource,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum EntitySource {
Manual,
Extracted,
Imported,
Inferred,
}
impl Default for EntitySource {
fn default() -> Self {
EntitySource::Manual
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KgRelationship {
pub id: Uuid,
pub bot_id: Uuid,
pub from_entity_id: Uuid,
pub to_entity_id: Uuid,
pub relationship_type: String,
pub properties: serde_json::Value,
pub confidence: f64,
pub bidirectional: bool,
pub source: EntitySource,
pub created_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedEntity {
pub name: String,
pub canonical_name: String,
pub entity_type: String,
pub start_pos: usize,
pub end_pos: usize,
pub confidence: f64,
pub properties: serde_json::Value,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedRelationship {
pub from_entity: String,
pub to_entity: String,
pub relationship_type: String,
pub confidence: f64,
pub evidence: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionResult {
pub entities: Vec<ExtractedEntity>,
pub relationships: Vec<ExtractedRelationship>,
pub metadata: ExtractionMetadata,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionMetadata {
pub model: String,
pub processing_time_ms: u64,
pub tokens_processed: usize,
pub text_length: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GraphQueryResult {
pub entities: Vec<KgEntity>,
pub relationships: Vec<KgRelationship>,
pub explanation: String,
pub confidence: f64,
}
#[derive(Debug, Clone)]
pub struct KnowledgeGraphConfig {
pub enabled: bool,
pub backend: String,
pub extract_entities: bool,
pub extraction_model: String,
pub max_entities: usize,
pub max_relationships: usize,
pub min_confidence: f64,
pub entity_types: Vec<String>,
}
impl Default for KnowledgeGraphConfig {
fn default() -> Self {
KnowledgeGraphConfig {
enabled: true,
backend: "postgresql".to_string(),
extract_entities: true,
extraction_model: "quality".to_string(),
max_entities: 10000,
max_relationships: 50000,
min_confidence: 0.7,
entity_types: vec![
"person".to_string(),
"organization".to_string(),
"project".to_string(),
"product".to_string(),
"location".to_string(),
"event".to_string(),
"concept".to_string(),
],
}
}
}
#[derive(Debug)]
pub struct KnowledgeGraphManager {
config: KnowledgeGraphConfig,
}
impl KnowledgeGraphManager {
pub fn new(config: KnowledgeGraphConfig) -> Self {
KnowledgeGraphManager { config }
}
pub fn from_config(config_map: &HashMap<String, String>) -> Self {
let config = KnowledgeGraphConfig {
enabled: config_map
.get("knowledge-graph-enabled")
.map(|v| v == "true")
.unwrap_or(true),
backend: config_map
.get("knowledge-graph-backend")
.cloned()
.unwrap_or_else(|| "postgresql".to_string()),
extract_entities: config_map
.get("knowledge-graph-extract-entities")
.map(|v| v == "true")
.unwrap_or(true),
extraction_model: config_map
.get("knowledge-graph-extraction-model")
.cloned()
.unwrap_or_else(|| "quality".to_string()),
max_entities: config_map
.get("knowledge-graph-max-entities")
.and_then(|v| v.parse().ok())
.unwrap_or(10000),
max_relationships: config_map
.get("knowledge-graph-max-relationships")
.and_then(|v| v.parse().ok())
.unwrap_or(50000),
min_confidence: config_map
.get("knowledge-graph-min-confidence")
.and_then(|v| v.parse().ok())
.unwrap_or(0.7),
entity_types: config_map
.get("knowledge-graph-entity-types")
.map(|v| v.split(',').map(|s| s.trim().to_string()).collect())
.unwrap_or_else(|| KnowledgeGraphConfig::default().entity_types),
};
KnowledgeGraphManager::new(config)
}
pub fn generate_extraction_prompt(&self, text: &str) -> String {
let entity_types = self.config.entity_types.join(", ");
format!(
r#"Extract entities and relationships from the following text.
ENTITY TYPES TO EXTRACT: {entity_types}
TEXT:
{text}
Respond with valid JSON only:
{{
"entities": [
{{
"name": "exact name as in text",
"canonical_name": "normalized name",
"entity_type": "person|organization|project|product|location|event|concept",
"confidence": 0.95,
"properties": {{"key": "value"}}
}}
],
"relationships": [
{{
"from_entity": "entity name",
"to_entity": "entity name",
"relationship_type": "works_on|reports_to|owns|part_of|located_in|related_to",
"confidence": 0.9,
"evidence": "text snippet supporting this relationship"
}}
]
}}"#
)
}
pub fn generate_query_prompt(&self, query: &str, context: &str) -> String {
format!(
r"Answer this question using the knowledge graph context.
QUESTION: {query}
KNOWLEDGE GRAPH CONTEXT:
{context}
Provide a natural language answer based on the entities and relationships.
If the information is not available, say so clearly.
"
)
}
pub fn parse_extraction_response(
&self,
response: &str,
text_length: usize,
processing_time_ms: u64,
) -> Result<ExtractionResult, String> {
let json_str = extract_json(response)?;
let parsed: serde_json::Value =
serde_json::from_str(&json_str).map_err(|e| format!("Failed to parse JSON: {}", e))?;
let entities: Vec<ExtractedEntity> = parsed["entities"]
.as_array()
.map(|arr| {
arr.iter()
.filter_map(|v| {
Some(ExtractedEntity {
name: v["name"].as_str()?.to_string(),
canonical_name: v["canonical_name"]
.as_str()
.unwrap_or(v["name"].as_str()?)
.to_string(),
entity_type: v["entity_type"].as_str()?.to_string(),
start_pos: 0,
end_pos: 0,
confidence: v["confidence"].as_f64().unwrap_or(0.8),
properties: v["properties"].clone(),
})
})
.filter(|e| e.confidence >= self.config.min_confidence)
.collect()
})
.unwrap_or_default();
let relationships: Vec<ExtractedRelationship> = parsed["relationships"]
.as_array()
.map(|arr| {
arr.iter()
.filter_map(|v| {
Some(ExtractedRelationship {
from_entity: v["from_entity"].as_str()?.to_string(),
to_entity: v["to_entity"].as_str()?.to_string(),
relationship_type: v["relationship_type"].as_str()?.to_string(),
confidence: v["confidence"].as_f64().unwrap_or(0.8),
evidence: v["evidence"].as_str().unwrap_or("").to_string(),
})
})
.filter(|r| r.confidence >= self.config.min_confidence)
.collect()
})
.unwrap_or_default();
Ok(ExtractionResult {
entities,
relationships,
metadata: ExtractionMetadata {
model: self.config.extraction_model.clone(),
processing_time_ms,
tokens_processed: text_length / 4,
text_length,
},
})
}
pub fn should_extract(&self) -> bool {
self.config.enabled && self.config.extract_entities
}
pub fn is_valid_entity_type(&self, entity_type: &str) -> bool {
self.config
.entity_types
.iter()
.any(|t| t.eq_ignore_ascii_case(entity_type))
}
}
fn extract_json(response: &str) -> Result<String, String> {
if let Some(start) = response.find("```json") {
if let Some(end) = response[start + 7..].find("```") {
return Ok(response[start + 7..start + 7 + end].trim().to_string());
}
}
if let Some(start) = response.find("```") {
let after_start = start + 3;
let json_start = response[after_start..]
.find('\n')
.map(|i| after_start + i + 1)
.unwrap_or(after_start);
if let Some(end) = response[json_start..].find("```") {
return Ok(response[json_start..json_start + end].trim().to_string());
}
}
if let Some(start) = response.find('{') {
if let Some(end) = response.rfind('}') {
if end > start {
return Ok(response[start..=end].to_string());
}
}
}
Err("No JSON found in response".to_string())
}
impl KgEntity {
pub fn to_dynamic(&self) -> Dynamic {
let mut map = Map::new();
map.insert("id".into(), self.id.to_string().into());
map.insert("bot_id".into(), self.bot_id.to_string().into());
map.insert("entity_type".into(), self.entity_type.clone().into());
map.insert("entity_name".into(), self.entity_name.clone().into());
let aliases: Array = self
.aliases
.iter()
.map(|a| Dynamic::from(a.clone()))
.collect();
map.insert("aliases".into(), aliases.into());
map.insert("properties".into(), json_to_dynamic(&self.properties));
map.insert("confidence".into(), self.confidence.into());
map.insert(
"source".into(),
format!("{:?}", self.source).to_lowercase().into(),
);
map.insert("created_at".into(), self.created_at.to_rfc3339().into());
map.insert("updated_at".into(), self.updated_at.to_rfc3339().into());
Dynamic::from(map)
}
}
impl KgRelationship {
pub fn to_dynamic(&self) -> Dynamic {
let mut map = Map::new();
map.insert("id".into(), self.id.to_string().into());
map.insert("bot_id".into(), self.bot_id.to_string().into());
map.insert(
"from_entity_id".into(),
self.from_entity_id.to_string().into(),
);
map.insert("to_entity_id".into(), self.to_entity_id.to_string().into());
map.insert(
"relationship_type".into(),
self.relationship_type.clone().into(),
);
map.insert("properties".into(), json_to_dynamic(&self.properties));
map.insert("confidence".into(), self.confidence.into());
map.insert("bidirectional".into(), self.bidirectional.into());
map.insert(
"source".into(),
format!("{:?}", self.source).to_lowercase().into(),
);
map.insert("created_at".into(), self.created_at.to_rfc3339().into());
Dynamic::from(map)
}
}
fn json_to_dynamic(value: &serde_json::Value) -> Dynamic {
match value {
serde_json::Value::Null => Dynamic::UNIT,
serde_json::Value::Bool(b) => Dynamic::from(*b),
serde_json::Value::Number(n) => {
if let Some(i) = n.as_i64() {
Dynamic::from(i)
} else if let Some(f) = n.as_f64() {
Dynamic::from(f)
} else {
Dynamic::UNIT
}
}
serde_json::Value::String(s) => Dynamic::from(s.clone()),
serde_json::Value::Array(arr) => {
let array: Array = arr.iter().map(json_to_dynamic).collect();
Dynamic::from(array)
}
serde_json::Value::Object(obj) => {
let mut map = Map::new();
for (k, v) in obj {
map.insert(k.clone().into(), json_to_dynamic(v));
}
Dynamic::from(map)
}
}
}
pub fn register_knowledge_graph_keywords(engine: &mut Engine) {
engine.register_fn("entity_name", |entity: Map| -> String {
entity
.get("entity_name")
.and_then(|v| v.clone().try_cast::<String>())
.unwrap_or_default()
});
engine.register_fn("entity_type", |entity: Map| -> String {
entity
.get("entity_type")
.and_then(|v| v.clone().try_cast::<String>())
.unwrap_or_default()
});
engine.register_fn("entity_properties", |entity: Map| -> Map {
entity
.get("properties")
.and_then(|v| v.clone().try_cast::<Map>())
.unwrap_or_default()
});
engine.register_fn("relationship_type", |rel: Map| -> String {
rel.get("relationship_type")
.and_then(|v| v.clone().try_cast::<String>())
.unwrap_or_default()
});
engine.register_fn("is_bidirectional", |rel: Map| -> bool {
rel.get("bidirectional")
.and_then(|v| v.clone().try_cast::<bool>())
.unwrap_or(false)
});
info!("Knowledge graph keywords registered");
}
pub const KNOWLEDGE_GRAPH_SCHEMA: &str = r"
-- Knowledge graph entities
CREATE TABLE IF NOT EXISTS kg_entities (
id UUID PRIMARY KEY,
bot_id UUID NOT NULL,
entity_type VARCHAR(100) NOT NULL,
entity_name VARCHAR(500) NOT NULL,
aliases JSONB NOT NULL DEFAULT '[]',
properties JSONB NOT NULL DEFAULT '{}',
confidence DOUBLE PRECISION NOT NULL DEFAULT 1.0,
source VARCHAR(50) NOT NULL DEFAULT 'manual',
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
UNIQUE(bot_id, entity_type, entity_name)
);
-- Knowledge graph relationships
CREATE TABLE IF NOT EXISTS kg_relationships (
id UUID PRIMARY KEY,
bot_id UUID NOT NULL,
from_entity_id UUID NOT NULL REFERENCES kg_entities(id) ON DELETE CASCADE,
to_entity_id UUID NOT NULL REFERENCES kg_entities(id) ON DELETE CASCADE,
relationship_type VARCHAR(100) NOT NULL,
properties JSONB NOT NULL DEFAULT '{}',
confidence DOUBLE PRECISION NOT NULL DEFAULT 1.0,
bidirectional BOOLEAN NOT NULL DEFAULT false,
source VARCHAR(50) NOT NULL DEFAULT 'manual',
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
UNIQUE(bot_id, from_entity_id, to_entity_id, relationship_type)
);
-- Indexes for efficient querying
CREATE INDEX IF NOT EXISTS idx_kg_entities_bot_id ON kg_entities(bot_id);
CREATE INDEX IF NOT EXISTS idx_kg_entities_type ON kg_entities(entity_type);
CREATE INDEX IF NOT EXISTS idx_kg_entities_name ON kg_entities(entity_name);
CREATE INDEX IF NOT EXISTS idx_kg_entities_name_lower ON kg_entities(LOWER(entity_name));
CREATE INDEX IF NOT EXISTS idx_kg_entities_aliases ON kg_entities USING GIN(aliases);
CREATE INDEX IF NOT EXISTS idx_kg_relationships_bot_id ON kg_relationships(bot_id);
CREATE INDEX IF NOT EXISTS idx_kg_relationships_from ON kg_relationships(from_entity_id);
CREATE INDEX IF NOT EXISTS idx_kg_relationships_to ON kg_relationships(to_entity_id);
CREATE INDEX IF NOT EXISTS idx_kg_relationships_type ON kg_relationships(relationship_type);
-- Full-text search on entity names
CREATE INDEX IF NOT EXISTS idx_kg_entities_name_fts ON kg_entities
USING GIN(to_tsvector('english', entity_name));
";
pub mod sql {
pub const INSERT_ENTITY: &str = r"
INSERT INTO kg_entities (
id, bot_id, entity_type, entity_name, aliases, properties,
confidence, source, created_at, updated_at
) VALUES (
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10
)
ON CONFLICT (bot_id, entity_type, entity_name)
DO UPDATE SET
aliases = kg_entities.aliases || $5,
properties = kg_entities.properties || $6,
confidence = GREATEST(kg_entities.confidence, $7),
updated_at = $10
RETURNING id
";
pub const INSERT_RELATIONSHIP: &str = r"
INSERT INTO kg_relationships (
id, bot_id, from_entity_id, to_entity_id, relationship_type,
properties, confidence, bidirectional, source, created_at
) VALUES (
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10
)
ON CONFLICT (bot_id, from_entity_id, to_entity_id, relationship_type)
DO UPDATE SET
properties = kg_relationships.properties || $6,
confidence = GREATEST(kg_relationships.confidence, $7)
RETURNING id
";
pub const GET_ENTITY_BY_NAME: &str = r"
SELECT * FROM kg_entities
WHERE bot_id = $1
AND (
LOWER(entity_name) = LOWER($2)
OR aliases @> $3::jsonb
)
LIMIT 1
";
pub const GET_ENTITY_BY_ID: &str = r"
SELECT * FROM kg_entities WHERE id = $1
";
pub const SEARCH_ENTITIES: &str = r"
SELECT * FROM kg_entities
WHERE bot_id = $1
AND (
to_tsvector('english', entity_name) @@ plainto_tsquery('english', $2)
OR LOWER(entity_name) LIKE LOWER($3)
)
ORDER BY confidence DESC
LIMIT $4
";
pub const GET_ENTITIES_BY_TYPE: &str = r"
SELECT * FROM kg_entities
WHERE bot_id = $1 AND entity_type = $2
ORDER BY entity_name
LIMIT $3
";
pub const GET_RELATED_ENTITIES: &str = r"
SELECT e.*, r.relationship_type, r.confidence as rel_confidence
FROM kg_entities e
JOIN kg_relationships r ON (
(r.from_entity_id = $1 AND r.to_entity_id = e.id)
OR (r.bidirectional AND r.to_entity_id = $1 AND r.from_entity_id = e.id)
)
WHERE r.bot_id = $2
ORDER BY r.confidence DESC
LIMIT $3
";
pub const GET_RELATED_BY_TYPE: &str = r"
SELECT e.*, r.relationship_type, r.confidence as rel_confidence
FROM kg_entities e
JOIN kg_relationships r ON (
(r.from_entity_id = $1 AND r.to_entity_id = e.id)
OR (r.bidirectional AND r.to_entity_id = $1 AND r.from_entity_id = e.id)
)
WHERE r.bot_id = $2 AND r.relationship_type = $3
ORDER BY r.confidence DESC
LIMIT $4
";
pub const GET_RELATIONSHIP: &str = r"
SELECT * FROM kg_relationships
WHERE bot_id = $1
AND from_entity_id = $2
AND to_entity_id = $3
AND relationship_type = $4
";
pub const GET_ALL_RELATIONSHIPS_FOR_ENTITY: &str = r"
SELECT r.*,
e1.entity_name as from_name, e1.entity_type as from_type,
e2.entity_name as to_name, e2.entity_type as to_type
FROM kg_relationships r
JOIN kg_entities e1 ON r.from_entity_id = e1.id
JOIN kg_entities e2 ON r.to_entity_id = e2.id
WHERE r.bot_id = $1
AND (r.from_entity_id = $2 OR r.to_entity_id = $2)
ORDER BY r.confidence DESC
";
pub const DELETE_ENTITY: &str = r"
DELETE FROM kg_entities WHERE id = $1 AND bot_id = $2
";
pub const DELETE_RELATIONSHIP: &str = r"
DELETE FROM kg_relationships WHERE id = $1 AND bot_id = $2
";
pub const COUNT_ENTITIES: &str = r"
SELECT COUNT(*) FROM kg_entities WHERE bot_id = $1
";
pub const COUNT_RELATIONSHIPS: &str = r"
SELECT COUNT(*) FROM kg_relationships WHERE bot_id = $1
";
pub const GET_ENTITY_TYPES: &str = r"
SELECT DISTINCT entity_type, COUNT(*) as count
FROM kg_entities
WHERE bot_id = $1
GROUP BY entity_type
ORDER BY count DESC
";
pub const GET_RELATIONSHIP_TYPES: &str = r"
SELECT DISTINCT relationship_type, COUNT(*) as count
FROM kg_relationships
WHERE bot_id = $1
GROUP BY relationship_type
ORDER BY count DESC
";
pub const FIND_PATH: &str = r"
WITH RECURSIVE path_finder AS (
-- Base case: start from source entity
SELECT
from_entity_id,
to_entity_id,
relationship_type,
ARRAY[from_entity_id] as path,
1 as depth
FROM kg_relationships
WHERE bot_id = $1 AND from_entity_id = $2
UNION ALL
-- Recursive case: follow relationships
SELECT
r.from_entity_id,
r.to_entity_id,
r.relationship_type,
pf.path || r.from_entity_id,
pf.depth + 1
FROM kg_relationships r
JOIN path_finder pf ON r.from_entity_id = pf.to_entity_id
WHERE r.bot_id = $1
AND NOT r.from_entity_id = ANY(pf.path) -- Prevent cycles
AND pf.depth < $4 -- Max depth
)
SELECT * FROM path_finder
WHERE to_entity_id = $3
ORDER BY depth
LIMIT 1
";
}
pub mod relationship_types {
pub const WORKS_ON: &str = "works_on";
pub const REPORTS_TO: &str = "reports_to";
pub const MANAGES: &str = "manages";
pub const OWNS: &str = "owns";
pub const PART_OF: &str = "part_of";
pub const LOCATED_IN: &str = "located_in";
pub const RELATED_TO: &str = "related_to";
pub const CREATED_BY: &str = "created_by";
pub const DEPENDS_ON: &str = "depends_on";
pub const CONNECTED_TO: &str = "connected_to";
pub const MEMBER_OF: &str = "member_of";
pub const SUCCESSOR_OF: &str = "successor_of";
pub const PREDECESSOR_OF: &str = "predecessor_of";
pub const ALIAS_OF: &str = "alias_of";
}
pub mod entity_types {
pub const PERSON: &str = "person";
pub const ORGANIZATION: &str = "organization";
pub const PROJECT: &str = "project";
pub const PRODUCT: &str = "product";
pub const LOCATION: &str = "location";
pub const EVENT: &str = "event";
pub const CONCEPT: &str = "concept";
pub const DOCUMENT: &str = "document";
pub const TEAM: &str = "team";
pub const ROLE: &str = "role";
pub const SKILL: &str = "skill";
pub const TECHNOLOGY: &str = "technology";
}