# generalbots/botmodels/src/services/anomaly_service.py

"""
Anomaly Detection Service - detection of deviations/anomalies in tabular data
Compatible with salary data, sensor readings, and other numerical time series
"""

import numpy as np
from typing import Optional
from dataclasses import dataclass
from ..core.logging import get_logger

# Module-level logger for this service, registered under the name "anomaly_service".
logger = get_logger("anomaly_service")


@dataclass
class AnomalyResult:
    """Outcome of scoring a single data point with one detection method."""

    # True when the point's score is beyond the method's cutoff.
    is_anomaly: bool
    # Method-specific score: an absolute (modified) z-score, or for the IQR
    # method the distance by which the value lies outside the accepted range.
    score: float
    # Identifier of the method that produced this result, e.g. "zscore" or "iqr".
    method: str
    # The cutoff (or, for the IQR method, the multiplier) the method was run with.
    threshold: float
    # Supporting numbers for the decision; always contains "index" and "value",
    # plus the statistics the method computed (mean/std, quartiles, median/mad).
    details: dict


class AnomalyDetectionService:
    def __init__(self) -> None:
        """Create the service; the only state it sets is the ``initialized`` flag."""
        self.initialized = True

    def detect_zscore(
        self, data: list[float], threshold: float = 3.0
    ) -> list[AnomalyResult]:
        """
        Z-Score based anomaly detection
        Identifies values that are more than N standard deviations from mean
        """
        if len(data) < 3:
            return []

        arr = np.array(data)
        mean = np.mean(arr)
        std = np.std(arr)

        if std == 0:
            return []

        z_scores = np.abs((arr - mean) / std)
        results = []

        for i, z in enumerate(z_scores):
            is_anomaly = z > threshold
            results.append(
                AnomalyResult(
                    is_anomaly=is_anomaly,
                    score=float(z),
                    method="zscore",
                    threshold=threshold,
                    details={
                        "index": i,
                        "value": float(arr[i]),
                        "mean": float(mean),
                        "std": float(std),
                        "deviation": float(arr[i] - mean),
                    },
                )
            )

        return results

    def detect_iqr(
        self, data: list[float], multiplier: float = 1.5
    ) -> list[AnomalyResult]:
        """
        IQR (Interquartile Range) based anomaly detection
        Identifies values outside Q1 - 1.5*IQR and Q3 + 1.5*IQR
        """
        if len(data) < 4:
            return []

        arr = np.array(data)
        q1 = np.percentile(arr, 25)
        q3 = np.percentile(arr, 75)
        iqr = q3 - q1

        lower_bound = q1 - multiplier * iqr
        upper_bound = q3 + multiplier * iqr

        results = []
        for i, val in enumerate(arr):
            is_anomaly = val < lower_bound or val > upper_bound
            distance = 0
            if val < lower_bound:
                distance = lower_bound - val
            elif val > upper_bound:
                distance = val - upper_bound

            results.append(
                AnomalyResult(
                    is_anomaly=is_anomaly,
                    score=float(distance),
                    method="iqr",
                    threshold=multiplier,
                    details={
                        "index": i,
                        "value": float(val),
                        "q1": float(q1),
                        "q3": float(q3),
                        "iqr": float(iqr),
                        "lower_bound": float(lower_bound),
                        "upper_bound": float(upper_bound),
                    },
                )
            )

        return results

    def detect_isolation_forest(
        self, data: list[float], contamination: float = 0.1
    ) -> list[AnomalyResult]:
        """
        Simplified Isolation Forest-like detection using average absolute deviation
        More robust to outliers than Z-score
        """
        if len(data) < 3:
            return []

        arr = np.array(data)
        median = np.median(arr)
        mad = np.median(np.abs(arr - median))

        if mad == 0:
            return []

        modified_z = np.abs(0.6745 * (arr - median) / mad)
        threshold = 3.5

        results = []
        for i, z in enumerate(modified_z):
            is_anomaly = z > threshold
            results.append(
                AnomalyResult(
                    is_anomaly=is_anomaly,
                    score=float(z),
                    method="isolation_forest",
                    threshold=threshold,
                    details={
                        "index": i,
                        "value": float(arr[i]),
                        "median": float(median),
                        "mad": float(mad),
                    },
                )
            )

        return results

    def detect_salary_anomalies(
        self, records: list[dict], value_field: str = "salarioBase"
    ) -> dict:
        """
        Specialized detection for salary/payroll data
        Combines multiple methods for robust anomaly detection
        """
        values = [float(r.get(value_field, 0)) for r in records if value_field in r]

        if not values:
            return {"error": f"Field '{value_field}' not found in records"}

        zscore_results = self.detect_zscore(values, threshold=2.5)
        iqr_results = self.detect_iqr(values, multiplier=1.5)
        iso_results = self.detect_isolation_forest(values)

        anomalies = []
        for i in range(len(values)):
            votes = sum(
                [
                    zscore_results[i].is_anomaly if zscore_results else False,
                    iqr_results[i].is_anomaly if iqr_results else False,
                    iso_results[i].is_anomaly if iso_results else False,
                ]
            )

            if votes >= 2:
                anomalies.append(
                    {
                        "index": i,
                        "record": records[i],
                        "value": values[i],
                        "confidence": votes / 3,
                        "methods": {
                            "zscore": zscore_results[i].is_anomaly
                            if zscore_results
                            else False,
                            "iqr": iqr_results[i].is_anomaly if iqr_results else False,
                            "isolation": iso_results[i].is_anomaly
                            if iso_results
                            else False,
                        },
                        "zscore_details": zscore_results[i].details
                        if zscore_results
                        else {},
                    }
                )

        return {
            "total_records": len(records),
            "anomalies_found": len(anomalies),
            "anomaly_rate": len(anomalies) / len(records) if records else 0,
            "anomalies": anomalies,
            "summary": {
                "mean": float(np.mean(values)),
                "median": float(np.median(values)),
                "std": float(np.std(values)),
                "min": float(np.min(values)),
                "max": float(np.max(values)),
            },
        }

    def detect_sensor_anomalies(
        self, readings: list[dict], value_field: str = "value"
    ) -> dict:
        """
        Detection for sensor/IoT data with time-series characteristics
        """
        values = [float(r.get(value_field, 0)) for r in readings if value_field in r]

        if not values:
            return {"error": f"Field '{value_field}' not found in readings"}

        arr = np.array(values)
        diff = np.diff(arr)
        mean_diff = np.mean(np.abs(diff))
        std_diff = np.std(diff)

        anomalies = []
        for i in range(1, len(values)):
            change = abs(values[i] - values[i - 1])
            z_change = (change - mean_diff) / std_diff if std_diff > 0 else 0

            if z_change > 2.5 or change > 3 * mean_diff:
                anomalies.append(
                    {
                        "index": i,
                        "record": readings[i],
                        "previous_value": values[i - 1],
                        "current_value": values[i],
                        "change": change,
                        "change_zscore": float(z_change),
                        "type": "sudden_change",
                    }
                )

        return {
            "total_readings": len(readings),
            "anomalies_found": len(anomalies),
            "anomalies": anomalies,
            "baseline": {
                "mean_change": float(mean_diff),
                "std_change": float(std_diff),
            },
        }


# Module-level holder for the shared service; stays None until first requested.
_service: Optional[AnomalyDetectionService] = None


def get_anomaly_service() -> AnomalyDetectionService:
    """Return the shared AnomalyDetectionService, creating it on the first call."""
    global _service
    if _service is not None:
        return _service
    _service = AnomalyDetectionService()
    return _service