# generalbots/botmodels/src/services/vision_service.py

import io
import time
from datetime import datetime
from typing import Optional

import torch
from PIL import Image

from ..core.config import settings
from ..core.logging import get_logger

# Module-level logger shared by everything in this file; obtained from the
# project's get_logger helper under the name "vision_service".
logger = get_logger("vision_service")


class VisionService:
    """Image and video captioning service backed by a BLIP2 model.

    The model is loaded lazily by :meth:`initialize`. Until that succeeds,
    every public coroutine returns a placeholder result instead of raising,
    so the rest of the application can run without vision support.
    """

    # Description returned by every entry point while the model is unavailable.
    _NOT_INITIALIZED_MESSAGE = (
        "Vision model not initialized. Please check model path configuration."
    )

    def __init__(self):
        self.model = None
        self.processor = None
        self.device = settings.device
        self._initialized = False

    def initialize(self):
        """Load the BLIP2 processor and model from ``settings.vision_model_path``.

        Does nothing when the model is already loaded. Load failures are
        logged and swallowed on purpose: the service then keeps answering
        with placeholder responses rather than crashing the caller.
        """
        if self._initialized:
            return
        logger.info("Loading vision model (BLIP2)")
        try:
            from transformers import Blip2ForConditionalGeneration, Blip2Processor

            self.processor = Blip2Processor.from_pretrained(settings.vision_model_path)
            self.model = Blip2ForConditionalGeneration.from_pretrained(
                settings.vision_model_path,
                # Half precision only on CUDA; everything else runs in float32.
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            )
            self.model = self.model.to(self.device)
            self._initialized = True
            logger.info("Vision model loaded")
        except Exception as e:
            # Drop any half-loaded state so a later retry starts clean.
            self.model = None
            self.processor = None
            logger.error("Failed to load vision model", error=str(e))
            # Don't raise - allow service to run without vision
            logger.warning("Vision service will return placeholder responses")

    def _is_ready(self) -> bool:
        """Return True when a model has been loaded successfully."""
        return self._initialized and self.model is not None

    def _generate_text(
        self, image, prompt: Optional[str] = None, *, max_new_tokens: int, num_beams: int
    ) -> str:
        """Run the processor and model on one image and return the decoded text.

        Args:
            image: Image object accepted by the processor (callers in this
                class pass RGB PIL images).
            prompt: Optional text passed to the processor alongside the image.
            max_new_tokens: Generation length limit forwarded to ``generate``.
            num_beams: Beam-search width forwarded to ``generate``.

        Returns:
            The generated text with special tokens removed and whitespace
            stripped.
        """
        if prompt:
            inputs = self.processor(image, text=prompt, return_tensors="pt")
        else:
            inputs = self.processor(image, return_tensors="pt")

        # Move to the model's device AND dtype: the model is loaded as float16
        # on CUDA, and feeding it float32 pixel values raises a dtype mismatch.
        inputs = inputs.to(self.device, self.model.dtype)

        with torch.no_grad():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                num_beams=num_beams,
                early_stopping=True,
            )

        decoded = self.processor.decode(generated_ids[0], skip_special_tokens=True)
        return decoded.strip()

    @staticmethod
    def _combine_descriptions(descriptions) -> str:
        """Merge per-frame captions into a single summary string.

        Duplicates are removed while preserving first-seen order. A single
        distinct caption is returned as-is; otherwise the first four distinct
        captions are joined behind a "Video shows: " prefix.
        """
        unique = list(dict.fromkeys(descriptions))
        if len(unique) == 1:
            return unique[0]
        return "Video shows: " + "; ".join(unique[:4])

    async def describe_image(
        self, image_data: bytes, prompt: Optional[str] = None
    ) -> dict:
        """Generate a caption/description for an image.

        Args:
            image_data: Encoded image bytes readable by PIL.
            prompt: Optional text prompt passed to the model with the image.

        Returns:
            A dict with ``description``, ``confidence`` and
            ``generation_time`` (seconds). Failures never raise: they are
            reported in ``description`` with ``confidence`` 0.0.
        """
        start = time.time()

        if not self._is_ready():
            # Return placeholder if model not loaded
            return {
                "description": self._NOT_INITIALIZED_MESSAGE,
                "confidence": 0.0,
                "generation_time": time.time() - start,
            }

        try:
            image = Image.open(io.BytesIO(image_data)).convert("RGB")
            description = self._generate_text(
                image, prompt, max_new_tokens=100, num_beams=5
            )
            return {
                "description": description,
                "confidence": 0.85,  # BLIP2 doesn't provide confidence scores directly
                "generation_time": time.time() - start,
            }
        except Exception as e:
            logger.error("Image description failed", error=str(e))
            return {
                "description": f"Error describing image: {str(e)}",
                "confidence": 0.0,
                "generation_time": time.time() - start,
            }

    async def describe_video(self, video_data: bytes, num_frames: int = 8) -> dict:
        """Generate a description for a video by sampling frames.

        The bytes are written to a temporary ``.mp4`` file, up to
        ``num_frames`` evenly spaced frames are decoded with OpenCV, each
        frame is captioned, and the captions are merged.

        Args:
            video_data: Raw bytes of the video file.
            num_frames: Number of sample positions spread across the video.

        Returns:
            A dict with ``description``, ``frame_count`` (number of distinct
            frames that were captioned) and ``generation_time`` (seconds).
            Failures never raise: they are reported in ``description`` with
            ``frame_count`` 0.
        """
        start = time.time()

        if not self._is_ready():
            return {
                "description": self._NOT_INITIALIZED_MESSAGE,
                "frame_count": 0,
                "generation_time": time.time() - start,
            }

        import os
        import tempfile

        tmp_path = None
        try:
            # Optional heavy dependencies: an ImportError here is reported
            # through the generic error response below.
            import cv2
            import numpy as np

            # OpenCV needs a real file path, so spill the bytes to disk.
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
                tmp_path = tmp.name
                tmp.write(video_data)

            frames = []
            cap = cv2.VideoCapture(tmp_path)
            try:
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

                # Zero or negative means the container could not be read.
                if total_frames <= 0:
                    return {
                        "description": "Could not read video frames",
                        "frame_count": 0,
                        "generation_time": time.time() - start,
                    }

                # Sample evenly across the video. Clips shorter than
                # num_frames yield repeated indices, so de-duplicate (keeping
                # order) to avoid captioning the same frame several times.
                sample_positions = dict.fromkeys(
                    int(i)
                    for i in np.linspace(0, total_frames - 1, num_frames, dtype=int)
                )

                for position in sample_positions:
                    cap.set(cv2.CAP_PROP_POS_FRAMES, position)
                    ok, frame = cap.read()
                    if ok:
                        # OpenCV decodes to BGR; the model expects RGB.
                        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        frames.append(Image.fromarray(frame_rgb))
            finally:
                # Always release the capture, even when decoding raises.
                cap.release()

            if not frames:
                return {
                    "description": "No frames could be extracted from video",
                    "frame_count": 0,
                    "generation_time": time.time() - start,
                }

            descriptions = [
                self._generate_text(frame, max_new_tokens=50, num_beams=3)
                for frame in frames
            ]

            return {
                "description": self._combine_descriptions(descriptions),
                "frame_count": len(frames),
                "generation_time": time.time() - start,
            }

        except Exception as e:
            logger.error("Video description failed", error=str(e))
            return {
                "description": f"Error describing video: {str(e)}",
                "frame_count": 0,
                "generation_time": time.time() - start,
            }
        finally:
            # The temp file is created with delete=False, so it must be
            # removed on every exit path (success, early return, or error).
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError as cleanup_error:
                    logger.warning(
                        "Could not remove temporary video file",
                        path=tmp_path,
                        error=str(cleanup_error),
                    )

    async def answer_question(self, image_data: bytes, question: str) -> dict:
        """Visual question answering - ask a question about an image.

        Delegates to :meth:`describe_image`, passing ``question`` as the
        prompt, and returns its result dict unchanged.
        """
        return await self.describe_image(image_data, prompt=question)


# Shared VisionService instance for the whole process; stays None until the
# first call to get_vision_service().
_service = None


def get_vision_service():
    """Return the shared VisionService, constructing it on first use.

    The instance is only constructed here; this function does not call
    ``initialize()`` on it.
    """
    global _service
    if _service is not None:
        return _service
    _service = VisionService()
    return _service
feat: Major workspace reorganization and documentation update - Add comprehensive documentation in botbook/ with 12 chapters - Add botapp/ Tauri desktop application - Add botdevice/ IoT device support - Add botlib/ shared library crate - Add botmodels/ Python ML models service - Add botplugin/ browser extension - Add botserver/ reorganized server code - Add bottemplates/ bot templates - Add bottest/ integration tests - Add botui/ web UI server - Add CI/CD workflows in .forgejo/workflows/ - Add AGENTS.md and PROD.md documentation - Add dependency management scripts (DEPENDENCIES.sh/ps1) - Remove legacy src/ structure and migrations - Clean up temporary and backup files 2026-04-19 08:14:25 -03:00			`import io`
			`import time`
			`from datetime import datetime`
			`from typing import Optional`

			`import torch`
			`from PIL import Image`

			`from ..core.config import settings`
			`from ..core.logging import get_logger`

			`logger = get_logger("vision_service")`


			`class VisionService:`
			`def __init__(self):`
			`self.model = None`
			`self.processor = None`
			`self.device = settings.device`
			`self._initialized = False`

			`def initialize(self):`
			`if self._initialized:`
			`return`
			`logger.info("Loading vision model (BLIP2)")`
			`try:`
			`from transformers import Blip2ForConditionalGeneration, Blip2Processor`

			`self.processor = Blip2Processor.from_pretrained(settings.vision_model_path)`
			`self.model = Blip2ForConditionalGeneration.from_pretrained(`
			`settings.vision_model_path,`
			`torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,`
			`)`
			`self.model = self.model.to(self.device)`
			`self._initialized = True`
			`logger.info("Vision model loaded")`
			`except Exception as e:`
			`logger.error("Failed to load vision model", error=str(e))`
			`# Don't raise - allow service to run without vision`
			`logger.warning("Vision service will return placeholder responses")`

			`async def describe_image(`
			`self, image_data: bytes, prompt: Optional[str] = None`
			`) -> dict:`
			`"""Generate a caption/description for an image"""`
			`start = time.time()`

			`if not self._initialized or self.model is None:`
			`# Return placeholder if model not loaded`
			`return {`
			`"description": "Vision model not initialized. Please check model path configuration.",`
			`"confidence": 0.0,`
			`"generation_time": time.time() - start,`
			`}`

			`try:`
			`# Load image from bytes`
			`image = Image.open(io.BytesIO(image_data)).convert("RGB")`

			`# Prepare inputs`
			`if prompt:`
			`inputs = self.processor(image, text=prompt, return_tensors="pt").to(`
			`self.device`
			`)`
			`else:`
			`inputs = self.processor(image, return_tensors="pt").to(self.device)`

			`# Generate caption`
			`with torch.no_grad():`
			`generated_ids = self.model.generate(`
			`**inputs, max_new_tokens=100, num_beams=5, early_stopping=True`
			`)`

			`# Decode the generated text`
			`description = self.processor.decode(`
			`generated_ids[0], skip_special_tokens=True`
			`)`

			`return {`
			`"description": description.strip(),`
			`"confidence": 0.85, # BLIP2 doesn't provide confidence scores directly`
			`"generation_time": time.time() - start,`
			`}`

			`except Exception as e:`
			`logger.error("Image description failed", error=str(e))`
			`return {`
			`"description": f"Error describing image: {str(e)}",`
			`"confidence": 0.0,`
			`"generation_time": time.time() - start,`
			`}`

			`async def describe_video(self, video_data: bytes, num_frames: int = 8) -> dict:`
			`"""Generate a description for a video by sampling frames"""`
			`start = time.time()`

			`if not self._initialized or self.model is None:`
			`return {`
			`"description": "Vision model not initialized. Please check model path configuration.",`
			`"frame_count": 0,`
			`"generation_time": time.time() - start,`
			`}`

			`try:`
			`import tempfile`

			`import cv2`
			`import numpy as np`

			`# Save video to temp file`
			`with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:`
			`tmp.write(video_data)`
			`tmp_path = tmp.name`

			`# Open video and extract frames`
			`cap = cv2.VideoCapture(tmp_path)`
			`total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))`

			`if total_frames == 0:`
			`cap.release()`
			`return {`
			`"description": "Could not read video frames",`
			`"frame_count": 0,`
			`"generation_time": time.time() - start,`
			`}`

			`# Sample frames evenly throughout the video`
			`frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)`
			`frames = []`

			`for idx in frame_indices:`
			`cap.set(cv2.CAP_PROP_POS_FRAMES, idx)`
			`ret, frame = cap.read()`
			`if ret:`
			`# Convert BGR to RGB`
			`frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)`
			`frames.append(Image.fromarray(frame_rgb))`

			`cap.release()`

			`# Clean up temp file`
			`import os`

			`os.unlink(tmp_path)`

			`if not frames:`
			`return {`
			`"description": "No frames could be extracted from video",`
			`"frame_count": 0,`
			`"generation_time": time.time() - start,`
			`}`

			`# Generate descriptions for each sampled frame`
			`descriptions = []`
			`for frame in frames:`
			`inputs = self.processor(frame, return_tensors="pt").to(self.device)`

			`with torch.no_grad():`
			`generated_ids = self.model.generate(`
			`**inputs, max_new_tokens=50, num_beams=3, early_stopping=True`
			`)`

			`desc = self.processor.decode(generated_ids[0], skip_special_tokens=True)`
			`descriptions.append(desc.strip())`

			`# Combine descriptions into a coherent summary`
			`# Use the most common elements or create a timeline`
			`unique_descriptions = list(`
			`dict.fromkeys(descriptions)`
			`) # Remove duplicates preserving order`

			`if len(unique_descriptions) == 1:`
			`combined = unique_descriptions[0]`
			`else:`
			`combined = "Video shows: " + "; ".join(unique_descriptions[:4])`

			`return {`
			`"description": combined,`
			`"frame_count": len(frames),`
			`"generation_time": time.time() - start,`
			`}`

			`except Exception as e:`
			`logger.error("Video description failed", error=str(e))`
			`return {`
			`"description": f"Error describing video: {str(e)}",`
			`"frame_count": 0,`
			`"generation_time": time.time() - start,`
			`}`

			`async def answer_question(self, image_data: bytes, question: str) -> dict:`
			`"""Visual question answering - ask a question about an image"""`
			`# Use describe_image with the question as a prompt`
			`return await self.describe_image(image_data, prompt=question)`


			`_service = None`


			`def get_vision_service():`
			`global _service`
			`if _service is None:`
			`_service = VisionService()`
			`return _service`