new(all): Initial import.

me@rodrigorodriguez.com committed 2024-10-26 16:26:11 -03:00
parent e6d2ffa35a
commit 5ebde5b646
15 changed files with 9645 additions and 151 deletions

.gitignore (vendored): 1 line changed

@@ -1,2 +1,3 @@
node_modules
.env
output.txt

.vscode/launch.json (vendored): 4 lines changed

@@ -5,12 +5,12 @@
"name": "Electron: Main",
"type": "node",
"request": "launch",
"sourceMaps": true,
"args": ["${workspaceFolder}/dist/main/main.js"],
"outFiles": ["${workspaceFolder}/dist/**/*.js"],
"cwd": "${workspaceFolder}",
"protocol": "inspector",
"windows": {
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/electron.cmd"
},

dist/main/main.js (vendored): 18 lines changed

@@ -23,8 +23,11 @@ var __importStar = (this && this.__importStar) || function (mod) {
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
require('dotenv').config();
const electron_1 = require("electron");
const path = __importStar(require("path"));
// In main.ts
const electron_2 = require("electron");
const recorder_service_1 = require("../services/recorder.service");
const player_service_1 = require("../services/player.service");
const recorder = new recorder_service_1.RecorderService();
@@ -70,3 +73,18 @@ electron_1.ipcMain.handle('stop-recording', async () => {
electron_1.ipcMain.handle('execute-basic-code', async (_, code) => {
await player.executeBasicCode(code);
});
// Add microphone permission check for macOS
electron_1.ipcMain.handle('check-microphone-permission', async () => {
if (process.platform === 'darwin') {
const status = await electron_2.systemPreferences.getMediaAccessStatus('microphone');
if (status !== 'granted') {
const success = await electron_2.systemPreferences.askForMediaAccess('microphone');
return success;
}
return true;
}
// On Windows/Linux, permissions are handled by the OS
return true;
});
// Enable required permissions
electron_1.app.commandLine.appendSwitch('enable-speech-dispatcher');


@@ -1 +0,0 @@
// Preload script goes here


@@ -1,56 +1,150 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.OpenAIService = void 0;
const openai_1 = require("openai");
const { Readable } = require('stream');
class OpenAIService {
constructor() {
this.client = new openai_1.AzureOpenAI({
dangerouslyAllowBrowser: true,
endpoint: process.env.AZURE_OPEN_AI_ENDPOINT || '',
apiVersion: process.env.OPENAI_API_VERSION || '2024-02-15-preview',
apiKey: process.env.AZURE_OPEN_AI_KEY || ''
});
}
async transcribeAudio(audioBlob) {
try {
// Convert Blob to ArrayBuffer
const arrayBuffer = await audioBlob.arrayBuffer();
// Convert Buffer to a Readable stream
const buffer = Buffer.from(arrayBuffer);
const stream = new Readable();
stream.push(buffer);
stream.push(null); // Signal the end of the stream
const response = await this.client.audio.transcriptions.create({
file: stream,
model: process.env.AZURE_OPEN_AI_WHISPER_MODEL || 'whisper-1',
language: 'en',
response_format: 'verbose_json'
});
return {
text: response.text,
//@ts-ignore
segments: response.segments?.map(seg => ({
text: seg.text,
start: seg.start,
end: seg.end
})) || []
};
}
catch (error) {
console.error('Error in transcribeAudio:', error);
throw new Error('Failed to transcribe audio');
}
}
async analyzeScreenWithContext(context) {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: `You are an AI that analyzes screenshots and voice commands to determine user intentions for automation.
You should identify UI elements and return specific actions in JSON format.
Focus on the area near the cursor position when relevant.`
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot with the following context:
Voice Command: "${context.transcription}"
Cursor Position: x=${context.cursorPosition.x}, y=${context.cursorPosition.y}
Identify the most likely action based on the voice command and cursor position.
Return in format: {
"type": "click|type|move",
"identifier": "element-id or descriptive name",
"value": "text to type (for type actions)",
"confidence": 0-1,
"bounds": {"x": number, "y": number, "width": number, "height": number}
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${context.screenshot}`
}
}
]
}
],
max_tokens: 500,
temperature: 0.3
});
const result = JSON.parse(response.choices[0].message.content || '{}');
return result;
}
catch (error) {
console.error('Error in analyzeScreenWithContext:', error);
throw new Error('Failed to analyze screen context');
}
}
async analyzeScreen(screenshot) {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: 'You are an AI that analyzes screenshots to identify interactive UI elements and their properties.'
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot and identify all interactive elements (buttons, text fields, dropdowns, etc).
For each element, provide:
- Type of element
- Identifier or descriptive name
- Location and size
- Any visible text or labels
- State (focused, disabled, etc)
Return in format: {
"elements": [{
"type": "button|input|dropdown|etc",
"identifier": "element-id or descriptive name",
"bounds": {"x": number, "y": number, "width": number, "height": number},
"text": "visible text",
"state": {"focused": boolean, "disabled": boolean}
}]
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${screenshot}`
}
}
]
}
],
max_tokens: 1000,
temperature: 0.3
});
const result = JSON.parse(response.choices[0].message.content || '{}');
return {
elements: result.elements || [],
timestamp: Date.now()
};
}
catch (error) {
console.error('Error in analyzeScreen:', error);
throw new Error('Failed to analyze screen');
}
}
}
exports.OpenAIService = OpenAIService;


@@ -1,39 +1,198 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.RecorderService = void 0;
const electron_1 = require("electron");
const openai_service_1 = require("../services/openai.service");
const _ = require('lodash');
const path = __importStar(require("path"));
const fs = __importStar(require("fs"));
class RecorderService {
constructor() {
this.events = [];
this.recording = false;
this.currentScreenshot = '';
this.lastTranscription = '';
this.recordingProcess = null;
this.currentAudioFile = '';
this.silenceTimer = null;
this.isProcessingAudio = false;
this.handleAudioLevel = _.debounce(async (_, level) => {
if (!this.recording)
return;
const SILENCE_THRESHOLD = 0.01;
const SILENCE_DURATION = 1000;
if (level < SILENCE_THRESHOLD) {
if (!this.silenceTimer && !this.isProcessingAudio) {
this.silenceTimer = setTimeout(async () => {
if (this.recording) {
await this.processSilence();
}
}, SILENCE_DURATION);
}
}
else {
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
}
}, 100);
this.handleAudioChunk = async (_, chunk) => {
if (!this.recording)
return;
try {
const audioFilePath = path.join(this.tempDir, `audio-${Date.now()}.wav`);
fs.writeFileSync(audioFilePath, chunk);
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
await this.processAudioFile(audioFilePath);
}
}
catch (error) {
console.error('Error handling audio chunk:', error);
}
};
this.openAIService = new openai_service_1.OpenAIService();
this.tempDir = path.join(process.cwd(), 'temp_recordings');
if (!fs.existsSync(this.tempDir)) {
fs.mkdirSync(this.tempDir, { recursive: true });
}
}
async startRecording() {
try {
this.recording = true;
this.events = [];
await this.setupAudioRecording();
await this.requestScreenshot();
electron_1.ipcRenderer.on('keyboard-event', this.keyboardHandleEvent); // Listen for keyboard events
}
catch (error) {
console.error('Failed to start recording:', error);
this.recording = false;
throw error;
}
}
async setupAudioRecording() {
try {
this.recordingProcess = await electron_1.ipcRenderer.invoke('start-audio-recording');
electron_1.ipcRenderer.on('audio-level', this.handleAudioLevel);
electron_1.ipcRenderer.on('audio-chunk', this.handleAudioChunk);
}
catch (error) {
console.error('Error setting up audio recording:', error);
throw new Error(`Failed to setup audio recording: ${error.message}`);
}
}
async processSilence() {
if (this.isProcessingAudio)
return;
this.isProcessingAudio = true;
try {
const audioFilePath = await electron_1.ipcRenderer.invoke('save-audio-chunk');
if (audioFilePath) {
this.currentAudioFile = audioFilePath;
await this.processAudioFile(audioFilePath);
await this.requestScreenshot();
}
}
catch (error) {
console.error('Error processing silence:', error);
}
finally {
this.isProcessingAudio = false;
}
}
async processAudioFile(audioFilePath) {
try {
const audioBuffer = fs.readFileSync(audioFilePath);
const transcription = await this.openAIService.transcribeAudio(new Blob([audioBuffer], { type: 'audio/wav' }));
if (transcription.text.trim()) {
await this.processTranscription(transcription);
}
fs.unlinkSync(audioFilePath);
}
catch (error) {
console.error('Error processing audio file:', error);
}
}
async processTranscription(transcription) {
this.lastTranscription = transcription.text;
const analysis = await this.openAIService.analyzeScreenWithContext({
screenshot: this.currentScreenshot,
transcription: this.lastTranscription,
cursorPosition: await electron_1.ipcRenderer.invoke('get-cursor-position')
});
if (analysis) {
this.events.push({
type: analysis.type,
identifier: analysis.identifier,
value: analysis.value,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
async stopRecording() {
this.recording = false;
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
await electron_1.ipcRenderer.invoke('stop-audio-recording');
electron_1.ipcRenderer.removeListener('audio-level', this.handleAudioLevel);
electron_1.ipcRenderer.removeListener('audio-chunk', this.handleAudioChunk);
electron_1.ipcRenderer.removeListener('keyboard-event', this.keyboardHandleEvent); // Remove keyboard listener
if (this.currentAudioFile && fs.existsSync(this.currentAudioFile)) {
fs.unlinkSync(this.currentAudioFile);
}
return this.generateBasicCode();
}
async requestScreenshot() {
try {
const sources = await electron_1.ipcRenderer.invoke('get-screenshot');
const screenSource = sources[0];
await this.screenshotHandleEvent(null, screenSource.thumbnail);
}
catch (error) {
console.error('Error capturing screenshot:', error);
}
}
async screenshotHandleEvent(_, screenshot) {
this.currentScreenshot = screenshot;
}
async keyboardHandleEvent(_, event) {
if (!this.recording)
return;
this.events.push({
type: 'type',
identifier: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
async mouseHandleEvent(_, event) {
if (!this.recording)
@@ -45,44 +204,39 @@ class RecorderService {
type: 'click',
identifier: element.identifier,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
findElementAtPosition(analysis, x, y) {
//@ts-nocheck
return analysis.elements.find((element) => {
const bounds = element.bounds;
return x >= bounds.x &&
x <= bounds.x + bounds.width &&
y >= bounds.y &&
y <= bounds.y + bounds.height;
});
}
generateBasicCode() {
let basicCode = '10 REM BotDesktop Automation Script\n';
let lineNumber = 20;
for (const event of this.events) {
basicCode += `${lineNumber} REM ${event.narration}\n`;
lineNumber += 10;
switch (event.type) {
case 'click':
basicCode += `${lineNumber} CLICK "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}" "${event.value}"\n`;
break;
case 'move':
basicCode += `${lineNumber} MOVE "${event.identifier}"\n`;
break;
}
lineNumber += 10;
}

gencode.sh (new executable file): 12 lines

@@ -0,0 +1,12 @@
#!/bin/bash
# Remove output.txt if it exists to start fresh
rm -f output.txt
# Find all .ts and .tsx files excluding node_modules, and concatenate filename and contents into output.txt
find . -type f \( -name "*.ts" -o -name "*.tsx" \) -not -path "*/node_modules/*" | while read -r file; do
echo -e "\n// File: $file\n" >> output.txt
cat "$file" >> output.txt
done
echo "All TypeScript (.ts and .tsx) code has been combined into output.txt with filenames as headers, excluding node_modules"

package-lock.json (generated, new file): 8905 lines. File diff suppressed because it is too large.


@@ -10,13 +10,15 @@
"test": "vitest"
},
"dependencies": {
"@types/node": "^20.0.0",
"@types/react": "^18.0.0",
"@types/react-dom": "^18.0.0",
"debounce": "^2.2.0",
"dotenv": "^16.4.5",
"electron": "^28.0.0", "electron": "^28.0.0",
"lodash": "^4.17.21",
"node-global-key-listener": "^0.3.0",
"node-mouse": "^0.0.2",
"openai": "^4.28.0", "openai": "^4.28.0",
"react": "^18.2.0", "react": "^18.2.0",
"react-dom": "^18.2.0", "react-dom": "^18.2.0",


@@ -1,6 +1,8 @@
require('dotenv').config();
import { app, BrowserWindow, ipcMain } from 'electron';
import * as path from 'path';
// In main.ts
import { systemPreferences } from 'electron';
import { RecorderService } from '../services/recorder.service';
import { PlayerService } from '../services/player.service';
@@ -56,3 +58,21 @@ ipcMain.handle('stop-recording', async () => {
ipcMain.handle('execute-basic-code', async (_, code: string) => {
await player.executeBasicCode(code);
});
// Add microphone permission check for macOS
ipcMain.handle('check-microphone-permission', async () => {
if (process.platform === 'darwin') {
const status = await systemPreferences.getMediaAccessStatus('microphone');
if (status !== 'granted') {
const success = await systemPreferences.askForMediaAccess('microphone');
return success;
}
return true;
}
// On Windows/Linux, permissions are handled by the OS
return true;
});
// Enable required permissions
app.commandLine.appendSwitch('enable-speech-dispatcher');


@@ -1 +0,0 @@
// Preload script goes here


@@ -1,36 +1,155 @@
import { AzureOpenAI } from 'openai';
import * as fs from 'fs';
import { ScreenAnalysis, ScreenContext, WhisperResponse, AutomationAction } from './types';
const { Readable } = require('stream');
export class OpenAIService {
private client: AzureOpenAI;
constructor() {
this.client = new AzureOpenAI({
dangerouslyAllowBrowser: true,
endpoint: process.env.AZURE_OPEN_AI_ENDPOINT || '',
apiVersion: process.env.OPENAI_API_VERSION || '2024-02-15-preview',
apiKey: process.env.AZURE_OPEN_AI_KEY || ''
});
}
async transcribeAudio(audioBlob: Blob): Promise<WhisperResponse> {
try {
// Convert Blob to ArrayBuffer
const arrayBuffer = await audioBlob.arrayBuffer();
// Convert Buffer to a Readable stream
const buffer = Buffer.from(arrayBuffer);
const stream = new Readable();
stream.push(buffer);
stream.push(null); // Signal the end of the stream
const response = await this.client.audio.transcriptions.create({
file: stream,
model: process.env.AZURE_OPEN_AI_WHISPER_MODEL || 'whisper-1',
language: 'en',
response_format: 'verbose_json'
});
return {
text: response.text,
//@ts-ignore
segments: response.segments?.map(seg => ({
text: seg.text,
start: seg.start,
end: seg.end
})) || []
};
} catch (error) {
console.error('Error in transcribeAudio:', error);
throw new Error('Failed to transcribe audio');
}
}
async analyzeScreenWithContext(context: ScreenContext): Promise<AutomationAction> {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: `You are an AI that analyzes screenshots and voice commands to determine user intentions for automation.
You should identify UI elements and return specific actions in JSON format.
Focus on the area near the cursor position when relevant.`
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot with the following context:
Voice Command: "${context.transcription}"
Cursor Position: x=${context.cursorPosition.x}, y=${context.cursorPosition.y}
Identify the most likely action based on the voice command and cursor position.
Return in format: {
"type": "click|type|move",
"identifier": "element-id or descriptive name",
"value": "text to type (for type actions)",
"confidence": 0-1,
"bounds": {"x": number, "y": number, "width": number, "height": number}
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${context.screenshot}`
}
}
]
}
],
max_tokens: 500,
temperature: 0.3
});
const result = JSON.parse(response.choices[0].message.content || '{}');
return result;
} catch (error) {
console.error('Error in analyzeScreenWithContext:', error);
throw new Error('Failed to analyze screen context');
}
}
async analyzeScreen(screenshot: string): Promise<ScreenAnalysis> {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: 'You are an AI that analyzes screenshots to identify interactive UI elements and their properties.'
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot and identify all interactive elements (buttons, text fields, dropdowns, etc).
For each element, provide:
- Type of element
- Identifier or descriptive name
- Location and size
- Any visible text or labels
- State (focused, disabled, etc)
Return in format: {
"elements": [{
"type": "button|input|dropdown|etc",
"identifier": "element-id or descriptive name",
"bounds": {"x": number, "y": number, "width": number, "height": number},
"text": "visible text",
"state": {"focused": boolean, "disabled": boolean}
}]
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${screenshot}`
}
}
]
}
],
max_tokens: 1000,
temperature: 0.3
});
const result = JSON.parse(response.choices[0].message.content || '{}');
return {
elements: result.elements || [],
timestamp: Date.now()
};
} catch (error) {
console.error('Error in analyzeScreen:', error);
throw new Error('Failed to analyze screen');
}
}
}


@@ -1,41 +1,192 @@
import { ipcRenderer } from 'electron';
import { AutomationEvent, ScreenAnalysis, WhisperResponse } from '../services/types';
import { OpenAIService } from '../services/openai.service';
const _ = require('lodash');
import * as path from 'path';
import * as fs from 'fs';
export class RecorderService {
private events: AutomationEvent[] = [];
private recording: boolean = false;
private openAIService: OpenAIService;
private currentScreenshot: string = '';
private lastTranscription: string = '';
private recordingProcess: any = null;
private tempDir: string;
private currentAudioFile: string = '';
private silenceTimer: NodeJS.Timeout | null = null;
private isProcessingAudio: boolean = false;
constructor() {
this.openAIService = new OpenAIService();
this.tempDir = path.join(process.cwd(), 'temp_recordings');
if (!fs.existsSync(this.tempDir)) {
fs.mkdirSync(this.tempDir, { recursive: true });
}
}
public async startRecording() {
try {
this.recording = true;
this.events = [];
await this.setupAudioRecording();
await this.requestScreenshot();
ipcRenderer.on('keyboard-event', this.keyboardHandleEvent); // Listen for keyboard events
} catch (error) {
console.error('Failed to start recording:', error);
this.recording = false;
throw error;
}
}
private async setupAudioRecording() {
try {
this.recordingProcess = await ipcRenderer.invoke('start-audio-recording');
ipcRenderer.on('audio-level', this.handleAudioLevel);
ipcRenderer.on('audio-chunk', this.handleAudioChunk);
} catch (error) {
console.error('Error setting up audio recording:', error);
throw new Error(`Failed to setup audio recording: ${error.message}`);
}
}
private handleAudioLevel = _.debounce(async (_: any, level: number) => {
if (!this.recording) return;
const SILENCE_THRESHOLD = 0.01;
const SILENCE_DURATION = 1000;
if (level < SILENCE_THRESHOLD) {
if (!this.silenceTimer && !this.isProcessingAudio) {
this.silenceTimer = setTimeout(async () => {
if (this.recording) {
await this.processSilence();
}
}, SILENCE_DURATION);
}
} else {
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
}
}, 100);
private handleAudioChunk = async (_: any, chunk: Buffer) => {
if (!this.recording) return;
try {
const audioFilePath = path.join(this.tempDir, `audio-${Date.now()}.wav`);
fs.writeFileSync(audioFilePath, chunk);
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
await this.processAudioFile(audioFilePath);
}
} catch (error) {
console.error('Error handling audio chunk:', error);
}
};
private async processSilence() {
if (this.isProcessingAudio) return;
this.isProcessingAudio = true;
try {
const audioFilePath = await ipcRenderer.invoke('save-audio-chunk');
if (audioFilePath) {
this.currentAudioFile = audioFilePath;
await this.processAudioFile(audioFilePath);
await this.requestScreenshot();
}
} catch (error) {
console.error('Error processing silence:', error);
} finally {
this.isProcessingAudio = false;
}
}
private async processAudioFile(audioFilePath: string) {
try {
const audioBuffer = fs.readFileSync(audioFilePath);
const transcription = await this.openAIService.transcribeAudio(
new Blob([audioBuffer], { type: 'audio/wav' })
);
if (transcription.text.trim()) {
await this.processTranscription(transcription);
}
fs.unlinkSync(audioFilePath);
} catch (error) {
console.error('Error processing audio file:', error);
}
}
private async processTranscription(transcription: WhisperResponse) {
this.lastTranscription = transcription.text;
const analysis = await this.openAIService.analyzeScreenWithContext({
screenshot: this.currentScreenshot,
transcription: this.lastTranscription,
cursorPosition: await ipcRenderer.invoke('get-cursor-position')
});
if (analysis) {
this.events.push({
type: analysis.type,
identifier: analysis.identifier,
value: analysis.value,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
public async stopRecording(): Promise<string> {
this.recording = false;
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
await ipcRenderer.invoke('stop-audio-recording');
ipcRenderer.removeListener('audio-level', this.handleAudioLevel);
ipcRenderer.removeListener('audio-chunk', this.handleAudioChunk);
ipcRenderer.removeListener('keyboard-event', this.keyboardHandleEvent); // Remove keyboard listener
if (this.currentAudioFile && fs.existsSync(this.currentAudioFile)) {
fs.unlinkSync(this.currentAudioFile);
}
return this.generateBasicCode();
}
private async requestScreenshot() {
try {
const sources = await ipcRenderer.invoke('get-screenshot');
const screenSource = sources[0];
await this.screenshotHandleEvent(null, screenSource.thumbnail);
} catch (error) {
console.error('Error capturing screenshot:', error);
}
}
public async screenshotHandleEvent(_: any, screenshot: string) {
this.currentScreenshot = screenshot;
}
public async keyboardHandleEvent(_: any, event: KeyboardEvent) {
if (!this.recording) return;
this.events.push({
type: 'type',
identifier: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
} }
public async mouseHandleEvent(_: any, event: any) {
@@ -49,50 +200,43 @@ export class RecorderService {
type: 'click',
identifier: element.identifier,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
private findElementAtPosition(analysis: ScreenAnalysis, x: number, y: number) {
//@ts-nocheck
return analysis.elements.find((element) => {
const bounds = element.bounds;
return x >= bounds.x &&
x <= bounds.x + bounds.width &&
y >= bounds.y &&
y <= bounds.y + bounds.height;
});
}
private generateBasicCode(): string {
let basicCode = '10 REM BotDesktop Automation Script\n';
let lineNumber = 20;
for (const event of this.events) {
basicCode += `${lineNumber} REM ${event.narration}\n`;
lineNumber += 10;
switch (event.type) {
case 'click':
basicCode += `${lineNumber} CLICK "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}" "${event.value}"\n`;
break;
case 'move':
basicCode += `${lineNumber} MOVE "${event.identifier}"\n`;
break;
}
lineNumber += 10;
}


@@ -1,11 +1,37 @@
export interface AutomationAction {
type: 'click' | 'type' | 'move';
identifier: string;
value?: string;
confidence: number;
bounds: {
x: number;
y: number;
width: number;
height: number;
};
}
export interface AutomationEvent {
type: 'click' | 'type' | 'move';
identifier: string;
value?: string;
timestamp: number;
narration: string;
}
export interface WhisperResponse {
text: string;
segments:any;
}
export interface ScreenContext {
screenshot: string;
transcription: string;
cursorPosition: { x: number, y: number };
}
export interface ScreenAnalysis {
timestamp: number,
elements: {
identifier: string;
type: string;


@@ -2,6 +2,7 @@ const path = require('path');
const HtmlWebpackPlugin = require('html-webpack-plugin');
module.exports = {
devtool: 'source-map',
entry: './src/renderer/index.tsx',
target: 'electron-renderer',
module: {