new(all): Initial import.

commit 5ebde5b646 (parent e6d2ffa35a)
15 changed files with 9645 additions and 151 deletions
.gitignore (vendored, 1 change)
@@ -1,2 +1,3 @@
node_modules
.env
output.txt
.vscode/launch.json (vendored, 4 changes)
@@ -5,12 +5,12 @@
"name": "Electron: Main",
"type": "node",
"request": "launch",
"program": "${workspaceFolder}/node_modules/electron/dist/electron.js",
"sourceMaps": true,
"args": ["${workspaceFolder}/dist/main/main.js"],
"outFiles": ["${workspaceFolder}/dist/**/*.js"],
"cwd": "${workspaceFolder}",
"sourceMaps": true,
"protocol": "inspector",
"console": "integratedTerminal",
"windows": {
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/electron.cmd"
},
dist/main/main.js (vendored, 18 changes)
@@ -23,8 +23,11 @@ var __importStar = (this && this.__importStar) || function (mod) {
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
require('dotenv').config();
const electron_1 = require("electron");
const path = __importStar(require("path"));
// In main.ts
const electron_2 = require("electron");
const recorder_service_1 = require("../services/recorder.service");
const player_service_1 = require("../services/player.service");
const recorder = new recorder_service_1.RecorderService();
@@ -70,3 +73,18 @@ electron_1.ipcMain.handle('stop-recording', async () => {
electron_1.ipcMain.handle('execute-basic-code', async (_, code) => {
await player.executeBasicCode(code);
});
// Add microphone permission check for macOS
electron_1.ipcMain.handle('check-microphone-permission', async () => {
if (process.platform === 'darwin') {
const status = await electron_2.systemPreferences.getMediaAccessStatus('microphone');
if (status !== 'granted') {
const success = await electron_2.systemPreferences.askForMediaAccess('microphone');
return success;
}
return true;
}
// On Windows/Linux, permissions are handled by the OS
return true;
});
// Enable required permissions
electron_1.app.commandLine.appendSwitch('enable-speech-dispatcher');
dist/preload/preload.js (vendored, 1 change)
@@ -1 +0,0 @@
// Preload script goes here
dist/services/openai.service.js (vendored, 164 changes)
@@ -1,56 +1,150 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.OpenAIService = void 0;
const openai_1 = require("openai");
const fs = __importStar(require("fs"));
const { Readable } = require('stream');
class OpenAIService {
constructor() {
this.client = new openai_1.AzureOpenAI({ dangerouslyAllowBrowser: true,
this.client = new openai_1.AzureOpenAI({
dangerouslyAllowBrowser: true,
endpoint: process.env.AZURE_OPEN_AI_ENDPOINT || '',
deployment: process.env.AZURE_OPEN_AI_IMAGE_MODEL || '',
apiVersion: process.env.OPENAI_API_VERSION || '',
apiVersion: process.env.OPENAI_API_VERSION || '2024-02-15-preview',
apiKey: process.env.AZURE_OPEN_AI_KEY || ''
});
}
async analyzeScreen(imagePath) {
const imageBuffer = fs.readFileSync(imagePath);
const base64Image = imageBuffer.toString('base64');
async transcribeAudio(audioBlob) {
try {
// Convert Blob to ArrayBuffer
const arrayBuffer = await audioBlob.arrayBuffer();
// Convert Buffer to a Readable stream
const buffer = Buffer.from(arrayBuffer);
const stream = new Readable();
stream.push(buffer);
stream.push(null); // Signal the end of the stream
const response = await this.client.audio.transcriptions.create({
file: stream,
model: process.env.AZURE_OPEN_AI_WHISPER_MODEL || 'whisper-1',
language: 'en',
response_format: 'verbose_json'
});
return {
text: response.text,
//@ts-ignore
segments: response.segments?.map(seg => ({
text: seg.text,
start: seg.start,
end: seg.end
})) || []
};
}
catch (error) {
console.error('Error in transcribeAudio:', error);
throw new Error('Failed to transcribe audio');
}
}
async analyzeScreenWithContext(context) {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_LLM_MODEL || '',
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: `You are an AI that analyzes screenshots and voice commands to determine user intentions for automation.
You should identify UI elements and return specific actions in JSON format.
Focus on the area near the cursor position when relevant.`
},
{
role: 'user',
content: [
{ type: 'text', text: 'Analyze this screenshot and identify all interactive elements (buttons, text fields, etc). Return their locations and identifiers.' },
{ type: 'image_url', image_url: { url: `data:image/png;base64,${base64Image}` } }
],
{
type: 'text',
text: `Analyze this screenshot with the following context:
Voice Command: "${context.transcription}"
Cursor Position: x=${context.cursorPosition.x}, y=${context.cursorPosition.y}

Identify the most likely action based on the voice command and cursor position.
Return in format: {
"type": "click|type|move",
"identifier": "element-id or descriptive name",
"value": "text to type (for type actions)",
"confidence": 0-1,
"bounds": {"x": number, "y": number, "width": number, "height": number}
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${context.screenshot}`
}
}
]
}
],
max_tokens: 500,
temperature: 0.3
});
return JSON.parse(response.choices[0].message.content || '{}');
const result = JSON.parse(response.choices[0].message.content || '{}');
return result;
}
catch (error) {
console.error('Error in analyzeScreenWithContext:', error);
throw new Error('Failed to analyze screen context');
}
}
async analyzeScreen(screenshot) {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: 'You are an AI that analyzes screenshots to identify interactive UI elements and their properties.'
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot and identify all interactive elements (buttons, text fields, dropdowns, etc).
For each element, provide:
- Type of element
- Identifier or descriptive name
- Location and size
- Any visible text or labels
- State (focused, disabled, etc)

Return in format: {
"elements": [{
"type": "button|input|dropdown|etc",
"identifier": "element-id or descriptive name",
"bounds": {"x": number, "y": number, "width": number, "height": number},
"text": "visible text",
"state": {"focused": boolean, "disabled": boolean}
}]
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${screenshot}`
}
}
]
}
],
max_tokens: 1000,
temperature: 0.3
});
const result = JSON.parse(response.choices[0].message.content || '{}');
return {
elements: result.elements || [],
timestamp: Date.now()
};
}
catch (error) {
console.error('Error in analyzeScreen:', error);
throw new Error('Failed to analyze screen');
}
}
}
exports.OpenAIService = OpenAIService;
dist/services/recorder.service.js (vendored, 222 changes)
@@ -1,39 +1,198 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.RecorderService = void 0;
const electron_1 = require("electron");
const dotenv_1 = __importDefault(require("dotenv"));
dotenv_1.default.config();
const openai_service_1 = require("./openai.service");
const openai_service_1 = require("../services/openai.service");
const _ = require('lodash');
const path = __importStar(require("path"));
const fs = __importStar(require("fs"));
class RecorderService {
constructor() {
this.events = [];
this.recording = false;
this.currentScreenshot = '';
this.lastTranscription = '';
this.recordingProcess = null;
this.currentAudioFile = '';
this.silenceTimer = null;
this.isProcessingAudio = false;
this.handleAudioLevel = _.debounce(async (_, level) => {
if (!this.recording)
return;
const SILENCE_THRESHOLD = 0.01;
const SILENCE_DURATION = 1000;
if (level < SILENCE_THRESHOLD) {
if (!this.silenceTimer && !this.isProcessingAudio) {
this.silenceTimer = setTimeout(async () => {
if (this.recording) {
await this.processSilence();
}
}, SILENCE_DURATION);
}
}
else {
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
}
}, 100);
this.handleAudioChunk = async (_, chunk) => {
if (!this.recording)
return;
try {
const audioFilePath = path.join(this.tempDir, `audio-${Date.now()}.wav`);
fs.writeFileSync(audioFilePath, chunk);
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
await this.processAudioFile(audioFilePath);
}
}
catch (error) {
console.error('Error handling audio chunk:', error);
}
};
this.openAIService = new openai_service_1.OpenAIService();
this.tempDir = path.join(process.cwd(), 'temp_recordings');
if (!fs.existsSync(this.tempDir)) {
fs.mkdirSync(this.tempDir, { recursive: true });
}
}
async startRecording() {
try {
this.recording = true;
this.events = [];
this.requestScreenshot();
await this.setupAudioRecording();
await this.requestScreenshot();
electron_1.ipcRenderer.on('keyboard-event', this.keyboardHandleEvent); // Listen for keyboard events
}
stopRecording() {
catch (error) {
console.error('Failed to start recording:', error);
this.recording = false;
return this.generateBasicCode();
throw error;
}
requestScreenshot() {
// Notify renderer process to capture a screenshot
const allWebContents = electron_1.screen.getAllDisplays();
allWebContents.forEach((webContents) => {
//@ts-ignores
webContents.send('request-screenshot');
}
async setupAudioRecording() {
try {
this.recordingProcess = await electron_1.ipcRenderer.invoke('start-audio-recording');
electron_1.ipcRenderer.on('audio-level', this.handleAudioLevel);
electron_1.ipcRenderer.on('audio-chunk', this.handleAudioChunk);
}
catch (error) {
console.error('Error setting up audio recording:', error);
throw new Error(`Failed to setup audio recording: ${error.message}`);
}
}
async processSilence() {
if (this.isProcessingAudio)
return;
this.isProcessingAudio = true;
try {
const audioFilePath = await electron_1.ipcRenderer.invoke('save-audio-chunk');
if (audioFilePath) {
this.currentAudioFile = audioFilePath;
await this.processAudioFile(audioFilePath);
await this.requestScreenshot();
}
}
catch (error) {
console.error('Error processing silence:', error);
}
finally {
this.isProcessingAudio = false;
}
}
async processAudioFile(audioFilePath) {
try {
const audioBuffer = fs.readFileSync(audioFilePath);
const transcription = await this.openAIService.transcribeAudio(new Blob([audioBuffer], { type: 'audio/wav' }));
if (transcription.text.trim()) {
await this.processTranscription(transcription);
}
fs.unlinkSync(audioFilePath);
}
catch (error) {
console.error('Error processing audio file:', error);
}
}
async processTranscription(transcription) {
this.lastTranscription = transcription.text;
const analysis = await this.openAIService.analyzeScreenWithContext({
screenshot: this.currentScreenshot,
transcription: this.lastTranscription,
cursorPosition: await electron_1.ipcRenderer.invoke('get-cursor-position')
});
if (analysis) {
this.events.push({
type: analysis.type,
identifier: analysis.identifier,
value: analysis.value,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
async stopRecording() {
this.recording = false;
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
await electron_1.ipcRenderer.invoke('stop-audio-recording');
electron_1.ipcRenderer.removeListener('audio-level', this.handleAudioLevel);
electron_1.ipcRenderer.removeListener('audio-chunk', this.handleAudioChunk);
electron_1.ipcRenderer.removeListener('keyboard-event', this.keyboardHandleEvent); // Remove keyboard listener
if (this.currentAudioFile && fs.existsSync(this.currentAudioFile)) {
fs.unlinkSync(this.currentAudioFile);
}
return this.generateBasicCode();
}
async requestScreenshot() {
try {
const sources = await electron_1.ipcRenderer.invoke('get-screenshot');
const screenSource = sources[0];
await this.screenshotHandleEvent(null, screenSource.thumbnail);
}
catch (error) {
console.error('Error capturing screenshot:', error);
}
}
async screenshotHandleEvent(_, screenshot) {
this.currentScreenshot = screenshot; // Store the screenshot as a base64 image
this.currentScreenshot = screenshot;
}
async keyboardHandleEvent(_, event) {
if (!this.recording)
return;
this.events.push({
type: 'type',
identifier: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
async mouseHandleEvent(_, event) {
if (!this.recording)
@@ -45,44 +204,39 @@ class RecorderService {
type: 'click',
identifier: element.identifier,
timestamp: Date.now(),
});
}
}
async keyboardHandleEvent(_, event) {
if (!this.recording)
return;
const analysis = await this.openAIService.analyzeScreen(this.currentScreenshot);
const focusedElement = this.findFocusedElement(analysis);
if (focusedElement) {
this.events.push({
type: 'type',
identifier: focusedElement.identifier,
value: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
findElementAtPosition(analysis, x, y) {
//@ts-nocheck
return analysis.elements.find((element) => {
const bounds = element.bounds;
return x >= bounds.x && x <= bounds.x + bounds.width && y >= bounds.y && y <= bounds.y + bounds.height;
return x >= bounds.x &&
x <= bounds.x + bounds.width &&
y >= bounds.y &&
y <= bounds.y + bounds.height;
});
}
findFocusedElement(analysis) {
//@ts-ignore
return analysis.elements.find((element) => element.focused);
}
generateBasicCode() {
let basicCode = '10 REM BotDesktop Automation Script\n';
let lineNumber = 20;
for (const event of this.events) {
basicCode += `${lineNumber} REM ${event.narration}\n`;
lineNumber += 10;
switch (event.type) {
case 'click':
basicCode += `${lineNumber} CLICK "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}" "${event.value}"\n`;
break;
case 'move':
basicCode += `${lineNumber} MOVE "${event.identifier}"\n`;
break;
}
lineNumber += 10;
}
gencode.sh (new executable file, 12 changes)
@@ -0,0 +1,12 @@
#!/bin/bash

# Remove output.txt if it exists to start fresh
rm -f output.txt

# Find all .ts and .tsx files excluding node_modules, and concatenate filename and contents into output.txt
find . -type f \( -name "*.ts" -o -name "*.tsx" \) -not -path "*/node_modules/*" | while read -r file; do
echo -e "\n// File: $file\n" >> output.txt
cat "$file" >> output.txt
done

echo "All TypeScript (.ts and .tsx) code has been combined into output.txt with filenames as headers, excluding node_modules"
package-lock.json (generated, new file, 8905 changes)
File diff suppressed because it is too large.
package.json (file header not captured; path inferred from contents)
@@ -10,13 +10,15 @@
"test": "vitest"
},
"dependencies": {
"dotenv": "^16.4.5",
"node-global-key-listener": "^0.3.0",
"node-mouse": "^0.0.2",
"@types/node": "^20.0.0",
"@types/react": "^18.0.0",
"@types/react-dom": "^18.0.0",
"debounce": "^2.2.0",
"dotenv": "^16.4.5",
"electron": "^28.0.0",
"lodash": "^4.17.21",
"node-global-key-listener": "^0.3.0",
"node-mouse": "^0.0.2",
"openai": "^4.28.0",
"react": "^18.2.0",
"react-dom": "^18.2.0",
src/main/main.ts (file header not captured; path inferred from contents)
@@ -1,6 +1,8 @@
require('dotenv').config();
import { app, BrowserWindow, ipcMain } from 'electron';
import * as path from 'path';

// In main.ts
import { systemPreferences } from 'electron';
import { RecorderService } from '../services/recorder.service';
import { PlayerService } from '../services/player.service';

@@ -56,3 +58,21 @@ ipcMain.handle('stop-recording', async () => {
ipcMain.handle('execute-basic-code', async (_, code: string) => {
await player.executeBasicCode(code);
});


// Add microphone permission check for macOS
ipcMain.handle('check-microphone-permission', async () => {
if (process.platform === 'darwin') {
const status = await systemPreferences.getMediaAccessStatus('microphone');
if (status !== 'granted') {
const success = await systemPreferences.askForMediaAccess('microphone');
return success;
}
return true;
}
// On Windows/Linux, permissions are handled by the OS
return true;
});

// Enable required permissions
app.commandLine.appendSwitch('enable-speech-dispatcher');
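Note: the preload script in this commit is still a placeholder ("// Preload script goes here"), so nothing on the renderer side invokes the IPC handlers registered above. The following is a minimal sketch of a preload bridge, not part of this commit: the 'botDesktop' global name and the return types are assumptions; only the channel names ('check-microphone-permission', 'stop-recording', 'execute-basic-code') come from this diff.

// Hypothetical preload sketch (not in this commit): exposes the main-process
// IPC handlers registered above to the renderer through a contextBridge API.
import { contextBridge, ipcRenderer } from 'electron';

contextBridge.exposeInMainWorld('botDesktop', {
  // Resolves to true once microphone access is granted (always true off macOS).
  checkMicrophonePermission: (): Promise<boolean> =>
    ipcRenderer.invoke('check-microphone-permission'),
  // Assumed to resolve with the generated BASIC script, mirroring RecorderService.stopRecording().
  stopRecording: (): Promise<string> => ipcRenderer.invoke('stop-recording'),
  // Hands a BASIC script to the player service in the main process.
  executeBasicCode: (code: string): Promise<void> =>
    ipcRenderer.invoke('execute-basic-code', code),
});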
src/preload/preload.ts (file header not captured; path inferred from contents)
@@ -1 +0,0 @@
// Preload script goes here
src/services/openai.service.ts (file header not captured; path inferred from contents)
@@ -1,36 +1,155 @@
import { AzureOpenAI } from 'openai';
import * as fs from 'fs';
import { ScreenAnalysis } from './types';
import { ScreenAnalysis, ScreenContext, WhisperResponse, AutomationAction } from './types';

const { Readable } = require('stream');

export class OpenAIService {
private client: AzureOpenAI;

constructor() {
this.client = new AzureOpenAI({ dangerouslyAllowBrowser: true,
this.client = new AzureOpenAI({
dangerouslyAllowBrowser: true,
endpoint: process.env.AZURE_OPEN_AI_ENDPOINT || '',
deployment: process.env.AZURE_OPEN_AI_IMAGE_MODEL || '',
apiVersion: process.env.OPENAI_API_VERSION || '',
apiVersion: process.env.OPENAI_API_VERSION || '2024-02-15-preview',
apiKey: process.env.AZURE_OPEN_AI_KEY || ''
});
}

async analyzeScreen(imagePath: string): Promise<ScreenAnalysis> {
const imageBuffer = fs.readFileSync(imagePath);
const base64Image = imageBuffer.toString('base64');
async transcribeAudio(audioBlob: Blob): Promise<WhisperResponse> {
try {
// Convert Blob to ArrayBuffer
const arrayBuffer = await audioBlob.arrayBuffer();

// Convert Buffer to a Readable stream
const buffer = Buffer.from(arrayBuffer);
const stream = new Readable();
stream.push(buffer);
stream.push(null); // Signal the end of the stream

const response = await this.client.audio.transcriptions.create({
file: stream,
model: process.env.AZURE_OPEN_AI_WHISPER_MODEL || 'whisper-1',
language: 'en',
response_format: 'verbose_json'
}); return {
text: response.text,
//@ts-ignore
segments: response.segments?.map(seg => ({
text: seg.text,
start: seg.start,
end: seg.end
})) || []
};
} catch (error) {
console.error('Error in transcribeAudio:', error);
throw new Error('Failed to transcribe audio');
}
}

async analyzeScreenWithContext(context: ScreenContext): Promise<AutomationAction> {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_LLM_MODEL || '',
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: `You are an AI that analyzes screenshots and voice commands to determine user intentions for automation.
You should identify UI elements and return specific actions in JSON format.
Focus on the area near the cursor position when relevant.`
},
{
role: 'user',
content: [
{ type: 'text', text: 'Analyze this screenshot and identify all interactive elements (buttons, text fields, etc). Return their locations and identifiers.' },
{ type: 'image_url', image_url: { url: `data:image/png;base64,${base64Image}` } }
],
{
type: 'text',
text: `Analyze this screenshot with the following context:
Voice Command: "${context.transcription}"
Cursor Position: x=${context.cursorPosition.x}, y=${context.cursorPosition.y}

Identify the most likely action based on the voice command and cursor position.
Return in format: {
"type": "click|type|move",
"identifier": "element-id or descriptive name",
"value": "text to type (for type actions)",
"confidence": 0-1,
"bounds": {"x": number, "y": number, "width": number, "height": number}
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${context.screenshot}`
}
}
]
}
],
max_tokens: 500,
temperature: 0.3
});

return JSON.parse(response.choices[0].message.content || '{}');
const result = JSON.parse(response.choices[0].message.content || '{}');
return result;
} catch (error) {
console.error('Error in analyzeScreenWithContext:', error);
throw new Error('Failed to analyze screen context');
}
}

async analyzeScreen(screenshot: string): Promise<ScreenAnalysis> {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: 'You are an AI that analyzes screenshots to identify interactive UI elements and their properties.'
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot and identify all interactive elements (buttons, text fields, dropdowns, etc).
For each element, provide:
- Type of element
- Identifier or descriptive name
- Location and size
- Any visible text or labels
- State (focused, disabled, etc)

Return in format: {
"elements": [{
"type": "button|input|dropdown|etc",
"identifier": "element-id or descriptive name",
"bounds": {"x": number, "y": number, "width": number, "height": number},
"text": "visible text",
"state": {"focused": boolean, "disabled": boolean}
}]
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${screenshot}`
}
}
]
}
],
max_tokens: 1000,
temperature: 0.3
});

const result = JSON.parse(response.choices[0].message.content || '{}');
return {
elements: result.elements || [],
timestamp: Date.now()
};
} catch (error) {
console.error('Error in analyzeScreen:', error);
throw new Error('Failed to analyze screen');
}
}
}
src/services/recorder.service.ts (file header not captured; path inferred from contents)
@@ -1,41 +1,192 @@
import { screen, ipcMain } from 'electron';
import { AutomationEvent, ScreenAnalysis } from './types';
import dotenv from 'dotenv';
dotenv.config();
import { OpenAIService } from './openai.service';
import { ipcRenderer } from 'electron';
import { AutomationEvent, ScreenAnalysis, WhisperResponse } from '../services/types';
import { OpenAIService } from '../services/openai.service';
const _ = require('lodash');
import * as path from 'path';
import * as fs from 'fs';

export class RecorderService {
private events: AutomationEvent[] = [];
private recording: boolean = false;
private openAIService: OpenAIService;
private currentScreenshot: string = '';
private lastTranscription: string = '';
private recordingProcess: any = null;
private tempDir: string;
private currentAudioFile: string = '';
private silenceTimer: NodeJS.Timeout | null = null;
private isProcessingAudio: boolean = false;

constructor() {
this.openAIService = new OpenAIService();
this.tempDir = path.join(process.cwd(), 'temp_recordings');
if (!fs.existsSync(this.tempDir)) {
fs.mkdirSync(this.tempDir, { recursive: true });
}
}

public async startRecording() {
try {
this.recording = true;
this.events = [];
this.requestScreenshot();
await this.setupAudioRecording();
await this.requestScreenshot();
ipcRenderer.on('keyboard-event', this.keyboardHandleEvent); // Listen for keyboard events
} catch (error) {
console.error('Failed to start recording:', error);
this.recording = false;
throw error;
}
}

public stopRecording(): string {
private async setupAudioRecording() {
try {
this.recordingProcess = await ipcRenderer.invoke('start-audio-recording');
ipcRenderer.on('audio-level', this.handleAudioLevel);
ipcRenderer.on('audio-chunk', this.handleAudioChunk);
} catch (error) {
console.error('Error setting up audio recording:', error);
throw new Error(`Failed to setup audio recording: ${error.message}`);
}
}

private handleAudioLevel = _.debounce(async (_: any, level: number) => {
if (!this.recording) return;

const SILENCE_THRESHOLD = 0.01;
const SILENCE_DURATION = 1000;

if (level < SILENCE_THRESHOLD) {
if (!this.silenceTimer && !this.isProcessingAudio) {
this.silenceTimer = setTimeout(async () => {
if (this.recording) {
await this.processSilence();
}
}, SILENCE_DURATION);
}
} else {
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
}
}, 100);

private handleAudioChunk = async (_: any, chunk: Buffer) => {
if (!this.recording) return;

try {
const audioFilePath = path.join(this.tempDir, `audio-${Date.now()}.wav`);
fs.writeFileSync(audioFilePath, chunk);

if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
await this.processAudioFile(audioFilePath);
}
} catch (error) {
console.error('Error handling audio chunk:', error);
}
};

private async processSilence() {
if (this.isProcessingAudio) return;

this.isProcessingAudio = true;
try {
const audioFilePath = await ipcRenderer.invoke('save-audio-chunk');
if (audioFilePath) {
this.currentAudioFile = audioFilePath;
await this.processAudioFile(audioFilePath);
await this.requestScreenshot();
}
} catch (error) {
console.error('Error processing silence:', error);
} finally {
this.isProcessingAudio = false;
}
}

private async processAudioFile(audioFilePath: string) {
try {
const audioBuffer = fs.readFileSync(audioFilePath);
const transcription = await this.openAIService.transcribeAudio(
new Blob([audioBuffer], { type: 'audio/wav' })
);

if (transcription.text.trim()) {
await this.processTranscription(transcription);
}

fs.unlinkSync(audioFilePath);
} catch (error) {
console.error('Error processing audio file:', error);
}
}

private async processTranscription(transcription: WhisperResponse) {
this.lastTranscription = transcription.text;

const analysis = await this.openAIService.analyzeScreenWithContext({
screenshot: this.currentScreenshot,
transcription: this.lastTranscription,
cursorPosition: await ipcRenderer.invoke('get-cursor-position')
});

if (analysis) {
this.events.push({
type: analysis.type,
identifier: analysis.identifier,
value: analysis.value,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}

public async stopRecording(): Promise<string> {
this.recording = false;

if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}

await ipcRenderer.invoke('stop-audio-recording');
ipcRenderer.removeListener('audio-level', this.handleAudioLevel);
ipcRenderer.removeListener('audio-chunk', this.handleAudioChunk);
ipcRenderer.removeListener('keyboard-event', this.keyboardHandleEvent); // Remove keyboard listener

if (this.currentAudioFile && fs.existsSync(this.currentAudioFile)) {
fs.unlinkSync(this.currentAudioFile);
}

return this.generateBasicCode();
}

private requestScreenshot() {
// Notify renderer process to capture a screenshot
const allWebContents = screen.getAllDisplays();
allWebContents.forEach((webContents) => {
//@ts-ignores
webContents.send('request-screenshot');
});
private async requestScreenshot() {
try {
const sources = await ipcRenderer.invoke('get-screenshot');
const screenSource = sources[0];
await this.screenshotHandleEvent(null, screenSource.thumbnail);
} catch (error) {
console.error('Error capturing screenshot:', error);
}
}

public async screenshotHandleEvent(_: any, screenshot: string) {
this.currentScreenshot = screenshot; // Store the screenshot as a base64 image
this.currentScreenshot = screenshot;
}

public async keyboardHandleEvent(_: any, event: KeyboardEvent) {
if (!this.recording) return;

this.events.push({
type: 'type',
identifier: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
}

public async mouseHandleEvent(_: any, event: any) {
@@ -49,50 +200,43 @@ export class RecorderService {
type: 'click',
identifier: element.identifier,
timestamp: Date.now(),
});
}
}

public async keyboardHandleEvent(_: any, event: any) {
if (!this.recording) return;

const analysis = await this.openAIService.analyzeScreen(this.currentScreenshot);
const focusedElement = this.findFocusedElement(analysis);

if (focusedElement) {
this.events.push({
type: 'type',
identifier: focusedElement.identifier,
value: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}

private findElementAtPosition(analysis: ScreenAnalysis, x: number, y: number) {
//@ts-nocheck
return analysis.elements.find((element) => {
const bounds = element.bounds;
return x >= bounds.x && x <= bounds.x + bounds.width && y >= bounds.y && y <= bounds.y + bounds.height;
return x >= bounds.x &&
x <= bounds.x + bounds.width &&
y >= bounds.y &&
y <= bounds.y + bounds.height;
});
}

private findFocusedElement(analysis: ScreenAnalysis) {
//@ts-ignore
return analysis.elements.find((element) => element.focused);
}

private generateBasicCode(): string {
let basicCode = '10 REM BotDesktop Automation Script\n';
let lineNumber = 20;

for (const event of this.events) {
basicCode += `${lineNumber} REM ${event.narration}\n`;
lineNumber += 10;

switch (event.type) {
case 'click':
basicCode += `${lineNumber} CLICK "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}" "${event.value}"\n`;
break;
case 'move':
basicCode += `${lineNumber} MOVE "${event.identifier}"\n`;
break;
}
lineNumber += 10;
}
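For reference, generateBasicCode() (in both the compiled and the TypeScript version above) emits a line-numbered BASIC-style script: a REM line carrying the narration, then the command, with the line number advancing by 10 after each. Note that the second case 'type' branch, which would emit the typed value, is unreachable because it duplicates the first. With two hypothetical recorded events, a click narrated "click the save button" on "save-button" and a keystroke narrated "type the file name" on "filename-input", the generated script would read:

10 REM BotDesktop Automation Script
20 REM click the save button
30 CLICK "save-button"
40 REM type the file name
50 TYPE "filename-input"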
src/services/types.ts (file header not captured; path inferred from contents)
@@ -1,11 +1,37 @@
export interface AutomationAction {
type: 'click' | 'type' | 'move';
identifier: string;
value?: string;
confidence: number;
bounds: {
x: number;
y: number;
width: number;
height: number;
};
}

export interface AutomationEvent {
type: 'click' | 'type' | 'move';
identifier: string;
value?: string;
timestamp: number;
narration: string;
}

export interface WhisperResponse {
text: string;
segments: any;
}

export interface ScreenContext {
screenshot: string;
transcription: string;
cursorPosition: { x: number, y: number };
}

export interface ScreenAnalysis {
timestamp: number,
elements: {
identifier: string;
type: string;
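To make the contract between analyzeScreenWithContext() and the recorder concrete, a hypothetical AutomationAction value (all field values invented for illustration) could look like:

const exampleAction: AutomationAction = {
  type: 'click',                 // one of 'click' | 'type' | 'move'
  identifier: 'save-button',     // element id or descriptive name from the vision model
  confidence: 0.92,              // model confidence in the range 0-1
  bounds: { x: 412, y: 310, width: 96, height: 32 } // element rectangle on screen
};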
webpack config (file name not captured)
@@ -2,6 +2,7 @@ const path = require('path');
const HtmlWebpackPlugin = require('html-webpack-plugin');

module.exports = {
devtool: 'source-map',
entry: './src/renderer/index.tsx',
target: 'electron-renderer',
module: {