new(all): Initial import.

me@rodrigorodriguez.com committed 2024-10-26 16:26:11 -03:00
parent e6d2ffa35a
commit 5ebde5b646
15 changed files with 9645 additions and 151 deletions

.gitignore (vendored): 1 line changed

@@ -1,2 +1,3 @@
node_modules
.env
output.txt

.vscode/launch.json (vendored): 4 lines changed

@@ -5,12 +5,12 @@
"name": "Electron: Main",
"type": "node",
"request": "launch",
"sourceMaps": true,
"args": ["${workspaceFolder}/dist/main/main.js"],
"outFiles": ["${workspaceFolder}/dist/**/*.js"],
"cwd": "${workspaceFolder}",
"protocol": "inspector",
"windows": {
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/electron.cmd"
},

dist/main/main.js (vendored): 18 lines changed

@@ -23,8 +23,11 @@ var __importStar = (this && this.__importStar) || function (mod) {
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
require('dotenv').config();
const electron_1 = require("electron");
const path = __importStar(require("path"));
// In main.ts
const electron_2 = require("electron");
const recorder_service_1 = require("../services/recorder.service");
const player_service_1 = require("../services/player.service");
const recorder = new recorder_service_1.RecorderService();
@@ -70,3 +73,18 @@ electron_1.ipcMain.handle('stop-recording', async () => {
electron_1.ipcMain.handle('execute-basic-code', async (_, code) => {
await player.executeBasicCode(code);
});
// Add microphone permission check for macOS
electron_1.ipcMain.handle('check-microphone-permission', async () => {
if (process.platform === 'darwin') {
const status = await electron_2.systemPreferences.getMediaAccessStatus('microphone');
if (status !== 'granted') {
const success = await electron_2.systemPreferences.askForMediaAccess('microphone');
return success;
}
return true;
}
// On Windows/Linux, permissions are handled by the OS
return true;
});
// Enable required permissions
electron_1.app.commandLine.appendSwitch('enable-speech-dispatcher');


@@ -1 +0,0 @@
// Preload script goes here


@@ -1,56 +1,150 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.OpenAIService = void 0;
const openai_1 = require("openai");
const { Readable } = require('stream');
class OpenAIService {
constructor() {
this.client = new openai_1.AzureOpenAI({
dangerouslyAllowBrowser: true,
endpoint: process.env.AZURE_OPEN_AI_ENDPOINT || '',
apiVersion: process.env.OPENAI_API_VERSION || '2024-02-15-preview',
apiKey: process.env.AZURE_OPEN_AI_KEY || ''
});
}
async transcribeAudio(audioBlob) {
try {
// Convert Blob to ArrayBuffer
const arrayBuffer = await audioBlob.arrayBuffer();
// Convert Buffer to a Readable stream
const buffer = Buffer.from(arrayBuffer);
const stream = new Readable();
stream.push(buffer);
stream.push(null); // Signal the end of the stream
const response = await this.client.audio.transcriptions.create({
file: stream,
model: process.env.AZURE_OPEN_AI_WHISPER_MODEL || 'whisper-1',
language: 'en',
response_format: 'verbose_json'
});
return {
text: response.text,
//@ts-ignore
segments: response.segments?.map(seg => ({
text: seg.text,
start: seg.start,
end: seg.end
})) || []
};
}
catch (error) {
console.error('Error in transcribeAudio:', error);
throw new Error('Failed to transcribe audio');
}
}
async analyzeScreenWithContext(context) {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: `You are an AI that analyzes screenshots and voice commands to determine user intentions for automation.
You should identify UI elements and return specific actions in JSON format.
Focus on the area near the cursor position when relevant.`
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot with the following context:
Voice Command: "${context.transcription}"
Cursor Position: x=${context.cursorPosition.x}, y=${context.cursorPosition.y}
Identify the most likely action based on the voice command and cursor position.
Return in format: {
"type": "click|type|move",
"identifier": "element-id or descriptive name",
"value": "text to type (for type actions)",
"confidence": 0-1,
"bounds": {"x": number, "y": number, "width": number, "height": number}
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${context.screenshot}`
}
}
]
}
],
max_tokens: 500,
temperature: 0.3
});
const result = JSON.parse(response.choices[0].message.content || '{}');
return result;
}
catch (error) {
console.error('Error in analyzeScreenWithContext:', error);
throw new Error('Failed to analyze screen context');
}
}
async analyzeScreen(screenshot) {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: 'You are an AI that analyzes screenshots to identify interactive UI elements and their properties.'
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot and identify all interactive elements (buttons, text fields, dropdowns, etc).
For each element, provide:
- Type of element
- Identifier or descriptive name
- Location and size
- Any visible text or labels
- State (focused, disabled, etc)
Return in format: {
"elements": [{
"type": "button|input|dropdown|etc",
"identifier": "element-id or descriptive name",
"bounds": {"x": number, "y": number, "width": number, "height": number},
"text": "visible text",
"state": {"focused": boolean, "disabled": boolean}
}]
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${screenshot}`
}
}
]
}
],
max_tokens: 1000,
temperature: 0.3
});
const result = JSON.parse(response.choices[0].message.content || '{}');
return {
elements: result.elements || [],
timestamp: Date.now()
};
}
catch (error) {
console.error('Error in analyzeScreen:', error);
throw new Error('Failed to analyze screen');
}
}
}
exports.OpenAIService = OpenAIService;


@@ -1,39 +1,198 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.RecorderService = void 0;
const electron_1 = require("electron");
const openai_service_1 = require("../services/openai.service");
const _ = require('lodash');
const path = __importStar(require("path"));
const fs = __importStar(require("fs"));
class RecorderService {
constructor() {
this.events = [];
this.recording = false;
this.currentScreenshot = '';
this.lastTranscription = '';
this.recordingProcess = null;
this.currentAudioFile = '';
this.silenceTimer = null;
this.isProcessingAudio = false;
this.handleAudioLevel = _.debounce(async (_, level) => {
if (!this.recording)
return;
const SILENCE_THRESHOLD = 0.01;
const SILENCE_DURATION = 1000;
if (level < SILENCE_THRESHOLD) {
if (!this.silenceTimer && !this.isProcessingAudio) {
this.silenceTimer = setTimeout(async () => {
if (this.recording) {
await this.processSilence();
}
}, SILENCE_DURATION);
}
}
else {
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
}
}, 100);
this.handleAudioChunk = async (_, chunk) => {
if (!this.recording)
return;
try {
const audioFilePath = path.join(this.tempDir, `audio-${Date.now()}.wav`);
fs.writeFileSync(audioFilePath, chunk);
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
await this.processAudioFile(audioFilePath);
}
}
catch (error) {
console.error('Error handling audio chunk:', error);
}
};
this.openAIService = new openai_service_1.OpenAIService();
this.tempDir = path.join(process.cwd(), 'temp_recordings');
if (!fs.existsSync(this.tempDir)) {
fs.mkdirSync(this.tempDir, { recursive: true });
}
}
async startRecording() {
try {
this.recording = true;
this.events = [];
await this.setupAudioRecording();
await this.requestScreenshot();
electron_1.ipcRenderer.on('keyboard-event', this.keyboardHandleEvent); // Listen for keyboard events
}
catch (error) {
console.error('Failed to start recording:', error);
this.recording = false;
throw error;
}
}
async setupAudioRecording() {
try {
this.recordingProcess = await electron_1.ipcRenderer.invoke('start-audio-recording');
electron_1.ipcRenderer.on('audio-level', this.handleAudioLevel);
electron_1.ipcRenderer.on('audio-chunk', this.handleAudioChunk);
}
catch (error) {
console.error('Error setting up audio recording:', error);
throw new Error(`Failed to setup audio recording: ${error.message}`);
}
}
async processSilence() {
if (this.isProcessingAudio)
return;
this.isProcessingAudio = true;
try {
const audioFilePath = await electron_1.ipcRenderer.invoke('save-audio-chunk');
if (audioFilePath) {
this.currentAudioFile = audioFilePath;
await this.processAudioFile(audioFilePath);
await this.requestScreenshot();
}
}
catch (error) {
console.error('Error processing silence:', error);
}
finally {
this.isProcessingAudio = false;
}
}
async processAudioFile(audioFilePath) {
try {
const audioBuffer = fs.readFileSync(audioFilePath);
const transcription = await this.openAIService.transcribeAudio(new Blob([audioBuffer], { type: 'audio/wav' }));
if (transcription.text.trim()) {
await this.processTranscription(transcription);
}
fs.unlinkSync(audioFilePath);
}
catch (error) {
console.error('Error processing audio file:', error);
}
}
async processTranscription(transcription) {
this.lastTranscription = transcription.text;
const analysis = await this.openAIService.analyzeScreenWithContext({
screenshot: this.currentScreenshot,
transcription: this.lastTranscription,
cursorPosition: await electron_1.ipcRenderer.invoke('get-cursor-position')
});
if (analysis) {
this.events.push({
type: analysis.type,
identifier: analysis.identifier,
value: analysis.value,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
async stopRecording() {
this.recording = false;
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
await electron_1.ipcRenderer.invoke('stop-audio-recording');
electron_1.ipcRenderer.removeListener('audio-level', this.handleAudioLevel);
electron_1.ipcRenderer.removeListener('audio-chunk', this.handleAudioChunk);
electron_1.ipcRenderer.removeListener('keyboard-event', this.keyboardHandleEvent); // Remove keyboard listener
if (this.currentAudioFile && fs.existsSync(this.currentAudioFile)) {
fs.unlinkSync(this.currentAudioFile);
}
return this.generateBasicCode();
}
async requestScreenshot() {
try {
const sources = await electron_1.ipcRenderer.invoke('get-screenshot');
const screenSource = sources[0];
await this.screenshotHandleEvent(null, screenSource.thumbnail);
}
catch (error) {
console.error('Error capturing screenshot:', error);
}
}
async screenshotHandleEvent(_, screenshot) {
this.currentScreenshot = screenshot;
}
async keyboardHandleEvent(_, event) {
if (!this.recording)
return;
this.events.push({
type: 'type',
identifier: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
async mouseHandleEvent(_, event) {
if (!this.recording)
@@ -45,44 +204,39 @@ class RecorderService {
type: 'click',
identifier: element.identifier,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
findElementAtPosition(analysis, x, y) {
//@ts-nocheck
return analysis.elements.find((element) => {
const bounds = element.bounds;
return x >= bounds.x &&
x <= bounds.x + bounds.width &&
y >= bounds.y &&
y <= bounds.y + bounds.height;
});
}
generateBasicCode() {
let basicCode = '10 REM BotDesktop Automation Script\n';
let lineNumber = 20;
for (const event of this.events) {
basicCode += `${lineNumber} REM ${event.narration}\n`;
lineNumber += 10;
switch (event.type) {
case 'click':
basicCode += `${lineNumber} CLICK "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}" "${event.value}"\n`;
break;
case 'move':
basicCode += `${lineNumber} MOVE "${event.identifier}"\n`;
break;
}
lineNumber += 10;
}

gencode.sh (new executable file): 12 lines

@@ -0,0 +1,12 @@
#!/bin/bash
# Remove output.txt if it exists to start fresh
rm -f output.txt
# Find all .ts and .tsx files excluding node_modules, and concatenate filename and contents into output.txt
find . -type f \( -name "*.ts" -o -name "*.tsx" \) -not -path "*/node_modules/*" | while read -r file; do
echo -e "\n// File: $file\n" >> output.txt
cat "$file" >> output.txt
done
echo "All TypeScript (.ts and .tsx) code has been combined into output.txt with filenames as headers, excluding node_modules"

package-lock.json (generated, new file): 8905 lines. File diff suppressed because it is too large.


@@ -10,13 +10,15 @@
"test": "vitest"
},
"dependencies": {
"@types/node": "^20.0.0",
"@types/react": "^18.0.0",
"@types/react-dom": "^18.0.0",
"debounce": "^2.2.0",
"dotenv": "^16.4.5",
"electron": "^28.0.0", "electron": "^28.0.0",
"lodash": "^4.17.21",
"node-global-key-listener": "^0.3.0",
"node-mouse": "^0.0.2",
"openai": "^4.28.0", "openai": "^4.28.0",
"react": "^18.2.0", "react": "^18.2.0",
"react-dom": "^18.2.0", "react-dom": "^18.2.0",


@@ -1,6 +1,8 @@
require('dotenv').config();
import { app, BrowserWindow, ipcMain } from 'electron';
import * as path from 'path';
// In main.ts
import { systemPreferences } from 'electron';
import { RecorderService } from '../services/recorder.service';
import { PlayerService } from '../services/player.service';
@@ -56,3 +58,21 @@ ipcMain.handle('stop-recording', async () => {
ipcMain.handle('execute-basic-code', async (_, code: string) => {
await player.executeBasicCode(code);
});
// Add microphone permission check for macOS
ipcMain.handle('check-microphone-permission', async () => {
if (process.platform === 'darwin') {
const status = await systemPreferences.getMediaAccessStatus('microphone');
if (status !== 'granted') {
const success = await systemPreferences.askForMediaAccess('microphone');
return success;
}
return true;
}
// On Windows/Linux, permissions are handled by the OS
return true;
});
// Enable required permissions
app.commandLine.appendSwitch('enable-speech-dispatcher');


@@ -1 +0,0 @@
// Preload script goes here


@@ -1,36 +1,155 @@
import { AzureOpenAI } from 'openai';
import * as fs from 'fs';
import { ScreenAnalysis, ScreenContext, WhisperResponse, AutomationAction } from './types';
const { Readable } = require('stream');
export class OpenAIService {
private client: AzureOpenAI;
constructor() {
this.client = new AzureOpenAI({
dangerouslyAllowBrowser: true,
endpoint: process.env.AZURE_OPEN_AI_ENDPOINT || '',
apiVersion: process.env.OPENAI_API_VERSION || '2024-02-15-preview',
apiKey: process.env.AZURE_OPEN_AI_KEY || ''
});
}
async transcribeAudio(audioBlob: Blob): Promise<WhisperResponse> {
try {
// Convert Blob to ArrayBuffer
const arrayBuffer = await audioBlob.arrayBuffer();
// Convert Buffer to a Readable stream
const buffer = Buffer.from(arrayBuffer);
const stream = new Readable();
stream.push(buffer);
stream.push(null); // Signal the end of the stream
const response = await this.client.audio.transcriptions.create({
file: stream,
model: process.env.AZURE_OPEN_AI_WHISPER_MODEL || 'whisper-1',
language: 'en',
response_format: 'verbose_json'
});
return {
text: response.text,
//@ts-ignore
segments: response.segments?.map(seg => ({
text: seg.text,
start: seg.start,
end: seg.end
})) || []
};
} catch (error) {
console.error('Error in transcribeAudio:', error);
throw new Error('Failed to transcribe audio');
}
}
async analyzeScreenWithContext(context: ScreenContext): Promise<AutomationAction> {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: `You are an AI that analyzes screenshots and voice commands to determine user intentions for automation.
You should identify UI elements and return specific actions in JSON format.
Focus on the area near the cursor position when relevant.`
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot with the following context:
Voice Command: "${context.transcription}"
Cursor Position: x=${context.cursorPosition.x}, y=${context.cursorPosition.y}
Identify the most likely action based on the voice command and cursor position.
Return in format: {
"type": "click|type|move",
"identifier": "element-id or descriptive name",
"value": "text to type (for type actions)",
"confidence": 0-1,
"bounds": {"x": number, "y": number, "width": number, "height": number}
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${context.screenshot}`
}
}
]
}
],
max_tokens: 500,
temperature: 0.3
});
const result = JSON.parse(response.choices[0].message.content || '{}');
return result;
} catch (error) {
console.error('Error in analyzeScreenWithContext:', error);
throw new Error('Failed to analyze screen context');
}
}
async analyzeScreen(screenshot: string): Promise<ScreenAnalysis> {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: 'You are an AI that analyzes screenshots to identify interactive UI elements and their properties.'
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot and identify all interactive elements (buttons, text fields, dropdowns, etc).
For each element, provide:
- Type of element
- Identifier or descriptive name
- Location and size
- Any visible text or labels
- State (focused, disabled, etc)
Return in format: {
"elements": [{
"type": "button|input|dropdown|etc",
"identifier": "element-id or descriptive name",
"bounds": {"x": number, "y": number, "width": number, "height": number},
"text": "visible text",
"state": {"focused": boolean, "disabled": boolean}
}]
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${screenshot}`
}
}
]
}
],
max_tokens: 1000,
temperature: 0.3
});
const result = JSON.parse(response.choices[0].message.content || '{}');
return {
elements: result.elements || [],
timestamp: Date.now()
};
} catch (error) {
console.error('Error in analyzeScreen:', error);
throw new Error('Failed to analyze screen');
}
}
}


@@ -1,41 +1,192 @@
import { ipcRenderer } from 'electron';
import { AutomationEvent, ScreenAnalysis, WhisperResponse } from '../services/types';
import { OpenAIService } from '../services/openai.service';
const _ = require('lodash');
import * as path from 'path';
import * as fs from 'fs';
export class RecorderService {
private events: AutomationEvent[] = [];
private recording: boolean = false;
private openAIService: OpenAIService;
private currentScreenshot: string = '';
private lastTranscription: string = '';
private recordingProcess: any = null;
private tempDir: string;
private currentAudioFile: string = '';
private silenceTimer: NodeJS.Timeout | null = null;
private isProcessingAudio: boolean = false;
constructor() {
this.openAIService = new OpenAIService();
this.tempDir = path.join(process.cwd(), 'temp_recordings');
if (!fs.existsSync(this.tempDir)) {
fs.mkdirSync(this.tempDir, { recursive: true });
}
}
public async startRecording() {
try {
this.recording = true;
this.events = [];
await this.setupAudioRecording();
await this.requestScreenshot();
ipcRenderer.on('keyboard-event', this.keyboardHandleEvent); // Listen for keyboard events
} catch (error) {
console.error('Failed to start recording:', error);
this.recording = false;
throw error;
}
}
private async setupAudioRecording() {
try {
this.recordingProcess = await ipcRenderer.invoke('start-audio-recording');
ipcRenderer.on('audio-level', this.handleAudioLevel);
ipcRenderer.on('audio-chunk', this.handleAudioChunk);
} catch (error) {
console.error('Error setting up audio recording:', error);
throw new Error(`Failed to setup audio recording: ${error.message}`);
}
}
private handleAudioLevel = _.debounce(async (_: any, level: number) => {
if (!this.recording) return;
const SILENCE_THRESHOLD = 0.01;
const SILENCE_DURATION = 1000;
if (level < SILENCE_THRESHOLD) {
if (!this.silenceTimer && !this.isProcessingAudio) {
this.silenceTimer = setTimeout(async () => {
if (this.recording) {
await this.processSilence();
}
}, SILENCE_DURATION);
}
} else {
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
}
}, 100);
private handleAudioChunk = async (_: any, chunk: Buffer) => {
if (!this.recording) return;
try {
const audioFilePath = path.join(this.tempDir, `audio-${Date.now()}.wav`);
fs.writeFileSync(audioFilePath, chunk);
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
await this.processAudioFile(audioFilePath);
}
} catch (error) {
console.error('Error handling audio chunk:', error);
}
};
private async processSilence() {
if (this.isProcessingAudio) return;
this.isProcessingAudio = true;
try {
const audioFilePath = await ipcRenderer.invoke('save-audio-chunk');
if (audioFilePath) {
this.currentAudioFile = audioFilePath;
await this.processAudioFile(audioFilePath);
await this.requestScreenshot();
}
} catch (error) {
console.error('Error processing silence:', error);
} finally {
this.isProcessingAudio = false;
}
}
private async processAudioFile(audioFilePath: string) {
try {
const audioBuffer = fs.readFileSync(audioFilePath);
const transcription = await this.openAIService.transcribeAudio(
new Blob([audioBuffer], { type: 'audio/wav' })
);
if (transcription.text.trim()) {
await this.processTranscription(transcription);
}
fs.unlinkSync(audioFilePath);
} catch (error) {
console.error('Error processing audio file:', error);
}
}
private async processTranscription(transcription: WhisperResponse) {
this.lastTranscription = transcription.text;
const analysis = await this.openAIService.analyzeScreenWithContext({
screenshot: this.currentScreenshot,
transcription: this.lastTranscription,
cursorPosition: await ipcRenderer.invoke('get-cursor-position')
});
if (analysis) {
this.events.push({
type: analysis.type,
identifier: analysis.identifier,
value: analysis.value,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
public async stopRecording(): Promise<string> {
this.recording = false;
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
await ipcRenderer.invoke('stop-audio-recording');
ipcRenderer.removeListener('audio-level', this.handleAudioLevel);
ipcRenderer.removeListener('audio-chunk', this.handleAudioChunk);
ipcRenderer.removeListener('keyboard-event', this.keyboardHandleEvent); // Remove keyboard listener
if (this.currentAudioFile && fs.existsSync(this.currentAudioFile)) {
fs.unlinkSync(this.currentAudioFile);
}
return this.generateBasicCode();
}
private async requestScreenshot() {
try {
const sources = await ipcRenderer.invoke('get-screenshot');
const screenSource = sources[0];
await this.screenshotHandleEvent(null, screenSource.thumbnail);
} catch (error) {
console.error('Error capturing screenshot:', error);
}
}
public async screenshotHandleEvent(_: any, screenshot: string) {
this.currentScreenshot = screenshot;
}
public async keyboardHandleEvent(_: any, event: KeyboardEvent) {
if (!this.recording) return;
this.events.push({
type: 'type',
identifier: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
} }
public async mouseHandleEvent(_: any, event: any) {
@@ -49,50 +200,43 @@ export class RecorderService {
type: 'click',
identifier: element.identifier,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
private findElementAtPosition(analysis: ScreenAnalysis, x: number, y: number) {
//@ts-nocheck
return analysis.elements.find((element) => {
const bounds = element.bounds;
return x >= bounds.x &&
x <= bounds.x + bounds.width &&
y >= bounds.y &&
y <= bounds.y + bounds.height;
});
}
private generateBasicCode(): string {
let basicCode = '10 REM BotDesktop Automation Script\n';
let lineNumber = 20;
for (const event of this.events) {
basicCode += `${lineNumber} REM ${event.narration}\n`;
lineNumber += 10;
switch (event.type) {
case 'click':
basicCode += `${lineNumber} CLICK "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}" "${event.value}"\n`;
break;
case 'move':
basicCode += `${lineNumber} MOVE "${event.identifier}"\n`;
break;
}
lineNumber += 10;
}


@@ -1,11 +1,37 @@
export interface AutomationAction {
type: 'click' | 'type' | 'move';
identifier: string;
value?: string;
confidence: number;
bounds: {
x: number;
y: number;
width: number;
height: number;
};
}
export interface AutomationEvent {
type: 'click' | 'type' | 'move';
identifier: string;
value?: string;
timestamp: number;
narration: string;
}
export interface WhisperResponse {
text: string;
segments:any;
}
export interface ScreenContext {
screenshot: string;
transcription: string;
cursorPosition: { x: number, y: number };
}
export interface ScreenAnalysis {
timestamp: number,
elements: {
identifier: string;
type: string;


@@ -2,6 +2,7 @@ const path = require('path');
const HtmlWebpackPlugin = require('html-webpack-plugin');
module.exports = {
devtool: 'source-map',
entry: './src/renderer/index.tsx',
target: 'electron-renderer',
module: {