new(all): Initial import.

commit 5ebde5b646 (parent e6d2ffa35a)
15 changed files with 9645 additions and 151 deletions
.gitignore (vendored, 1 change)
@@ -1,2 +1,3 @@
node_modules
.env
output.txt
.vscode/launch.json (vendored, 4 changes)
@@ -5,12 +5,12 @@
"name": "Electron: Main",
"type": "node",
"request": "launch",
"program": "${workspaceFolder}/node_modules/electron/dist/electron.js",
"sourceMaps": true,
"args": ["${workspaceFolder}/dist/main/main.js"],
"outFiles": ["${workspaceFolder}/dist/**/*.js"],
"cwd": "${workspaceFolder}",
"sourceMaps": true,
"protocol": "inspector",
"console": "integratedTerminal",
"windows": {
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/electron.cmd"
},
dist/main/main.js (vendored, 18 changes)
@@ -23,8 +23,11 @@ var __importStar = (this && this.__importStar) || function (mod) {
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
require('dotenv').config();
const electron_1 = require("electron");
const path = __importStar(require("path"));
// In main.ts
const electron_2 = require("electron");
const recorder_service_1 = require("../services/recorder.service");
const player_service_1 = require("../services/player.service");
const recorder = new recorder_service_1.RecorderService();
@@ -70,3 +73,18 @@ electron_1.ipcMain.handle('stop-recording', async () => {
electron_1.ipcMain.handle('execute-basic-code', async (_, code) => {
await player.executeBasicCode(code);
});
// Add microphone permission check for macOS
electron_1.ipcMain.handle('check-microphone-permission', async () => {
if (process.platform === 'darwin') {
const status = await electron_2.systemPreferences.getMediaAccessStatus('microphone');
if (status !== 'granted') {
const success = await electron_2.systemPreferences.askForMediaAccess('microphone');
return success;
}
return true;
}
// On Windows/Linux, permissions are handled by the OS
return true;
});
// Enable required permissions
electron_1.app.commandLine.appendSwitch('enable-speech-dispatcher');
dist/preload/preload.js (vendored, 1 change)
@@ -1 +0,0 @@
// Preload script goes here
dist/services/openai.service.js (vendored, 164 changes)
@@ -1,56 +1,150 @@
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.OpenAIService = void 0;
const openai_1 = require("openai");
const fs = __importStar(require("fs"));
const { Readable } = require('stream');
class OpenAIService {
constructor() {
this.client = new openai_1.AzureOpenAI({ dangerouslyAllowBrowser: true,
this.client = new openai_1.AzureOpenAI({
dangerouslyAllowBrowser: true,
endpoint: process.env.AZURE_OPEN_AI_ENDPOINT || '',
deployment: process.env.AZURE_OPEN_AI_IMAGE_MODEL || '',
apiVersion: process.env.OPENAI_API_VERSION || '',
apiVersion: process.env.OPENAI_API_VERSION || '2024-02-15-preview',
apiKey: process.env.AZURE_OPEN_AI_KEY || ''
});
}
async analyzeScreen(imagePath) {
const imageBuffer = fs.readFileSync(imagePath);
const base64Image = imageBuffer.toString('base64');
async transcribeAudio(audioBlob) {
try {
// Convert Blob to ArrayBuffer
const arrayBuffer = await audioBlob.arrayBuffer();
// Convert Buffer to a Readable stream
const buffer = Buffer.from(arrayBuffer);
const stream = new Readable();
stream.push(buffer);
stream.push(null); // Signal the end of the stream
const response = await this.client.audio.transcriptions.create({
file: stream,
model: process.env.AZURE_OPEN_AI_WHISPER_MODEL || 'whisper-1',
language: 'en',
response_format: 'verbose_json'
});
return {
text: response.text,
//@ts-ignore
segments: response.segments?.map(seg => ({
text: seg.text,
start: seg.start,
end: seg.end
})) || []
};
}
catch (error) {
console.error('Error in transcribeAudio:', error);
throw new Error('Failed to transcribe audio');
}
}
async analyzeScreenWithContext(context) {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_LLM_MODEL || '',
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: `You are an AI that analyzes screenshots and voice commands to determine user intentions for automation.
You should identify UI elements and return specific actions in JSON format.
Focus on the area near the cursor position when relevant.`
},
{
role: 'user',
content: [
{ type: 'text', text: 'Analyze this screenshot and identify all interactive elements (buttons, text fields, etc). Return their locations and identifiers.' },
{ type: 'image_url', image_url: { url: `data:image/png;base64,${base64Image}` } }
],
{
type: 'text',
text: `Analyze this screenshot with the following context:
Voice Command: "${context.transcription}"
Cursor Position: x=${context.cursorPosition.x}, y=${context.cursorPosition.y}

Identify the most likely action based on the voice command and cursor position.
Return in format: {
"type": "click|type|move",
"identifier": "element-id or descriptive name",
"value": "text to type (for type actions)",
"confidence": 0-1,
"bounds": {"x": number, "y": number, "width": number, "height": number}
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${context.screenshot}`
}
}
]
}
],
max_tokens: 500,
temperature: 0.3
});
return JSON.parse(response.choices[0].message.content || '{}');
const result = JSON.parse(response.choices[0].message.content || '{}');
return result;
}
catch (error) {
console.error('Error in analyzeScreenWithContext:', error);
throw new Error('Failed to analyze screen context');
}
}
async analyzeScreen(screenshot) {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: 'You are an AI that analyzes screenshots to identify interactive UI elements and their properties.'
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot and identify all interactive elements (buttons, text fields, dropdowns, etc).
For each element, provide:
- Type of element
- Identifier or descriptive name
- Location and size
- Any visible text or labels
- State (focused, disabled, etc)

Return in format: {
"elements": [{
"type": "button|input|dropdown|etc",
"identifier": "element-id or descriptive name",
"bounds": {"x": number, "y": number, "width": number, "height": number},
"text": "visible text",
"state": {"focused": boolean, "disabled": boolean}
}]
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${screenshot}`
}
}
]
}
],
max_tokens: 1000,
temperature: 0.3
});
const result = JSON.parse(response.choices[0].message.content || '{}');
return {
elements: result.elements || [],
timestamp: Date.now()
};
}
catch (error) {
console.error('Error in analyzeScreen:', error);
throw new Error('Failed to analyze screen');
}
}
}
exports.OpenAIService = OpenAIService;
dist/services/recorder.service.js (vendored, 222 changes)
@@ -1,39 +1,198 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.RecorderService = void 0;
const electron_1 = require("electron");
const dotenv_1 = __importDefault(require("dotenv"));
dotenv_1.default.config();
const openai_service_1 = require("./openai.service");
const openai_service_1 = require("../services/openai.service");
const _ = require('lodash');
const path = __importStar(require("path"));
const fs = __importStar(require("fs"));
class RecorderService {
constructor() {
this.events = [];
this.recording = false;
this.currentScreenshot = '';
this.lastTranscription = '';
this.recordingProcess = null;
this.currentAudioFile = '';
this.silenceTimer = null;
this.isProcessingAudio = false;
this.handleAudioLevel = _.debounce(async (_, level) => {
if (!this.recording)
return;
const SILENCE_THRESHOLD = 0.01;
const SILENCE_DURATION = 1000;
if (level < SILENCE_THRESHOLD) {
if (!this.silenceTimer && !this.isProcessingAudio) {
this.silenceTimer = setTimeout(async () => {
if (this.recording) {
await this.processSilence();
}
}, SILENCE_DURATION);
}
}
else {
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
}
}, 100);
this.handleAudioChunk = async (_, chunk) => {
if (!this.recording)
return;
try {
const audioFilePath = path.join(this.tempDir, `audio-${Date.now()}.wav`);
fs.writeFileSync(audioFilePath, chunk);
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
await this.processAudioFile(audioFilePath);
}
}
catch (error) {
console.error('Error handling audio chunk:', error);
}
};
this.openAIService = new openai_service_1.OpenAIService();
this.tempDir = path.join(process.cwd(), 'temp_recordings');
if (!fs.existsSync(this.tempDir)) {
fs.mkdirSync(this.tempDir, { recursive: true });
}
}
async startRecording() {
try {
this.recording = true;
this.events = [];
this.requestScreenshot();
await this.setupAudioRecording();
await this.requestScreenshot();
electron_1.ipcRenderer.on('keyboard-event', this.keyboardHandleEvent); // Listen for keyboard events
}
stopRecording() {
catch (error) {
console.error('Failed to start recording:', error);
this.recording = false;
return this.generateBasicCode();
throw error;
}
requestScreenshot() {
// Notify renderer process to capture a screenshot
const allWebContents = electron_1.screen.getAllDisplays();
allWebContents.forEach((webContents) => {
//@ts-ignores
webContents.send('request-screenshot');
}
async setupAudioRecording() {
try {
this.recordingProcess = await electron_1.ipcRenderer.invoke('start-audio-recording');
electron_1.ipcRenderer.on('audio-level', this.handleAudioLevel);
electron_1.ipcRenderer.on('audio-chunk', this.handleAudioChunk);
}
catch (error) {
console.error('Error setting up audio recording:', error);
throw new Error(`Failed to setup audio recording: ${error.message}`);
}
}
async processSilence() {
if (this.isProcessingAudio)
return;
this.isProcessingAudio = true;
try {
const audioFilePath = await electron_1.ipcRenderer.invoke('save-audio-chunk');
if (audioFilePath) {
this.currentAudioFile = audioFilePath;
await this.processAudioFile(audioFilePath);
await this.requestScreenshot();
}
}
catch (error) {
console.error('Error processing silence:', error);
}
finally {
this.isProcessingAudio = false;
}
}
async processAudioFile(audioFilePath) {
try {
const audioBuffer = fs.readFileSync(audioFilePath);
const transcription = await this.openAIService.transcribeAudio(new Blob([audioBuffer], { type: 'audio/wav' }));
if (transcription.text.trim()) {
await this.processTranscription(transcription);
}
fs.unlinkSync(audioFilePath);
}
catch (error) {
console.error('Error processing audio file:', error);
}
}
async processTranscription(transcription) {
this.lastTranscription = transcription.text;
const analysis = await this.openAIService.analyzeScreenWithContext({
screenshot: this.currentScreenshot,
transcription: this.lastTranscription,
cursorPosition: await electron_1.ipcRenderer.invoke('get-cursor-position')
});
if (analysis) {
this.events.push({
type: analysis.type,
identifier: analysis.identifier,
value: analysis.value,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
async stopRecording() {
this.recording = false;
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
await electron_1.ipcRenderer.invoke('stop-audio-recording');
electron_1.ipcRenderer.removeListener('audio-level', this.handleAudioLevel);
electron_1.ipcRenderer.removeListener('audio-chunk', this.handleAudioChunk);
electron_1.ipcRenderer.removeListener('keyboard-event', this.keyboardHandleEvent); // Remove keyboard listener
if (this.currentAudioFile && fs.existsSync(this.currentAudioFile)) {
fs.unlinkSync(this.currentAudioFile);
}
return this.generateBasicCode();
}
async requestScreenshot() {
try {
const sources = await electron_1.ipcRenderer.invoke('get-screenshot');
const screenSource = sources[0];
await this.screenshotHandleEvent(null, screenSource.thumbnail);
}
catch (error) {
console.error('Error capturing screenshot:', error);
}
}
async screenshotHandleEvent(_, screenshot) {
this.currentScreenshot = screenshot; // Store the screenshot as a base64 image
this.currentScreenshot = screenshot;
}
async keyboardHandleEvent(_, event) {
if (!this.recording)
return;
this.events.push({
type: 'type',
identifier: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
async mouseHandleEvent(_, event) {
if (!this.recording)
@@ -45,44 +204,39 @@ class RecorderService {
type: 'click',
identifier: element.identifier,
timestamp: Date.now(),
});
}
}
async keyboardHandleEvent(_, event) {
if (!this.recording)
return;
const analysis = await this.openAIService.analyzeScreen(this.currentScreenshot);
const focusedElement = this.findFocusedElement(analysis);
if (focusedElement) {
this.events.push({
type: 'type',
identifier: focusedElement.identifier,
value: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}
findElementAtPosition(analysis, x, y) {
//@ts-nocheck
return analysis.elements.find((element) => {
const bounds = element.bounds;
return x >= bounds.x && x <= bounds.x + bounds.width && y >= bounds.y && y <= bounds.y + bounds.height;
return x >= bounds.x &&
x <= bounds.x + bounds.width &&
y >= bounds.y &&
y <= bounds.y + bounds.height;
});
}
findFocusedElement(analysis) {
//@ts-ignore
return analysis.elements.find((element) => element.focused);
}
generateBasicCode() {
let basicCode = '10 REM BotDesktop Automation Script\n';
let lineNumber = 20;
for (const event of this.events) {
basicCode += `${lineNumber} REM ${event.narration}\n`;
lineNumber += 10;
switch (event.type) {
case 'click':
basicCode += `${lineNumber} CLICK "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}" "${event.value}"\n`;
break;
case 'move':
basicCode += `${lineNumber} MOVE "${event.identifier}"\n`;
break;
}
lineNumber += 10;
}
gencode.sh (new executable file, 12 changes)
@@ -0,0 +1,12 @@
#!/bin/bash

# Remove output.txt if it exists to start fresh
rm -f output.txt

# Find all .ts and .tsx files excluding node_modules, and concatenate filename and contents into output.txt
find . -type f \( -name "*.ts" -o -name "*.tsx" \) -not -path "*/node_modules/*" | while read -r file; do
echo -e "\n// File: $file\n" >> output.txt
cat "$file" >> output.txt
done

echo "All TypeScript (.ts and .tsx) code has been combined into output.txt with filenames as headers, excluding node_modules"
package-lock.json (generated, new file, 8905 changes)
File diff suppressed because it is too large.
package.json (file header not captured; path inferred from contents)
@@ -10,13 +10,15 @@
"test": "vitest"
},
"dependencies": {
"dotenv": "^16.4.5",
"node-global-key-listener": "^0.3.0",
"node-mouse": "^0.0.2",
"@types/node": "^20.0.0",
"@types/react": "^18.0.0",
"@types/react-dom": "^18.0.0",
"debounce": "^2.2.0",
"dotenv": "^16.4.5",
"electron": "^28.0.0",
"lodash": "^4.17.21",
"node-global-key-listener": "^0.3.0",
"node-mouse": "^0.0.2",
"openai": "^4.28.0",
"react": "^18.2.0",
"react-dom": "^18.2.0",
src/main/main.ts (file header not captured; path inferred from contents)
@@ -1,6 +1,8 @@
require('dotenv').config();
import { app, BrowserWindow, ipcMain } from 'electron';
import * as path from 'path';

// In main.ts
import { systemPreferences } from 'electron';
import { RecorderService } from '../services/recorder.service';
import { PlayerService } from '../services/player.service';

@@ -56,3 +58,21 @@ ipcMain.handle('stop-recording', async () => {
ipcMain.handle('execute-basic-code', async (_, code: string) => {
await player.executeBasicCode(code);
});


// Add microphone permission check for macOS
ipcMain.handle('check-microphone-permission', async () => {
if (process.platform === 'darwin') {
const status = await systemPreferences.getMediaAccessStatus('microphone');
if (status !== 'granted') {
const success = await systemPreferences.askForMediaAccess('microphone');
return success;
}
return true;
}
// On Windows/Linux, permissions are handled by the OS
return true;
});

// Enable required permissions
app.commandLine.appendSwitch('enable-speech-dispatcher');
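Note: the preload script in this commit is still a placeholder ("// Preload script goes here"), so nothing on the renderer side invokes the IPC handlers registered above. The following is a minimal sketch of a preload bridge, not part of this commit: the 'botDesktop' global name and the return types are assumptions; only the channel names ('check-microphone-permission', 'stop-recording', 'execute-basic-code') come from this diff.

// Hypothetical preload sketch (not in this commit): exposes the main-process
// IPC handlers registered above to the renderer through a contextBridge API.
import { contextBridge, ipcRenderer } from 'electron';

contextBridge.exposeInMainWorld('botDesktop', {
  // Resolves to true once microphone access is granted (always true off macOS).
  checkMicrophonePermission: (): Promise<boolean> =>
    ipcRenderer.invoke('check-microphone-permission'),
  // Assumed to resolve with the generated BASIC script, mirroring RecorderService.stopRecording().
  stopRecording: (): Promise<string> => ipcRenderer.invoke('stop-recording'),
  // Hands a BASIC script to the player service in the main process.
  executeBasicCode: (code: string): Promise<void> =>
    ipcRenderer.invoke('execute-basic-code', code),
});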
src/preload/preload.ts (file header not captured; path inferred from contents)
@@ -1 +0,0 @@
// Preload script goes here
src/services/openai.service.ts (file header not captured; path inferred from contents)
@@ -1,36 +1,155 @@
import { AzureOpenAI } from 'openai';
import * as fs from 'fs';
import { ScreenAnalysis } from './types';
import { ScreenAnalysis, ScreenContext, WhisperResponse, AutomationAction } from './types';

const { Readable } = require('stream');

export class OpenAIService {
private client: AzureOpenAI;

constructor() {
this.client = new AzureOpenAI({ dangerouslyAllowBrowser: true,
this.client = new AzureOpenAI({
dangerouslyAllowBrowser: true,
endpoint: process.env.AZURE_OPEN_AI_ENDPOINT || '',
deployment: process.env.AZURE_OPEN_AI_IMAGE_MODEL || '',
apiVersion: process.env.OPENAI_API_VERSION || '',
apiVersion: process.env.OPENAI_API_VERSION || '2024-02-15-preview',
apiKey: process.env.AZURE_OPEN_AI_KEY || ''
});
}

async analyzeScreen(imagePath: string): Promise<ScreenAnalysis> {
const imageBuffer = fs.readFileSync(imagePath);
const base64Image = imageBuffer.toString('base64');
async transcribeAudio(audioBlob: Blob): Promise<WhisperResponse> {
try {
// Convert Blob to ArrayBuffer
const arrayBuffer = await audioBlob.arrayBuffer();

// Convert Buffer to a Readable stream
const buffer = Buffer.from(arrayBuffer);
const stream = new Readable();
stream.push(buffer);
stream.push(null); // Signal the end of the stream

const response = await this.client.audio.transcriptions.create({
file: stream,
model: process.env.AZURE_OPEN_AI_WHISPER_MODEL || 'whisper-1',
language: 'en',
response_format: 'verbose_json'
}); return {
text: response.text,
//@ts-ignore
segments: response.segments?.map(seg => ({
text: seg.text,
start: seg.start,
end: seg.end
})) || []
};
} catch (error) {
console.error('Error in transcribeAudio:', error);
throw new Error('Failed to transcribe audio');
}
}

async analyzeScreenWithContext(context: ScreenContext): Promise<AutomationAction> {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_LLM_MODEL || '',
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: `You are an AI that analyzes screenshots and voice commands to determine user intentions for automation.
You should identify UI elements and return specific actions in JSON format.
Focus on the area near the cursor position when relevant.`
},
{
role: 'user',
content: [
{ type: 'text', text: 'Analyze this screenshot and identify all interactive elements (buttons, text fields, etc). Return their locations and identifiers.' },
{ type: 'image_url', image_url: { url: `data:image/png;base64,${base64Image}` } }
],
{
type: 'text',
text: `Analyze this screenshot with the following context:
Voice Command: "${context.transcription}"
Cursor Position: x=${context.cursorPosition.x}, y=${context.cursorPosition.y}

Identify the most likely action based on the voice command and cursor position.
Return in format: {
"type": "click|type|move",
"identifier": "element-id or descriptive name",
"value": "text to type (for type actions)",
"confidence": 0-1,
"bounds": {"x": number, "y": number, "width": number, "height": number}
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${context.screenshot}`
}
}
]
}
],
max_tokens: 500,
temperature: 0.3
});

return JSON.parse(response.choices[0].message.content || '{}');
const result = JSON.parse(response.choices[0].message.content || '{}');
return result;
} catch (error) {
console.error('Error in analyzeScreenWithContext:', error);
throw new Error('Failed to analyze screen context');
}
}

async analyzeScreen(screenshot: string): Promise<ScreenAnalysis> {
try {
const response = await this.client.chat.completions.create({
model: process.env.AZURE_OPEN_AI_VISION_MODEL || '',
messages: [
{
role: 'system',
content: 'You are an AI that analyzes screenshots to identify interactive UI elements and their properties.'
},
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this screenshot and identify all interactive elements (buttons, text fields, dropdowns, etc).
For each element, provide:
- Type of element
- Identifier or descriptive name
- Location and size
- Any visible text or labels
- State (focused, disabled, etc)

Return in format: {
"elements": [{
"type": "button|input|dropdown|etc",
"identifier": "element-id or descriptive name",
"bounds": {"x": number, "y": number, "width": number, "height": number},
"text": "visible text",
"state": {"focused": boolean, "disabled": boolean}
}]
}`
},
{
type: 'image_url',
image_url: {
url: `data:image/png;base64,${screenshot}`
}
}
]
}
],
max_tokens: 1000,
temperature: 0.3
});

const result = JSON.parse(response.choices[0].message.content || '{}');
return {
elements: result.elements || [],
timestamp: Date.now()
};
} catch (error) {
console.error('Error in analyzeScreen:', error);
throw new Error('Failed to analyze screen');
}
}
}
src/services/recorder.service.ts (file header not captured; path inferred from contents)
@@ -1,41 +1,192 @@
import { screen, ipcMain } from 'electron';
import { AutomationEvent, ScreenAnalysis } from './types';
import dotenv from 'dotenv';
dotenv.config();
import { OpenAIService } from './openai.service';
import { ipcRenderer } from 'electron';
import { AutomationEvent, ScreenAnalysis, WhisperResponse } from '../services/types';
import { OpenAIService } from '../services/openai.service';
const _ = require('lodash');
import * as path from 'path';
import * as fs from 'fs';

export class RecorderService {
private events: AutomationEvent[] = [];
private recording: boolean = false;
private openAIService: OpenAIService;
private currentScreenshot: string = '';
private lastTranscription: string = '';
private recordingProcess: any = null;
private tempDir: string;
private currentAudioFile: string = '';
private silenceTimer: NodeJS.Timeout | null = null;
private isProcessingAudio: boolean = false;

constructor() {
this.openAIService = new OpenAIService();
this.tempDir = path.join(process.cwd(), 'temp_recordings');
if (!fs.existsSync(this.tempDir)) {
fs.mkdirSync(this.tempDir, { recursive: true });
}
}

public async startRecording() {
try {
this.recording = true;
this.events = [];
this.requestScreenshot();
await this.setupAudioRecording();
await this.requestScreenshot();
ipcRenderer.on('keyboard-event', this.keyboardHandleEvent); // Listen for keyboard events
} catch (error) {
console.error('Failed to start recording:', error);
this.recording = false;
throw error;
}
}

public stopRecording(): string {
private async setupAudioRecording() {
try {
this.recordingProcess = await ipcRenderer.invoke('start-audio-recording');
ipcRenderer.on('audio-level', this.handleAudioLevel);
ipcRenderer.on('audio-chunk', this.handleAudioChunk);
} catch (error) {
console.error('Error setting up audio recording:', error);
throw new Error(`Failed to setup audio recording: ${error.message}`);
}
}

private handleAudioLevel = _.debounce(async (_: any, level: number) => {
if (!this.recording) return;

const SILENCE_THRESHOLD = 0.01;
const SILENCE_DURATION = 1000;

if (level < SILENCE_THRESHOLD) {
if (!this.silenceTimer && !this.isProcessingAudio) {
this.silenceTimer = setTimeout(async () => {
if (this.recording) {
await this.processSilence();
}
}, SILENCE_DURATION);
}
} else {
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
}
}, 100);

private handleAudioChunk = async (_: any, chunk: Buffer) => {
if (!this.recording) return;

try {
const audioFilePath = path.join(this.tempDir, `audio-${Date.now()}.wav`);
fs.writeFileSync(audioFilePath, chunk);

if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
await this.processAudioFile(audioFilePath);
}
} catch (error) {
console.error('Error handling audio chunk:', error);
}
};

private async processSilence() {
if (this.isProcessingAudio) return;

this.isProcessingAudio = true;
try {
const audioFilePath = await ipcRenderer.invoke('save-audio-chunk');
if (audioFilePath) {
this.currentAudioFile = audioFilePath;
await this.processAudioFile(audioFilePath);
await this.requestScreenshot();
}
} catch (error) {
console.error('Error processing silence:', error);
} finally {
this.isProcessingAudio = false;
}
}

private async processAudioFile(audioFilePath: string) {
try {
const audioBuffer = fs.readFileSync(audioFilePath);
const transcription = await this.openAIService.transcribeAudio(
new Blob([audioBuffer], { type: 'audio/wav' })
);

if (transcription.text.trim()) {
await this.processTranscription(transcription);
}

fs.unlinkSync(audioFilePath);
} catch (error) {
console.error('Error processing audio file:', error);
}
}

private async processTranscription(transcription: WhisperResponse) {
this.lastTranscription = transcription.text;

const analysis = await this.openAIService.analyzeScreenWithContext({
screenshot: this.currentScreenshot,
transcription: this.lastTranscription,
cursorPosition: await ipcRenderer.invoke('get-cursor-position')
});

if (analysis) {
this.events.push({
type: analysis.type,
identifier: analysis.identifier,
value: analysis.value,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}

public async stopRecording(): Promise<string> {
this.recording = false;

if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}

await ipcRenderer.invoke('stop-audio-recording');
ipcRenderer.removeListener('audio-level', this.handleAudioLevel);
ipcRenderer.removeListener('audio-chunk', this.handleAudioChunk);
ipcRenderer.removeListener('keyboard-event', this.keyboardHandleEvent); // Remove keyboard listener

if (this.currentAudioFile && fs.existsSync(this.currentAudioFile)) {
fs.unlinkSync(this.currentAudioFile);
}

return this.generateBasicCode();
}

private requestScreenshot() {
// Notify renderer process to capture a screenshot
const allWebContents = screen.getAllDisplays();
allWebContents.forEach((webContents) => {
//@ts-ignores
webContents.send('request-screenshot');
});
private async requestScreenshot() {
try {
const sources = await ipcRenderer.invoke('get-screenshot');
const screenSource = sources[0];
await this.screenshotHandleEvent(null, screenSource.thumbnail);
} catch (error) {
console.error('Error capturing screenshot:', error);
}
}

public async screenshotHandleEvent(_: any, screenshot: string) {
this.currentScreenshot = screenshot; // Store the screenshot as a base64 image
this.currentScreenshot = screenshot;
}

public async keyboardHandleEvent(_: any, event: KeyboardEvent) {
if (!this.recording) return;

this.events.push({
type: 'type',
identifier: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
}

public async mouseHandleEvent(_: any, event: any) {
@@ -49,50 +200,43 @@ export class RecorderService {
type: 'click',
identifier: element.identifier,
timestamp: Date.now(),
});
}
}

public async keyboardHandleEvent(_: any, event: any) {
if (!this.recording) return;

const analysis = await this.openAIService.analyzeScreen(this.currentScreenshot);
const focusedElement = this.findFocusedElement(analysis);

if (focusedElement) {
this.events.push({
type: 'type',
identifier: focusedElement.identifier,
value: event.key,
timestamp: Date.now(),
narration: this.lastTranscription
});
}
}

private findElementAtPosition(analysis: ScreenAnalysis, x: number, y: number) {
//@ts-nocheck
return analysis.elements.find((element) => {
const bounds = element.bounds;
return x >= bounds.x && x <= bounds.x + bounds.width && y >= bounds.y && y <= bounds.y + bounds.height;
return x >= bounds.x &&
x <= bounds.x + bounds.width &&
y >= bounds.y &&
y <= bounds.y + bounds.height;
});
}

private findFocusedElement(analysis: ScreenAnalysis) {
//@ts-ignore
return analysis.elements.find((element) => element.focused);
}

private generateBasicCode(): string {
let basicCode = '10 REM BotDesktop Automation Script\n';
let lineNumber = 20;

for (const event of this.events) {
basicCode += `${lineNumber} REM ${event.narration}\n`;
lineNumber += 10;

switch (event.type) {
case 'click':
basicCode += `${lineNumber} CLICK "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}"\n`;
break;
case 'type':
basicCode += `${lineNumber} TYPE "${event.identifier}" "${event.value}"\n`;
break;
case 'move':
basicCode += `${lineNumber} MOVE "${event.identifier}"\n`;
break;
}
lineNumber += 10;
}
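For reference, generateBasicCode() (in both the compiled and the TypeScript version above) emits a line-numbered BASIC-style script: a REM line carrying the narration, then the command, with the line number advancing by 10 after each. Note that the second case 'type' branch, which would emit the typed value, is unreachable because it duplicates the first. With two hypothetical recorded events, a click narrated "click the save button" on "save-button" and a keystroke narrated "type the file name" on "filename-input", the generated script would read:

10 REM BotDesktop Automation Script
20 REM click the save button
30 CLICK "save-button"
40 REM type the file name
50 TYPE "filename-input"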
src/services/types.ts (file header not captured; path inferred from contents)
@@ -1,11 +1,37 @@
export interface AutomationAction {
type: 'click' | 'type' | 'move';
identifier: string;
value?: string;
confidence: number;
bounds: {
x: number;
y: number;
width: number;
height: number;
};
}

export interface AutomationEvent {
type: 'click' | 'type' | 'move';
identifier: string;
value?: string;
timestamp: number;
narration: string;
}

export interface WhisperResponse {
text: string;
segments: any;
}

export interface ScreenContext {
screenshot: string;
transcription: string;
cursorPosition: { x: number, y: number };
}

export interface ScreenAnalysis {
timestamp: number,
elements: {
identifier: string;
type: string;
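To make the contract between analyzeScreenWithContext() and the recorder concrete, a hypothetical AutomationAction value (all field values invented for illustration) could look like:

const exampleAction: AutomationAction = {
  type: 'click',                 // one of 'click' | 'type' | 'move'
  identifier: 'save-button',     // element id or descriptive name from the vision model
  confidence: 0.92,              // model confidence in the range 0-1
  bounds: { x: 412, y: 310, width: 96, height: 32 } // element rectangle on screen
};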
webpack config (file name not captured)
@@ -2,6 +2,7 @@ const path = require('path');
const HtmlWebpackPlugin = require('html-webpack-plugin');

module.exports = {
devtool: 'source-map',
entry: './src/renderer/index.tsx',
target: 'electron-renderer',
module: {