Build conversational voice experiences with StateSet’s Voice AI capabilities
Voice AI Architecture
npm install stateset-node @stateset/voice
import { StateSetClient, VoiceClient } from 'stateset-node';
const client = new StateSetClient({
apiKey: process.env.STATESET_API_KEY
});
const voice = new VoiceClient({
apiKey: process.env.STATESET_API_KEY,
defaultVoice: 'sarah', // Natural female voice
language: 'en-US'
});
// Initialize voice agent
async function setupVoiceAgent() {
const agent = await client.agents.create({
name: 'Voice Assistant',
type: 'voice',
greeting: 'Hello! How can I help you today?',
voice_config: {
provider: 'stateset', // or 'elevenlabs', 'azure', 'google'
voice_id: 'sarah',
speed: 1.0,
pitch: 1.0,
emotion: 'friendly'
}
});
return agent;
}
async function textToSpeech(text) {
try {
const audio = await voice.synthesize({
text: text,
voice: 'sarah',
format: 'mp3', // or 'wav', 'ogg'
options: {
speed: 1.0,
pitch: 1.0,
emphasis: 'moderate'
}
});
// Play the audio in the browser (assumes synthesize() returns the audio as a Blob)
const audioUrl = URL.createObjectURL(audio);
const audioElement = new Audio(audioUrl);
await audioElement.play();
return audio;
} catch (error) {
console.error('TTS Error:', error);
throw error;
}
}
async function advancedSynthesis(message, customerEmotion) {
// Adjust voice based on customer emotion
const voiceParams = getVoiceParameters(customerEmotion);
const ssml = `
<speak>
<prosody rate="${voiceParams.rate}" pitch="${voiceParams.pitch}">
${voiceParams.emphasis ? `<emphasis level="${voiceParams.emphasis}">` : ''}
${message}
${voiceParams.emphasis ? '</emphasis>' : ''}
</prosody>
<break time="500ms"/>
<prosody rate="0.9">
Is there anything else I can help you with?
</prosody>
</speak>
`;
const audio = await voice.synthesize({
ssml: ssml,
voice: voiceParams.voice,
format: 'mp3'
});
return audio;
}
function getVoiceParameters(emotion) {
const parameters = {
happy: { rate: '1.1', pitch: '+5%', voice: 'sarah', emphasis: 'moderate' },
sad: { rate: '0.9', pitch: '-5%', voice: 'sarah-empathetic' },
angry: { rate: '0.95', pitch: '-2%', voice: 'sarah-calm', emphasis: 'reduced' },
neutral: { rate: '1.0', pitch: '0%', voice: 'sarah' }
};
return parameters[emotion] || parameters.neutral;
}
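As a rough usage sketch, the emotion-aware synthesis above can be driven by whatever sentiment detection you already run on the customer's last message; `detectEmotion` below is a hypothetical stand-in for that step.
// Hypothetical sketch: detectEmotion() stands in for your own emotion/sentiment detection
async function respondWithEmotionAwareVoice(replyText, lastCustomerMessage) {
  const emotion = await detectEmotion(lastCustomerMessage); // 'happy' | 'sad' | 'angry' | 'neutral'
  const audio = await advancedSynthesis(replyText, emotion);
  // Play the result in the browser, as in textToSpeech() above
  const audioElement = new Audio(URL.createObjectURL(audio));
  await audioElement.play();
}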
class VoiceProviderManager {
constructor() {
this.providers = {
stateset: new StateSetVoice(),
elevenlabs: new ElevenLabsVoice(process.env.ELEVENLABS_API_KEY),
azure: new AzureVoice(process.env.AZURE_SPEECH_KEY),
google: new GoogleVoice(process.env.GOOGLE_CLOUD_KEY)
};
}
async synthesize(text, options = {}) {
const providerName = options.provider || 'stateset';
const provider = this.providers[providerName];
try {
const audio = await provider.synthesize(text, options);
// Log metrics
await this.logUsage({
provider: providerName,
characters: text.length,
voice: options.voice,
duration: audio.duration
});
return audio;
} catch (error) {
// Fall back to the StateSet provider when a third-party provider fails
console.error(`${providerName} failed:`, error);
if (providerName !== 'stateset') {
return this.synthesize(text, { ...options, provider: 'stateset' });
}
throw error;
}
}
}
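A minimal usage sketch, assuming the provider classes referenced above are available and `logUsage` is implemented elsewhere: a request routed to a third-party provider transparently falls back to the StateSet voice if it fails.
const voiceManager = new VoiceProviderManager();

// Prefer ElevenLabs; on failure the manager retries once with the StateSet provider
const audio = await voiceManager.synthesize('Your order has shipped!', {
  provider: 'elevenlabs',
  voice: 'sarah_professional'
});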
// ElevenLabs implementation
class ElevenLabsVoice {
constructor(apiKey) {
this.apiKey = apiKey;
this.baseUrl = 'https://api.elevenlabs.io/v1';
}
async synthesize(text, options) {
const response = await fetch(
`${this.baseUrl}/text-to-speech/${options.voice_id || 'sarah'}`,
{
method: 'POST',
headers: {
'xi-api-key': this.apiKey,
'Content-Type': 'application/json'
},
body: JSON.stringify({
text: text,
model_id: 'eleven_multilingual_v2',
voice_settings: {
stability: options.stability || 0.75,
similarity_boost: options.similarity || 0.75,
style: options.style || 0.5,
use_speaker_boost: true
}
})
}
);
if (!response.ok) {
throw new Error(`ElevenLabs error: ${response.status}`);
}
const audioBuffer = await response.arrayBuffer();
return new Blob([audioBuffer], { type: 'audio/mpeg' });
}
}
async function speechToText(audioBlob) {
try {
const transcript = await voice.transcribe({
audio: audioBlob,
language: 'en-US',
options: {
punctuation: true,
profanity_filter: false,
speaker_diarization: false
}
});
return {
text: transcript.text,
confidence: transcript.confidence,
words: transcript.words, // Word-level timestamps
duration: transcript.duration
};
} catch (error) {
console.error('STT Error:', error);
throw error;
}
}
class StreamingRecognition {
constructor(onTranscript, onFinalTranscript) {
this.onTranscript = onTranscript;
this.onFinalTranscript = onFinalTranscript;
this.stream = null;
this.ws = null;
this.audioContext = null;
}
async start() {
// Request microphone access and keep a reference so stop() can release it
this.stream = await navigator.mediaDevices.getUserMedia({ audio: true });
// Create WebSocket connection for streaming recognition
this.ws = new WebSocket('wss://api.stateset.com/v1/voice/stream');
this.ws.onopen = () => {
console.log('Streaming recognition started');
// Set up audio processing: forward 16-bit PCM frames over the socket
// (ScriptProcessorNode is deprecated in favor of AudioWorklet but still widely supported)
this.audioContext = new AudioContext();
const source = this.audioContext.createMediaStreamSource(this.stream);
const processor = this.audioContext.createScriptProcessor(4096, 1, 1);
processor.onaudioprocess = (e) => {
const audioData = e.inputBuffer.getChannelData(0);
const int16Array = this.float32ToInt16(audioData);
if (this.ws.readyState === WebSocket.OPEN) {
this.ws.send(int16Array.buffer);
}
};
source.connect(processor);
processor.connect(this.audioContext.destination);
};
this.ws.onmessage = (event) => {
const data = JSON.parse(event.data);
if (data.type === 'transcript') {
this.onTranscript(data.transcript);
} else if (data.type === 'final') {
this.onFinalTranscript(data.transcript);
}
};
}
stop() {
// Close the socket, release the microphone, and tear down the audio graph
if (this.ws) {
this.ws.close();
}
if (this.stream) {
this.stream.getTracks().forEach(track => track.stop());
}
if (this.audioContext) {
this.audioContext.close();
}
}
float32ToInt16(buffer) {
// Clamp each sample to [-1, 1] and scale to signed 16-bit PCM
const l = buffer.length;
const buf = new Int16Array(l);
for (let i = 0; i < l; i++) {
const s = Math.max(-1, Math.min(1, buffer[i]));
buf[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
}
return buf;
}
}
// Usage
const recognition = new StreamingRecognition(
(interim) => console.log('Interim:', interim),
(final) => console.log('Final:', final)
);
await recognition.start();
async function transcribeConversation(audioFile) {
const transcript = await voice.transcribe({
audio: audioFile,
options: {
speaker_diarization: true,
max_speakers: 2,
punctuation: true,
word_timestamps: true,
language_detection: true
}
});
// Format conversation
const conversation = transcript.utterances.map(utterance => ({
speaker: utterance.speaker,
text: utterance.text,
start: utterance.start_time,
end: utterance.end_time,
confidence: utterance.confidence
}));
return {
conversation,
speakers: transcript.speaker_labels,
language: transcript.detected_language,
summary: await generateSummary(conversation)
};
}
async function generateSummary(conversation) {
const agent = await client.agents.get('summary_agent');
const summary = await agent.process({
task: 'summarize_conversation',
conversation: conversation
});
return summary;
}
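A small browser-side usage sketch, assuming a file input with id `call-recording` that holds a recorded call:
// Transcribe a recorded call selected from a file input (element id is an assumption)
const fileInput = document.querySelector('#call-recording');
fileInput.addEventListener('change', async () => {
  const result = await transcribeConversation(fileInput.files[0]);
  console.log(`Detected language: ${result.language}`);
  result.conversation.forEach(u => {
    console.log(`[${u.start}s] Speaker ${u.speaker}: ${u.text}`);
  });
  console.log('Summary:', result.summary);
});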
class VoiceConversation {
constructor(agentId) {
this.agentId = agentId;
this.pc = null;
this.localStream = null;
}
async start() {
// Get user's microphone
this.localStream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true
}
});
// Create peer connection
this.pc = new RTCPeerConnection({
iceServers: [{ urls: 'stun:stun.stateset.com:3478' }]
});
// Add local stream
this.localStream.getTracks().forEach(track => {
this.pc.addTrack(track, this.localStream);
});
// Handle remote stream (agent's voice)
this.pc.ontrack = (event) => {
const audio = new Audio();
audio.srcObject = event.streams[0];
audio.play();
};
// Create offer
const offer = await this.pc.createOffer();
await this.pc.setLocalDescription(offer);
// Send offer to server
const response = await fetch('/api/voice/connect', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
agent_id: this.agentId,
offer: offer.sdp
})
});
const { answer } = await response.json();
await this.pc.setRemoteDescription(new RTCSessionDescription({
type: 'answer',
sdp: answer
}));
}
async end() {
if (this.localStream) {
this.localStream.getTracks().forEach(track => track.stop());
}
if (this.pc) {
this.pc.close();
}
}
}
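Wiring the class into a page might look like the sketch below; the button ids and agent id are placeholders for your own.
// Start and end a real-time voice session from the UI (ids and agent id are placeholders)
const conversation = new VoiceConversation('agent_voice_support');

document.querySelector('#start-call').addEventListener('click', () => conversation.start());
document.querySelector('#end-call').addEventListener('click', () => conversation.end());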
class VoiceActivityDetector {
constructor(onSpeechStart, onSpeechEnd) {
this.onSpeechStart = onSpeechStart;
this.onSpeechEnd = onSpeechEnd;
this.isListening = false;
this.audioContext = null;
this.analyser = null;
}
async init(stream) {
this.audioContext = new AudioContext();
const source = this.audioContext.createMediaStreamSource(stream);
this.analyser = this.audioContext.createAnalyser();
this.analyser.fftSize = 2048;
this.analyser.smoothingTimeConstant = 0.8;
source.connect(this.analyser);
const bufferLength = this.analyser.frequencyBinCount;
const dataArray = new Uint8Array(bufferLength);
let speechStartTime = null;
let silenceStartTime = null;
let isSpeaking = false;
const speechThreshold = 30;
const silenceThreshold = 20;
const speechDebounce = 200; // ms of sustained audio before speech is considered started
const silenceDebounce = 1500; // ms of sustained silence before speech is considered ended
const checkAudioLevel = () => {
if (!this.isListening) return;
this.analyser.getByteFrequencyData(dataArray);
const average = dataArray.reduce((a, b) => a + b) / bufferLength;
if (average > speechThreshold) {
silenceStartTime = null;
if (!speechStartTime) {
speechStartTime = Date.now();
} else if (!isSpeaking && Date.now() - speechStartTime > speechDebounce) {
// Fire onSpeechStart once per speech segment
isSpeaking = true;
this.onSpeechStart();
}
} else if (average < silenceThreshold) {
speechStartTime = null;
if (!silenceStartTime) {
silenceStartTime = Date.now();
} else if (isSpeaking && Date.now() - silenceStartTime > silenceDebounce) {
// Fire onSpeechEnd once per speech segment
isSpeaking = false;
this.onSpeechEnd();
}
}
requestAnimationFrame(checkAudioLevel);
};
this.isListening = true;
checkAudioLevel();
}
stop() {
this.isListening = false;
if (this.audioContext) {
this.audioContext.close();
}
}
}
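A quick sketch of using the detector on a live microphone stream; the handlers here only log, but in practice they would start and stop streaming recognition.
// Drive speech start/end events from the microphone (handlers are placeholders)
const micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
const vad = new VoiceActivityDetector(
  () => console.log('Speech started - begin streaming audio to recognition'),
  () => console.log('Speech ended - finalize the transcript')
);
await vad.init(micStream);
// Later, when the session ends:
// vad.stop();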
async function createVoiceAgent() {
const agent = await client.agents.create({
name: 'Voice Customer Service',
type: 'voice',
description: 'Handles phone and voice chat support',
voice_config: {
// Voice selection
primary_voice: {
provider: 'elevenlabs',
voice_id: 'sarah_professional',
language: 'en-US'
},
fallback_voice: {
provider: 'stateset',
voice_id: 'sarah',
language: 'en-US'
},
// Speech parameters
speech: {
speed: 1.0,
pitch: 1.0,
volume: 0.9,
emotion_adaptation: true
},
// Recognition parameters
recognition: {
language: 'en-US',
alternatives: 3,
profanity_filter: true,
word_confidence: true,
automatic_punctuation: true
},
// Conversation settings
conversation: {
interrupt_sensitivity: 'medium',
end_of_speech_timeout: 2000,
background_noise_suppression: true,
echo_cancellation: true
}
},
// Voice-specific behaviors
behaviors: {
greeting: {
text: "Hello! Thank you for calling. How can I help you today?",
emotion: "friendly",
pause_after: 500
},
listening_indicators: [
"Mhm",
"I see",
"Got it",
"Okay"
],
clarification_phrases: [
"I'm sorry, could you repeat that?",
"I didn't quite catch that. Could you say it again?",
"Let me make sure I understood correctly..."
],
hold_music: {
enabled: true,
music_url: "https://cdn.stateset.com/hold-music-calm.mp3",
check_in_interval: 30000,
check_in_message: "Thank you for waiting. I'm still working on that for you."
}
}
});
return agent;
}
const voicePersonalities = {
professional: {
voice: 'sarah_professional',
speech_rate: 0.95,
pitch: '0%',
style: {
formality: 'high',
enthusiasm: 'moderate',
empathy: 'high'
},
vocabulary: {
greeting: "Good [morning/afternoon/evening]. Thank you for contacting us.",
acknowledgment: "I understand your concern.",
closing: "Is there anything else I can assist you with today?"
}
},
friendly: {
voice: 'alex_casual',
speech_rate: 1.05,
pitch: '+2%',
style: {
formality: 'low',
enthusiasm: 'high',
empathy: 'high'
},
vocabulary: {
greeting: "Hey there! How's it going?",
acknowledgment: "Oh, I totally get that!",
closing: "Anything else I can help you with?"
}
},
empathetic: {
voice: 'sarah_empathetic',
speech_rate: 0.9,
pitch: '-2%',
style: {
formality: 'moderate',
enthusiasm: 'low',
empathy: 'very_high'
},
vocabulary: {
greeting: "Hello, I'm here to help you today.",
acknowledgment: "I can understand how frustrating that must be.",
closing: "Please don't hesitate to reach out if you need anything else."
}
}
};
// Apply personality based on context
async function applyVoicePersonality(agent, context) {
const personality = determinePersonality(context);
await agent.updateVoiceConfig({
voice: personality.voice,
speech: {
rate: personality.speech_rate,
pitch: personality.pitch
},
style: personality.style
});
}
function determinePersonality(context) {
if (context.customer_sentiment === 'angry' || context.issue_severity === 'high') {
return voicePersonalities.empathetic;
} else if (context.customer_type === 'business' || context.formal_request) {
return voicePersonalities.professional;
} else {
return voicePersonalities.friendly;
}
}
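As a sketch, the personality can be chosen and applied when a conversation starts; `getConversationContext` is a hypothetical helper for however you assemble sentiment, customer type, and issue severity.
// Hypothetical sketch: apply a voice personality at the start of a conversation
async function startPersonalizedConversation(agent, conversationId) {
  const context = await getConversationContext(conversationId); // hypothetical helper
  await applyVoicePersonality(agent, context);
}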
import twilio from 'twilio';
class PhoneVoiceAgent {
constructor(config) {
this.twilioClient = twilio(
config.accountSid,
config.authToken
);
this.statesetAgent = config.agent;
}
async handleIncomingCall(req, res) {
const twiml = new twilio.twiml.VoiceResponse();
// Initial greeting
const gather = twiml.gather({
input: 'speech',
timeout: 3,
speechTimeout: 'auto',
action: '/voice/process',
method: 'POST'
});
gather.say({
voice: 'Polly.Joanna'
}, "Hello! I'm your AI assistant. How can I help you today?");
res.type('text/xml');
res.send(twiml.toString());
}
async processVoiceInput(req, res) {
const speechResult = req.body.SpeechResult;
const callSid = req.body.CallSid;
try {
// Process with StateSet agent
const response = await this.statesetAgent.process({
input: speechResult,
context: {
channel: 'phone',
call_sid: callSid,
caller: req.body.From
}
});
// Generate voice response
const twiml = new twilio.twiml.VoiceResponse();
if (response.action === 'transfer') {
twiml.dial(response.transfer_to);
} else {
const gather = twiml.gather({
input: 'speech',
timeout: 3,
action: '/voice/process',
method: 'POST'
});
gather.say({
voice: 'Polly.Joanna'
}, response.message);
}
res.type('text/xml');
res.send(twiml.toString());
} catch (error) {
console.error('Voice processing error:', error);
const twiml = new twilio.twiml.VoiceResponse();
twiml.say("I'm sorry, I'm having trouble understanding. Let me transfer you to a human agent.");
twiml.dial(process.env.FALLBACK_PHONE_NUMBER);
res.type('text/xml');
res.send(twiml.toString());
}
}
}
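One way to expose these handlers as Twilio webhooks is sketched below with Express. The `/voice/process` path matches the `action` URL used in the TwiML above; the `/voice/incoming` path, port, and agent id are assumptions to adapt to your setup.
import express from 'express';

const app = express();
// Twilio posts webhook data as application/x-www-form-urlencoded
app.use(express.urlencoded({ extended: false }));

const phoneAgent = new PhoneVoiceAgent({
  accountSid: process.env.TWILIO_ACCOUNT_SID,
  authToken: process.env.TWILIO_AUTH_TOKEN,
  agent: await client.agents.get('voice_customer_service') // assumed agent id
});

// Point your Twilio phone number's voice webhook at /voice/incoming
app.post('/voice/incoming', (req, res) => phoneAgent.handleIncomingCall(req, res));
app.post('/voice/process', (req, res) => phoneAgent.processVoiceInput(req, res));

app.listen(3000, () => console.log('Voice webhook server listening on :3000'));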
class VoiceAnalytics {
async analyzeConversation(conversationId) {
const conversation = await client.conversations.get(conversationId);
const analytics = {
// Basic metrics
duration: conversation.duration,
speaker_ratio: this.calculateSpeakerRatio(conversation),
// Sentiment analysis
sentiment_timeline: await this.analyzeSentimentProgression(conversation),
emotion_detection: await this.detectEmotions(conversation),
// Conversation quality
interruptions: this.countInterruptions(conversation),
silence_periods: this.analyzeSilence(conversation),
speech_pace: this.analyzeSpeechPace(conversation),
// Content analysis
keywords: await this.extractKeywords(conversation),
topics: await this.identifyTopics(conversation),
action_items: await this.extractActionItems(conversation),
// Performance metrics
response_times: this.calculateResponseTimes(conversation),
resolution_achieved: conversation.metadata.resolved,
escalation_needed: conversation.metadata.escalated
};
return analytics;
}
calculateSpeakerRatio(conversation) {
const speakerDurations = {};
conversation.utterances.forEach(utterance => {
const speaker = utterance.speaker;
const duration = utterance.end_time - utterance.start_time;
speakerDurations[speaker] = (speakerDurations[speaker] || 0) + duration;
});
const total = Object.values(speakerDurations).reduce((a, b) => a + b, 0);
return Object.entries(speakerDurations).reduce((acc, [speaker, duration]) => {
acc[speaker] = (duration / total * 100).toFixed(1) + '%';
return acc;
}, {});
}
async analyzeSentimentProgression(conversation) {
const timeline = [];
for (const utterance of conversation.utterances) {
const sentiment = await this.analyzeSentiment(utterance.text);
timeline.push({
time: utterance.start_time,
speaker: utterance.speaker,
sentiment: sentiment.score,
magnitude: sentiment.magnitude
});
}
return timeline;
}
}
// Good: Natural interruption handling
const naturalConversation = {
allow_interruptions: true,
interruption_threshold: 0.7, // Confidence level
handle_interruption: async (context) => {
// Gracefully stop speaking
await voice.stopSpeaking();
// Acknowledge interruption
await voice.speak("Oh, sorry, go ahead!");
// Listen for their input
return await voice.listen();
}
};
// Bad: Rigid turn-taking
const rigidConversation = {
allow_interruptions: false,
force_complete_utterances: true
};
class VoiceErrorHandler {
async handle(error, context) {
switch (error.type) {
case 'recognition_failed':
return this.handleRecognitionError(context);
case 'synthesis_failed':
return this.handleSynthesisError(context);
case 'network_error':
return this.handleNetworkError(context);
default:
return this.handleGenericError(context);
}
}
async handleRecognitionError(context) {
const responses = [
"I'm sorry, I didn't catch that. Could you repeat it?",
"There seems to be some background noise. Could you speak up a bit?",
"I'm having trouble hearing you clearly. Can you try again?"
];
// Choose a phrasing appropriate to the context (selectResponse is assumed to be defined elsewhere, e.g. rotating by retry count)
const response = this.selectResponse(responses, context);
return {
action: 'retry',
message: response,
adjustments: {
noise_suppression: 'aggressive',
gain_control: 1.2
}
};
}
}
class VoiceOptimizer {
constructor() {
this.cache = new Map();
// Warm the cache in the background (intentionally not awaited)
this.preloadCommonPhrases();
}
async preloadCommonPhrases() {
const commonPhrases = [
"How can I help you today?",
"Let me look that up for you.",
"One moment please.",
"Is there anything else?"
];
for (const phrase of commonPhrases) {
const audio = await voice.synthesize({ text: phrase });
this.cache.set(phrase, audio);
}
}
async speak(text) {
// Check cache first
if (this.cache.has(text)) {
return this.cache.get(text);
}
// Generate and cache
const audio = await voice.synthesize({ text });
// Cache if under size limit
if (audio.size < 1000000) { // 1MB
this.cache.set(text, audio);
}
return audio;
}
}
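Usage is then a drop-in replacement for calling `voice.synthesize` directly: common phrases come from the preloaded cache, and everything else is synthesized on demand and cached if small enough.
const optimizer = new VoiceOptimizer();

// Served from the preloaded cache
const greeting = await optimizer.speak('How can I help you today?');

// Synthesized on demand, then cached for next time (if under the 1 MB limit)
const custom = await optimizer.speak('Your replacement order ships tomorrow.');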