Voice Chat

This guide shows how to wire a hands‑free “voice chat” loop: MicVAD auto‑endpoints speech, your STT (e.g., Whisper) transcribes each segment, and optional TTS plays spoken replies.

What you’ll build

  • A VoiceChatController that:
      • uses MicVAD to auto‑endpoint speech
      • calls your STT on each final segment
      • emits simple status/transcript events to drive UI
      • briefly pauses the mic while TTS plays to avoid barge‑in
  • A small React panel UI that listens to events and lets the user Pause/Resume/End.

Install

  • npm i @realtimex/vad-web (or @realtimex/vad-react if you want hook ergonomics).
  • Ensure assets are served (see “Assets” below).

Controller (framework‑agnostic)

// voice-chat-controller.ts
import { MicVAD } from "@realtimex/vad-web"

type Status = "loading" | "listening" | "processing" | "idle"

// Minimal event emitter using EventTarget
export class Emitter<TEvents extends Record<string, any>> {
  private target = new EventTarget()
  on<K extends keyof TEvents & string>(type: K, handler: (e: TEvents[K]) => void) {
    const wrapped = (evt: Event) => handler((evt as CustomEvent).detail)
    this.target.addEventListener(type, wrapped as EventListener)
    return () => this.target.removeEventListener(type, wrapped as EventListener)
  }
  emit<K extends keyof TEvents & string>(type: K, detail: TEvents[K]) {
    this.target.dispatchEvent(new CustomEvent(type, { detail }))
  }
}

export type VoiceChatEvents = {
  VOICECHAT_STATUS: { status: Status }
  VOICECHAT_TRANSCRIPT: { text: string; isFinal: boolean }
  VOICECHAT_SEND: { text: string }
}

export type STT = {
  // Called with a 16kHz, mono Float32Array (−1..1)
  transcribe: (audio: Float32Array) => Promise<{ text: string }>
}

type Options = {
  // Where to load assets from (see Assets section)
  baseAssetPath?: string
  onnxWASMBasePath?: string
  // TTS coordination hooks (optional)
  onTTSStart?: () => void
  onTTSEnd?: () => void
}

export class VoiceChatController extends Emitter<VoiceChatEvents> {
  private vad?: MicVAD
  private running = false
  private paused = false
  private stt: STT

  constructor(stt: STT, private opts: Options = {}) {
    super()
    this.stt = stt
  }

  async start() {
    if (this.running) return
    this.running = true
    this.paused = false
    this.emit("VOICECHAT_STATUS", { status: "loading" })

    // Configure VAD (Silero V6) with voice‑chat‑friendly thresholds
    this.vad = await MicVAD.new({
      baseAssetPath: this.opts.baseAssetPath ?? "https://cdn.jsdelivr.net/npm/@realtimex/vad-web@latest/dist/",
      onnxWASMBasePath: this.opts.onnxWASMBasePath ?? "https://cdn.jsdelivr.net/npm/@realtimex/vad-web@latest/dist/onnxruntime-web/",
      positiveSpeechThreshold: 0.5,
      negativeSpeechThreshold: 0.35,
      minSpeechMs: 250,
      redemptionMs: 800,
      preSpeechPadMs: 200,
      onSpeechStart: () => {
        this.emit("VOICECHAT_STATUS", { status: "listening" })
      },
      onSpeechRealStart: () => {
        this.emit("VOICECHAT_TRANSCRIPT", { text: "", isFinal: false })
      },
      onSpeechEnd: async (audio) => {
        if (!this.running) return
        // Pause the mic while transcribing so overlapping speech isn't captured mid‑processing;
        // the finally block below restarts it.
        this.vad?.pause(() => {})
        this.emit("VOICECHAT_STATUS", { status: "processing" })
        try {
          const { text } = await this.stt.transcribe(audio)
          if (text && text.trim().length > 0) {
            this.emit("VOICECHAT_TRANSCRIPT", { text, isFinal: true })
            this.emit("VOICECHAT_SEND", { text })
          }
        } catch (e) {
          console.error("STT error", e)
        } finally {
          if (this.running && !this.paused) {
            await this.vad?.start()
            this.emit("VOICECHAT_STATUS", { status: "listening" })
          } else if (this.running) {
            this.emit("VOICECHAT_STATUS", { status: "idle" })
          }
        }
      },
    })

    await this.vad.start()
    this.emit("VOICECHAT_STATUS", { status: "listening" })
  }

  async pause() {
    if (!this.running || this.paused) return
    this.paused = true
    this.vad?.pause(() => {})
    this.emit("VOICECHAT_STATUS", { status: "idle" })
  }

  async resume() {
    if (!this.running || !this.paused) return
    this.paused = false
    await this.vad?.start()
    this.emit("VOICECHAT_STATUS", { status: "listening" })
  }

  async stop() {
    if (!this.running) return
    this.running = false
    this.paused = false
    this.vad?.destroy()
    this.vad = undefined
    this.emit("VOICECHAT_STATUS", { status: "idle" })
  }

  // Call these around TTS playback to avoid barge‑in
  async handleTTSStart() {
    await this.pause()
    this.opts.onTTSStart?.()
  }
  async handleTTSEnd() {
    if (this.running) await this.resume()
    this.opts.onTTSEnd?.()
  }
}
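
Before moving to React, here is a minimal usage sketch outside any framework. It only exercises the controller API defined above; myStt is a stand‑in for whatever implements the STT type (the fakeSTT stub later in this guide works for a smoke test), and the file name is illustrative.

// usage-sketch.ts (illustrative; myStt stands in for any STT implementation)
import { VoiceChatController, STT } from "./voice-chat-controller"

const myStt: STT = { async transcribe() { return { text: "stub" } } }

async function demo() {
  const controller = new VoiceChatController(myStt)

  const offStatus = controller.on("VOICECHAT_STATUS", ({ status }) => console.log("status:", status))
  const offSend = controller.on("VOICECHAT_SEND", ({ text }) => console.log("send:", text))

  await controller.start() // asks for mic permission, loads the VAD, begins listening
  // ...later, e.g. when the user closes the voice UI:
  await controller.stop()  // destroys the underlying MicVAD and emits an idle status
  offStatus()
  offSend()
}

void demo()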

React panel UI (example)

// VoiceChatPanel.tsx
import { useEffect, useMemo, useState } from "react"
import { createPortal } from "react-dom"
import { VoiceChatController, STT } from "./voice-chat-controller"

export function VoiceChatPanel({ open, onClose, stt }: { open: boolean; onClose: () => void; stt: STT }) {
  const controller = useMemo(() => new VoiceChatController(stt), [stt])
  const [status, setStatus] = useState<"loading" | "listening" | "processing" | "idle">("idle")
  const [lastText, setLastText] = useState("")

  useEffect(() => {
    const off1 = controller.on("VOICECHAT_STATUS", ({ status }) => setStatus(status))
    const off2 = controller.on("VOICECHAT_TRANSCRIPT", ({ text, isFinal }) => {
      setLastText(text)
      // You could also stream partials to an input UI here
    })
    if (open) controller.start()
    return () => {
      off1()
      off2()
      controller.stop()
    }
  }, [open, controller])

  if (!open) return null
  return createPortal(
    <div style={{ position: "fixed", right: 16, bottom: 16, zIndex: 1000, background: "#111", color: "#fff", padding: 16, borderRadius: 12, width: 360 }}>
      <div style={{ display: "flex", alignItems: "center", gap: 12 }}>
        <div aria-hidden style={{ width: 32, height: 32, borderRadius: 9999, background: status === "listening" ? "#ff4d4f" : "#666", boxShadow: status === "listening" ? "0 0 0 8px rgba(255,77,79,0.2)" : undefined }} />
        <div>
          <div style={{ fontWeight: 600 }}>Voice chat</div>
          <div style={{ fontSize: 12, opacity: 0.8 }}>{status}</div>
        </div>
      </div>
      <div style={{ marginTop: 12, minHeight: 40, fontSize: 14, lineHeight: 1.4, whiteSpace: "pre-wrap" }}>{lastText || (status === "listening" ? "Listening…" : "")}</div>
      <div style={{ display: "flex", gap: 8, marginTop: 12 }}>
        {status !== "listening" ? (
          <button onClick={() => controller.resume()}>Resume</button>
        ) : (
          <button onClick={() => controller.pause()}>Pause</button>
        )}
        <button onClick={() => { controller.stop(); onClose() }}>End</button>
      </div>
    </div>,
    document.body
  )
}
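
Mounting the panel is just an open flag in the parent component. A sketch; App, the button, and the ./stt import path are placeholders for your own app shell and STT module.

// App.tsx (illustrative; swap fakeSTT for your real Whisper client)
import { useState } from "react"
import { VoiceChatPanel } from "./VoiceChatPanel"
import { fakeSTT } from "./stt" // hypothetical module exporting the stub shown below

export function App() {
  const [voiceOpen, setVoiceOpen] = useState(false)
  return (
    <>
      <button onClick={() => setVoiceOpen(true)}>Start voice chat</button>
      <VoiceChatPanel open={voiceOpen} onClose={() => setVoiceOpen(false)} stt={fakeSTT} />
    </>
  )
}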

Wire to chat input

  • Listen for VOICECHAT_SEND and submit the text to your chat handler.
  • Optionally reflect partials into your input for consistency.
controller.on("VOICECHAT_SEND", ({ text }) => {
  // update your prompt input UI if desired
  // sendCommand(text, /* autoSubmit */ true)
})
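
A slightly fuller sketch of the same idea; setInput and submitMessage are placeholders for your own prompt‑input state setter and chat submit handler.

// Hypothetical wiring; setInput and submitMessage stand in for your own UI state and chat handler.
declare function setInput(text: string): void
declare function submitMessage(text: string): void

controller.on("VOICECHAT_TRANSCRIPT", ({ text, isFinal }) => {
  // Mirror partials into the prompt box. This controller only emits an empty partial
  // at speech start; a streaming STT could emit more.
  if (!isFinal) setInput(text)
})
controller.on("VOICECHAT_SEND", ({ text }) => {
  setInput("")          // clear the prompt box
  submitMessage(text)   // hand the final transcript to your chat backend
})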

STT stub (replace with your Whisper client)

// Example STT that returns a fake transcript. Replace with Whisper/OpenAI/etc.
export const fakeSTT: STT = {
  async transcribe(_audio: Float32Array) {
    await new Promise((r) => setTimeout(r, 400))
    return { text: "hello there" }
  },
}
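
For a more realistic adapter, here is a sketch that posts each segment to OpenAI's hosted Whisper endpoint (/v1/audio/transcriptions). The WAV wrapper is inlined so the 16 kHz Float32Array from the VAD can be uploaded as a file. The endpoint and model name assume OpenAI's API; in production, proxy the request through your own server rather than shipping an API key to the browser.

// whisper-stt.ts (a sketch, assuming OpenAI's /v1/audio/transcriptions endpoint)
import type { STT } from "./voice-chat-controller"

// Wrap 16 kHz mono float samples in a minimal 16-bit PCM WAV container.
function encodeWAV(samples: Float32Array, sampleRate = 16000): Blob {
  const buffer = new ArrayBuffer(44 + samples.length * 2)
  const view = new DataView(buffer)
  const writeString = (offset: number, s: string) => {
    for (let i = 0; i < s.length; i++) view.setUint8(offset + i, s.charCodeAt(i))
  }
  writeString(0, "RIFF")
  view.setUint32(4, 36 + samples.length * 2, true)
  writeString(8, "WAVE")
  writeString(12, "fmt ")
  view.setUint32(16, 16, true)             // fmt chunk size
  view.setUint16(20, 1, true)              // PCM
  view.setUint16(22, 1, true)              // mono
  view.setUint32(24, sampleRate, true)
  view.setUint32(28, sampleRate * 2, true) // byte rate
  view.setUint16(32, 2, true)              // block align
  view.setUint16(34, 16, true)             // bits per sample
  writeString(36, "data")
  view.setUint32(40, samples.length * 2, true)
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]))
    view.setInt16(44 + i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true)
  }
  return new Blob([view], { type: "audio/wav" })
}

export function makeWhisperSTT(apiKey: string): STT {
  return {
    async transcribe(audio: Float32Array) {
      const form = new FormData()
      form.append("file", encodeWAV(audio), "segment.wav")
      form.append("model", "whisper-1")
      const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
        method: "POST",
        headers: { Authorization: `Bearer ${apiKey}` },
        body: form,
      })
      if (!res.ok) throw new Error(`STT request failed: ${res.status}`)
      const json = await res.json()
      return { text: json.text as string }
    },
  }
}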

TTS coordination

  • If you have TTS start/end events, call controller.handleTTSStart() and controller.handleTTSEnd() accordingly.
  • This pauses the mic while TTS plays, then resumes after playback.
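
For example, if replies are played through a plain HTMLAudioElement, the wiring can look like this (controller is the VoiceChatController instance from above; the audio source handling is a placeholder for your own TTS pipeline):

// Hypothetical TTS playback wiring using a plain <audio> element.
const ttsAudio = new Audio()

ttsAudio.addEventListener("play", () => { void controller.handleTTSStart() })  // pause the mic loop
ttsAudio.addEventListener("ended", () => { void controller.handleTTSEnd() })   // resume listening
ttsAudio.addEventListener("pause", () => { void controller.handleTTSEnd() })   // also resume if playback is cut short

// When your TTS returns audio:
// ttsAudio.src = urlFromYourTTS
// void ttsAudio.play()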

Assets

  • When bundling, ensure the following from @realtimex/vad-web/dist/ are available at runtime:
      • vad.worklet.bundle.min.js
      • silero_vad_v6.onnx
      • onnxruntime-web WASM assets under a base path
  • Easiest path during development: point to the CDN in controller options.
new VoiceChatController(stt, {
  baseAssetPath: "https://cdn.jsdelivr.net/npm/@realtimex/vad-web@latest/dist/",
  onnxWASMBasePath: "https://cdn.jsdelivr.net/npm/@realtimex/vad-web@latest/dist/onnxruntime-web/",
})
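
For production, a common approach is to self‑host the same files and point the options at them instead of the CDN. The /vad/ path below is just an assumed location under your static/public directory; the copy step depends on your bundler.

// Assumes vad.worklet.bundle.min.js, silero_vad_v6.onnx, and the onnxruntime-web WASM
// files have been copied to public/vad/ so they are served from /vad/ at runtime.
new VoiceChatController(stt, {
  baseAssetPath: "/vad/",
  onnxWASMBasePath: "/vad/onnxruntime-web/",
})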

Recommended thresholds (starting points)

  • positiveSpeechThreshold: 0.5
  • negativeSpeechThreshold: 0.35
  • minSpeechMs: 250
  • redemptionMs: 800
  • preSpeechPadMs: 200
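
These mirror the values hard‑coded in the controller above. If you expose them, collecting them in one object makes per‑deployment tuning easier; a sketch, with the usual reason to move each knob noted in the comments.

// Starting‑point values from the list above, gathered so they can be tuned in one place
// and spread into the MicVAD.new(...) options inside VoiceChatController.start().
export const VAD_TUNING = {
  positiveSpeechThreshold: 0.5,  // raise in noisy rooms to reject background chatter
  negativeSpeechThreshold: 0.35, // keep below the positive threshold
  minSpeechMs: 250,              // ignore very short blips
  redemptionMs: 800,             // raise (e.g. 1200) if users who pause mid‑sentence get cut off
  preSpeechPadMs: 200,           // audio retained from just before speech was detected
}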

Notes

  • Manual “Speak Prompt” can still use your existing recorder; the above controller is only for the hands‑free mode.
  • If you see build errors about missing assets, confirm your bundler copies the files listed in Assets.
  • If your STT call can throw, keep the try/catch around transcribe (as shown above) so the hands‑free loop stays resilient.