11 changes: 11 additions & 0 deletions .claude/settings.local.json
@@ -0,0 +1,11 @@
{
"permissions": {
"allow": [
"Bash(python:*)",
"Bash(venvScriptsactivate)",
"Bash(venv/Scripts/activate)",
"Bash(venv/Scripts/python.exe:*)",
"Bash(venv/Scripts/pip.exe install:*)"
]
}
}
26 changes: 26 additions & 0 deletions examples/.env.azure
@@ -0,0 +1,26 @@
# Azure Speech Services (STT/TTS)
# Get these from Azure Portal > Cognitive Services > Speech Service
AZURE_SPEECH_KEY=your_azure_speech_key_here
AZURE_SPEECH_REGION=your_region_here  # e.g., eastus, westus2

# Optional: Specify voice for TTS
# See: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts
AZURE_SPEECH_VOICE=en-US-Ava:DragonHDLatestNeural

# Azure OpenAI
# Get these from Azure Portal > Azure OpenAI Service
AZURE_OPENAI_ENDPOINT=https://your-resource-name.openai.azure.com
AZURE_OPENAI_API_KEY=your_azure_openai_key_here
AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o # Your deployment name
AZURE_OPENAI_API_VERSION=2024-10-01-preview # Optional, defaults to this version

# Azure VoiceLive (optional) - uses your Speech resource key
AZURE_VOICELIVE_ENDPOINT=https://<region>.api.cognitive.microsoft.com/
AZURE_VOICELIVE_API_KEY=your_azure_speech_key_here
AZURE_VOICELIVE_MODEL=gpt-4o
AZURE_VOICELIVE_VOICE=en-US-Ava:DragonHDLatestNeural

# For console mode (optional - only needed for local testing with LiveKit)
# Get these from https://cloud.livekit.io or your self-hosted LiveKit server
# LIVEKIT_URL=wss://your-livekit-server.livekit.cloud
# LIVEKIT_API_KEY=your_livekit_api_key
# LIVEKIT_API_SECRET=your_livekit_api_secret
169 changes: 169 additions & 0 deletions examples/AZURE_SETUP.md
@@ -0,0 +1,169 @@
# Azure Voice Agent Setup Guide

This guide will help you set up and run the Azure-powered voice agent.

## Prerequisites

- ✅ Virtual environment created and activated
- ✅ Dependencies installed (`livekit-agents`, `azure` plugin)

## Required Azure Services

You need the following Azure services set up:

### 1. Azure Speech Services (for STT/TTS)

**Setup:**
1. Go to [Azure Portal](https://portal.azure.com)
2. Create a new **Cognitive Services** resource or **Speech Service** resource
3. Once created, go to **Keys and Endpoint**
4. Copy:
- **KEY 1** (this is your `AZURE_SPEECH_KEY`)
- **Region** (e.g., `eastus`, `westus2`)

**Documentation:** https://learn.microsoft.com/en-us/azure/ai-services/speech-service/
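
You can verify the key and region before wiring up the agent by requesting a token from the Speech service's token endpoint. A minimal sketch, assuming the default `<region>.api.cognitive.microsoft.com` endpoint (not a custom domain):

```python
import os
import urllib.request

region = os.environ["AZURE_SPEECH_REGION"]
key = os.environ["AZURE_SPEECH_KEY"]

# POST with an empty body; HTTP 200 plus a token means the key/region are valid
req = urllib.request.Request(
    f"https://{region}.api.cognitive.microsoft.com/sts/v1.0/issueToken",
    data=b"",
    headers={"Ocp-Apim-Subscription-Key": key},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print("Speech key OK:", resp.status)
```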

### 2. Azure OpenAI (for LLM)

**Setup:**
1. Go to [Azure Portal](https://portal.azure.com)
2. Create an **Azure OpenAI** resource
3. Once created, go to **Keys and Endpoint**
4. Copy:
- **Endpoint** URL (e.g., `https://your-resource.openai.azure.com`)
- **KEY 1** (this is your `AZURE_OPENAI_API_KEY`)
5. Go to **Azure OpenAI Studio** > **Deployments**
6. Deploy a model (e.g., `gpt-4o`, `gpt-4o-mini`)
7. Note your **deployment name**

**Documentation:** https://learn.microsoft.com/en-us/azure/ai-services/openai/
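
You can sanity-check the deployment with the `openai` Python package before running the full agent. A sketch, assuming the variable names from `.env.azure`; note that `model` takes your deployment name, not the underlying model family:

```python
import os

from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-01-preview"),
)

resp = client.chat.completions.create(
    model=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],  # deployment name
    messages=[{"role": "user", "content": "Say hello in one word."}],
)
print(resp.choices[0].message.content)
```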

## Environment Setup

1. **Copy the example environment file:**

   ```bash
   cp .env.azure .env
   ```

2. **Edit `.env` and fill in your Azure credentials:**

   ```bash
   # Azure Speech Services
   AZURE_SPEECH_KEY=your_actual_key_here
   AZURE_SPEECH_REGION=eastus

   # Azure OpenAI
   AZURE_OPENAI_ENDPOINT=https://your-resource-name.openai.azure.com
   AZURE_OPENAI_API_KEY=your_actual_key_here
   AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o
   ```

## Running the Agent

### Option 1: Console Mode (Local Testing with Microphone)

Console mode runs the agent entirely on your machine, talking to it through your microphone and speakers. No LiveKit server or credentials are needed.

```bash
./venv/Scripts/python azure_agent.py console
```

### Option 2: Development Mode (with LiveKit)

This mode connects to a LiveKit server. You can use:
- **LiveKit Cloud** (easiest): https://cloud.livekit.io (free tier available)
- **Self-hosted**: https://docs.livekit.io/home/self-hosting/deployment/

Add the LiveKit credentials to `.env`:

```bash
LIVEKIT_URL=wss://your-project.livekit.cloud
LIVEKIT_API_KEY=your_api_key
LIVEKIT_API_SECRET=your_api_secret
```

Then start the agent in development mode:

```bash
./venv/Scripts/python azure_agent.py dev
```

Then connect using:
- LiveKit Agents Playground: https://agents-playground.livekit.io/
- Any LiveKit client SDK

### Option 3: Production Mode

```bash
./venv/Scripts/python azure_agent.py start
```

## Customization

### Change TTS Voice

Set `AZURE_SPEECH_VOICE` in your `.env` (the agent reads it at startup), or edit `azure_agent.py` and modify the TTS initialization:

```python
tts=azure.TTS(voice="en-US-Ava:DragonHDLatestNeural"),  # HD voice for more realistic output
# or
tts=azure.TTS(voice="en-US-JennyNeural"),  # standard (non-HD) neural voice
```

Available voices: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts
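
To see which voices your region actually offers, you can list them with the Azure Speech SDK. A sketch, assuming the `azure-cognitiveservices-speech` package is installed (the agent itself does not require it):

```python
import os

import azure.cognitiveservices.speech as speechsdk

config = speechsdk.SpeechConfig(
    subscription=os.environ["AZURE_SPEECH_KEY"],
    region=os.environ["AZURE_SPEECH_REGION"],
)
# audio_config=None: we only query voice metadata, no audio output needed
synthesizer = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=None)
result = synthesizer.get_voices_async().get()
for voice in result.voices[:10]:
    print(voice.short_name)
```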

### Change Azure OpenAI Model

In your `.env`, update:
```bash
AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o-mini # For faster, cheaper responses
# or
AZURE_OPENAI_DEPLOYMENT_NAME=gpt-4o # For better quality
```

### Add Custom Functions

Add function tools to your agent in `azure_agent.py`:

```python
@function_tool
async def my_custom_function(
    context: RunContext,
    parameter: str,
):
    """Description of what this function does.

    Args:
        parameter: Description of the parameter
    """
    # Your logic here
    return "result"
```
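
Then register the tool when constructing the agent, the same way `azure_agent.py` registers `get_weather`:

```python
agent = Agent(
    instructions="...",
    tools=[get_weather, my_custom_function],
)
```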

## Troubleshooting

### "Missing API keys" error
- Make sure `.env` file exists in the project root
- Check that all required environment variables are set
- Restart your terminal/IDE after updating `.env`
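
A quick check for missing variables (a minimal sketch; the list mirrors the variables used in this guide):

```python
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

REQUIRED = [
    "AZURE_SPEECH_KEY",
    "AZURE_SPEECH_REGION",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_DEPLOYMENT_NAME",
]
missing = [name for name in REQUIRED if not os.getenv(name)]
print("Missing:", ", ".join(missing) if missing else "none")
```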

### "Speech service error"
- Verify your `AZURE_SPEECH_KEY` and `AZURE_SPEECH_REGION` are correct
- Check that your Speech Service is active in Azure Portal
- Ensure you're using the correct region

### "OpenAI deployment not found"
- Verify your deployment name matches exactly (case-sensitive)
- Make sure the model is deployed in Azure OpenAI Studio
- Check that your API key has access to the deployment

### No audio in console mode
- Make sure your microphone is connected and working
- Check system audio permissions
- Try running with `--verbose` flag for more logs
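
Console mode reads audio through the `sounddevice` package, so listing devices is a quick way to confirm Python can see your microphone (assumes `sounddevice` is installed):

```python
import sounddevice as sd

# Your microphone should appear with max_input_channels > 0
print(sd.query_devices())
```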

## Next Steps

- Explore the `examples/voice_agents/` directory for more advanced examples
- Check out the [LiveKit Agents documentation](https://docs.livekit.io/agents/)
- Join the [LiveKit community](https://livekit.io/join-slack)

## Cost Considerations

- **Azure Speech**: Pay per character (TTS) and per hour (STT)
- **Azure OpenAI**: Pay per token
- **LiveKit Cloud**: Free tier available, paid plans for production

Estimate costs: https://azure.microsoft.com/en-us/pricing/calculator/
170 changes: 170 additions & 0 deletions examples/azure_agent.py
@@ -0,0 +1,170 @@
"""
Azure Voice Agent - Using Azure Speech Services and Azure OpenAI
"""

import logging
import os

from dotenv import load_dotenv

from livekit.agents import (
    Agent,
    AgentSession,
    JobContext,
    MetricsCollectedEvent,
    RunContext,
    WorkerOptions,
    cli,
    function_tool,
    metrics,
    tts as tts_module,
)
from livekit.plugins import azure, openai, silero
from livekit.plugins.turn_detector import multilingual

logger = logging.getLogger("azure-agent")
logger.setLevel(logging.INFO)

load_dotenv()


# Logging TTS wrapper that logs each sentence sent for synthesis
class SentenceLoggingTTS(tts_module.TTS):
    def __init__(self, wrapped_tts: tts_module.TTS):
        super().__init__(
            capabilities=wrapped_tts.capabilities,
            sample_rate=wrapped_tts.sample_rate,
            num_channels=wrapped_tts.num_channels,
        )
        self._wrapped = wrapped_tts
        self._sentence_count = 0

    def synthesize(self, text: str, *, conn_options=None) -> tts_module.ChunkedStream:
        self._sentence_count += 1
        logger.info(f"📝 Sentence #{self._sentence_count}: '{text}'")

        if conn_options:
            return self._wrapped.synthesize(text, conn_options=conn_options)
        return self._wrapped.synthesize(text)

    def stream(self, *, conn_options=None):
        # Delegate streaming synthesis to the wrapped TTS so the wrapper
        # still works if the underlying TTS reports streaming support
        if conn_options:
            return self._wrapped.stream(conn_options=conn_options)
        return self._wrapped.stream()

    def update_options(self, **kwargs):
        if hasattr(self._wrapped, "update_options"):
            return self._wrapped.update_options(**kwargs)

    @property
    def model(self) -> str:
        return self._wrapped.model

    @property
    def provider(self) -> str:
        return self._wrapped.provider


@function_tool
async def get_weather(
    context: RunContext,
    location: str,
):
    """Called when the user asks about weather.

    Args:
        location: The city or location to get weather for
    """
    logger.info(f"Getting weather for {location}")
    # In a real scenario, you would call a weather API here
    return f"The weather in {location} is sunny with a temperature of 72°F."


async def entrypoint(ctx: JobContext):
    await ctx.connect()

    # Create the agent with instructions and tools
    agent = Agent(
        instructions="Your name is Azure Assistant. You interact with users via voice. "
        "Keep your responses concise and conversational. "
        "Do not use emojis, asterisks, markdown, or special characters in your responses. "
        "You are helpful, friendly, and professional.",
        tools=[get_weather],
    )

    # Create Azure TTS, wrapped so each synthesized sentence is logged
    azure_tts = SentenceLoggingTTS(
        azure.TTS(voice=os.getenv("AZURE_SPEECH_VOICE", "en-US-JennyNeural"))
    )

    # Create agent session with Azure services
    session = AgentSession(
        # Azure Speech-to-Text
        stt=azure.STT(),
        # Azure OpenAI for LLM
        llm=openai.LLM.with_azure(
            model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "gpt-4o"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-01-preview"),
        ),
        # Azure Text-to-Speech (wrapped for sentence logging)
        tts=azure_tts,
        # Voice Activity Detection
        vad=silero.VAD.load(),
        # Turn detection - multilingual model
        turn_detection=multilingual.MultilingualModel(),
        # Preemptive generation for faster responses
        preemptive_generation=True,
        # Handle false interruptions
        resume_false_interruption=True,
        false_interruption_timeout=1.0,
    )

    # Log metrics
    usage_collector = metrics.UsageCollector()

    @session.on("metrics_collected")
    def _on_metrics_collected(ev: MetricsCollectedEvent):
        metrics.log_metrics(ev.metrics)
        usage_collector.collect(ev.metrics)

    # Log TTS sentence breaking
    @session.on("agent_speech_started")
    def _on_agent_speech_started(ev):
        logger.info("🎤 TTS STARTED")

@session.on("agent_speech_committed")
def _on_agent_speech_committed(ev):
logger.info(f"🎤 TTS COMMITTED - Audio playback started")

    async def log_usage():
        summary = usage_collector.get_summary()
        logger.info(f"Usage: {summary}")

    ctx.add_shutdown_callback(log_usage)

    # Start the agent session
    await session.start(agent=agent, room=ctx.room)

    # Generate initial greeting
    await session.generate_reply(
        instructions="Greet the user warmly and introduce yourself as Azure Assistant."
    )


if __name__ == "__main__":
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))