From d628d977303ae720ed0bc03ba5182a521b1f10dc Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Sat, 31 Jan 2026 09:52:50 +0900 Subject: [PATCH 1/4] fix: #2370 send truncate events independent of response state --- src/agents/realtime/openai_realtime.py | 72 +++++++++---------------- tests/realtime/test_openai_realtime.py | 1 - tests/realtime/test_playback_tracker.py | 2 - 3 files changed, 26 insertions(+), 49 deletions(-) diff --git a/src/agents/realtime/openai_realtime.py b/src/agents/realtime/openai_realtime.py index cc52a394d..a2f197fe3 100644 --- a/src/agents/realtime/openai_realtime.py +++ b/src/agents/realtime/openai_realtime.py @@ -552,29 +552,19 @@ async def _send_interrupt(self, event: RealtimeModelSendInterrupt) -> None: content_index=current_item_content_index, ) ) - if not self._ongoing_response: - logger.debug( - "Skipping truncate because no response is in progress. " - f"Item id: {current_item_id}, " - f"elapsed ms: {elapsed_ms}, " - f"content index: {current_item_content_index}" - ) - else: - max_audio_ms: int | None = None - audio_limits = self._get_audio_limits( - current_item_id, current_item_content_index - ) - if audio_limits is not None: - _, max_audio_ms = audio_limits - truncated_ms = max(int(elapsed_ms), 0) - if max_audio_ms is not None: - truncated_ms = min(truncated_ms, max_audio_ms) - converted = _ConversionHelper.convert_interrupt( - current_item_id, - current_item_content_index, - truncated_ms, - ) - await self._send_raw_message(converted) + max_audio_ms: int | None = None + audio_limits = self._get_audio_limits(current_item_id, current_item_content_index) + if audio_limits is not None: + _, max_audio_ms = audio_limits + truncated_ms = max(int(elapsed_ms), 0) + if max_audio_ms is not None: + truncated_ms = min(truncated_ms, max_audio_ms) + converted = _ConversionHelper.convert_interrupt( + current_item_id, + current_item_content_index, + truncated_ms, + ) + await self._send_raw_message(converted) else: logger.debug( "Didn't interrupt bc elapsed ms is < 0. " @@ -779,30 +769,20 @@ async def _handle_ws_event(self, event: dict[str, Any]): effective_elapsed_ms = float(elapsed_override) if playback_item_id and effective_elapsed_ms is not None: - if not self._ongoing_response: - logger.debug( - "Skipping truncate because no response is in progress. " - f"Item id: {playback_item_id}, " - f"elapsed ms: {effective_elapsed_ms}, " - f"content index: {playback_content_index}" - ) - else: - max_audio_ms: int | None = None - audio_limits = self._get_audio_limits( - playback_item_id, playback_content_index - ) - if audio_limits is not None: - _, max_audio_ms = audio_limits - truncated_ms = max(int(round(effective_elapsed_ms)), 0) - if max_audio_ms is not None: - truncated_ms = min(truncated_ms, max_audio_ms) - await self._send_raw_message( - _ConversionHelper.convert_interrupt( - playback_item_id, - playback_content_index, - truncated_ms, - ) + max_audio_ms: int | None = None + audio_limits = self._get_audio_limits(playback_item_id, playback_content_index) + if audio_limits is not None: + _, max_audio_ms = audio_limits + truncated_ms = max(int(round(effective_elapsed_ms)), 0) + if max_audio_ms is not None: + truncated_ms = min(truncated_ms, max_audio_ms) + await self._send_raw_message( + _ConversionHelper.convert_interrupt( + playback_item_id, + playback_content_index, + truncated_ms, ) + ) # Reset trackers so subsequent playback state queries don't # reference audio that has been interrupted client‑side. diff --git a/tests/realtime/test_openai_realtime.py b/tests/realtime/test_openai_realtime.py index 2327be653..62f8b2c4d 100644 --- a/tests/realtime/test_openai_realtime.py +++ b/tests/realtime/test_openai_realtime.py @@ -548,7 +548,6 @@ async def test_transcription_related_and_timeouts_and_speech_started(self, model # Prepare tracker state to simulate ongoing audio model._audio_state_tracker.set_audio_format("pcm16") model._audio_state_tracker.on_audio_delta("i1", 0, b"a" * 96) - model._ongoing_response = True # Patch sending to avoid websocket dependency monkeypatch.setattr( diff --git a/tests/realtime/test_playback_tracker.py b/tests/realtime/test_playback_tracker.py index 48b83a8a9..16de96287 100644 --- a/tests/realtime/test_playback_tracker.py +++ b/tests/realtime/test_playback_tracker.py @@ -28,7 +28,6 @@ async def test_interrupt_timing_with_custom_playback_tracker(self, model): # Set up model with custom tracker directly model._playback_tracker = custom_tracker - model._ongoing_response = True # Mock send_raw_message to capture interrupt model._send_raw_message = AsyncMock() @@ -63,7 +62,6 @@ async def test_interrupt_clamps_elapsed_to_audio_length(self, model): """Test interrupt clamps elapsed time to the received audio length.""" model._send_raw_message = AsyncMock() model._audio_state_tracker.set_audio_format("pcm16") - model._ongoing_response = True # 48_000 bytes of PCM16 at 24kHz equals ~1000ms of audio. model._audio_state_tracker.on_audio_delta("item_1", 0, b"a" * 48_000) From 97dc67d47a3b3196085bb4c9fd45b4f90f3d07c6 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Sat, 31 Jan 2026 10:02:45 +0900 Subject: [PATCH 2/4] fix review comment --- src/agents/realtime/openai_realtime.py | 25 +++++++++++++------- tests/realtime/test_openai_realtime.py | 32 ++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/src/agents/realtime/openai_realtime.py b/src/agents/realtime/openai_realtime.py index a2f197fe3..57ba8d94d 100644 --- a/src/agents/realtime/openai_realtime.py +++ b/src/agents/realtime/openai_realtime.py @@ -774,15 +774,24 @@ async def _handle_ws_event(self, event: dict[str, Any]): if audio_limits is not None: _, max_audio_ms = audio_limits truncated_ms = max(int(round(effective_elapsed_ms)), 0) - if max_audio_ms is not None: - truncated_ms = min(truncated_ms, max_audio_ms) - await self._send_raw_message( - _ConversionHelper.convert_interrupt( - playback_item_id, - playback_content_index, - truncated_ms, + if max_audio_ms is not None and truncated_ms >= max_audio_ms: + logger.debug( + "Skipping truncate because playback appears complete. " + f"Item id: {playback_item_id}, " + f"elapsed ms: {effective_elapsed_ms}, " + f"content index: {playback_content_index}, " + f"audio length ms: {max_audio_ms}" + ) + else: + if max_audio_ms is not None: + truncated_ms = min(truncated_ms, max_audio_ms) + await self._send_raw_message( + _ConversionHelper.convert_interrupt( + playback_item_id, + playback_content_index, + truncated_ms, + ) ) - ) # Reset trackers so subsequent playback state queries don't # reference audio that has been interrupted client‑side. diff --git a/tests/realtime/test_openai_realtime.py b/tests/realtime/test_openai_realtime.py index 62f8b2c4d..616ccf09b 100644 --- a/tests/realtime/test_openai_realtime.py +++ b/tests/realtime/test_openai_realtime.py @@ -1,5 +1,6 @@ import asyncio import json +from datetime import datetime, timedelta from types import SimpleNamespace from typing import Any, cast from unittest.mock import AsyncMock, Mock, patch @@ -609,6 +610,37 @@ async def test_transcription_related_and_timeouts_and_speech_started(self, model assert "transcript_delta" in types assert "input_audio_timeout_triggered" in types + @pytest.mark.asyncio + async def test_speech_started_skips_truncate_when_audio_complete(self, model, monkeypatch): + model._audio_state_tracker.set_audio_format("pcm16") + model._audio_state_tracker.on_audio_delta("i1", 0, b"a" * 48_000) + state = model._audio_state_tracker.get_state("i1", 0) + assert state is not None + state.initial_received_time = datetime.now() - timedelta(seconds=5) + + monkeypatch.setattr( + model, + "_send_raw_message", + AsyncMock(), + ) + + await model._handle_ws_event( + { + "type": "input_audio_buffer.speech_started", + "event_id": "es2", + "item_id": "i1", + "audio_start_ms": 0, + "audio_end_ms": 0, + } + ) + + truncate_events = [ + call.args[0] + for call in model._send_raw_message.await_args_list + if getattr(call.args[0], "type", None) == "conversation.item.truncate" + ] + assert not truncate_events + class TestSendEventAndConfig(TestOpenAIRealtimeWebSocketModel): @pytest.mark.asyncio From 70219d0e1f322553733134ced79063ef325927e3 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Sat, 31 Jan 2026 10:11:32 +0900 Subject: [PATCH 3/4] fix review comment --- src/agents/realtime/openai_realtime.py | 6 ++++- tests/realtime/test_openai_realtime.py | 33 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/agents/realtime/openai_realtime.py b/src/agents/realtime/openai_realtime.py index 57ba8d94d..5bebf159b 100644 --- a/src/agents/realtime/openai_realtime.py +++ b/src/agents/realtime/openai_realtime.py @@ -774,7 +774,11 @@ async def _handle_ws_event(self, event: dict[str, Any]): if audio_limits is not None: _, max_audio_ms = audio_limits truncated_ms = max(int(round(effective_elapsed_ms)), 0) - if max_audio_ms is not None and truncated_ms >= max_audio_ms: + if ( + max_audio_ms is not None + and truncated_ms >= max_audio_ms + and not self._ongoing_response + ): logger.debug( "Skipping truncate because playback appears complete. " f"Item id: {playback_item_id}, " diff --git a/tests/realtime/test_openai_realtime.py b/tests/realtime/test_openai_realtime.py index 616ccf09b..780f3fc52 100644 --- a/tests/realtime/test_openai_realtime.py +++ b/tests/realtime/test_openai_realtime.py @@ -641,6 +641,39 @@ async def test_speech_started_skips_truncate_when_audio_complete(self, model, mo ] assert not truncate_events + @pytest.mark.asyncio + async def test_speech_started_truncates_when_response_ongoing(self, model, monkeypatch): + model._audio_state_tracker.set_audio_format("pcm16") + model._audio_state_tracker.on_audio_delta("i1", 0, b"a" * 48_000) + state = model._audio_state_tracker.get_state("i1", 0) + assert state is not None + state.initial_received_time = datetime.now() - timedelta(seconds=5) + model._ongoing_response = True + + monkeypatch.setattr( + model, + "_send_raw_message", + AsyncMock(), + ) + + await model._handle_ws_event( + { + "type": "input_audio_buffer.speech_started", + "event_id": "es3", + "item_id": "i1", + "audio_start_ms": 0, + "audio_end_ms": 0, + } + ) + + truncate_events = [ + call.args[0] + for call in model._send_raw_message.await_args_list + if getattr(call.args[0], "type", None) == "conversation.item.truncate" + ] + assert truncate_events + assert truncate_events[0].audio_end_ms == 1000 + class TestSendEventAndConfig(TestOpenAIRealtimeWebSocketModel): @pytest.mark.asyncio From c7ba0ebd16e97d06f75d1e0508297fa1a2fbe379 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 3 Feb 2026 13:24:27 +0900 Subject: [PATCH 4/4] fix review comment --- src/agents/realtime/openai_realtime.py | 15 ++++++----- tests/realtime/test_playback_tracker.py | 33 +++++++++++++++++++------ 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/agents/realtime/openai_realtime.py b/src/agents/realtime/openai_realtime.py index 5bebf159b..26278a674 100644 --- a/src/agents/realtime/openai_realtime.py +++ b/src/agents/realtime/openai_realtime.py @@ -557,14 +557,13 @@ async def _send_interrupt(self, event: RealtimeModelSendInterrupt) -> None: if audio_limits is not None: _, max_audio_ms = audio_limits truncated_ms = max(int(elapsed_ms), 0) - if max_audio_ms is not None: - truncated_ms = min(truncated_ms, max_audio_ms) - converted = _ConversionHelper.convert_interrupt( - current_item_id, - current_item_content_index, - truncated_ms, - ) - await self._send_raw_message(converted) + if self._ongoing_response or max_audio_ms is None or truncated_ms < max_audio_ms: + converted = _ConversionHelper.convert_interrupt( + current_item_id, + current_item_content_index, + truncated_ms, + ) + await self._send_raw_message(converted) else: logger.debug( "Didn't interrupt bc elapsed ms is < 0. " diff --git a/tests/realtime/test_playback_tracker.py b/tests/realtime/test_playback_tracker.py index 16de96287..bf442ec75 100644 --- a/tests/realtime/test_playback_tracker.py +++ b/tests/realtime/test_playback_tracker.py @@ -1,4 +1,3 @@ -from datetime import datetime, timedelta from unittest.mock import AsyncMock import pytest @@ -58,16 +57,36 @@ async def test_interrupt_skipped_when_no_audio_playing(self, model): model._send_raw_message.assert_not_called() @pytest.mark.asyncio - async def test_interrupt_clamps_elapsed_to_audio_length(self, model): - """Test interrupt clamps elapsed time to the received audio length.""" + async def test_interrupt_skips_when_elapsed_exceeds_audio_length(self, model): + """Test interrupt skips truncation when playback appears complete.""" model._send_raw_message = AsyncMock() model._audio_state_tracker.set_audio_format("pcm16") # 48_000 bytes of PCM16 at 24kHz equals ~1000ms of audio. model._audio_state_tracker.on_audio_delta("item_1", 0, b"a" * 48_000) - state = model._audio_state_tracker.get_state("item_1", 0) - assert state is not None - state.initial_received_time = datetime.now() - timedelta(seconds=2) + model._playback_tracker = RealtimePlaybackTracker() + model._playback_tracker.on_play_ms("item_1", 0, 2000.0) + + await model._send_interrupt(RealtimeModelSendInterrupt()) + + truncate_events = [ + call.args[0] + for call in model._send_raw_message.await_args_list + if getattr(call.args[0], "type", None) == "conversation.item.truncate" + ] + assert truncate_events == [] + + @pytest.mark.asyncio + async def test_interrupt_sends_truncate_when_ongoing_response(self, model): + """Test interrupt still truncates while response is ongoing.""" + model._ongoing_response = True + model._send_raw_message = AsyncMock() + model._audio_state_tracker.set_audio_format("pcm16") + + # 48_000 bytes of PCM16 at 24kHz equals ~1000ms of audio. + model._audio_state_tracker.on_audio_delta("item_1", 0, b"a" * 48_000) + model._playback_tracker = RealtimePlaybackTracker() + model._playback_tracker.on_play_ms("item_1", 0, 2000.0) await model._send_interrupt(RealtimeModelSendInterrupt()) @@ -77,7 +96,7 @@ async def test_interrupt_clamps_elapsed_to_audio_length(self, model): if getattr(call.args[0], "type", None) == "conversation.item.truncate" ] assert truncate_events - assert truncate_events[0].audio_end_ms == 1000 + assert truncate_events[0].audio_end_ms == 2000 def test_audio_state_accumulation_across_deltas(self): """Test ModelAudioTracker accumulates audio length across multiple deltas."""