From 65c2c0f893bf7110ed9b218ecdeb3c7106742708 Mon Sep 17 00:00:00 2001 From: Haiping Chen <101423@smsassist.com> Date: Mon, 10 Feb 2025 21:51:45 -0600 Subject: [PATCH 1/3] message.Role = clonedMessage.Role --- .../BotSharp.Core/Routing/RoutingService.InvokeFunction.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Infrastructure/BotSharp.Core/Routing/RoutingService.InvokeFunction.cs b/src/Infrastructure/BotSharp.Core/Routing/RoutingService.InvokeFunction.cs index 4d4cd9a89..fb074dfc4 100644 --- a/src/Infrastructure/BotSharp.Core/Routing/RoutingService.InvokeFunction.cs +++ b/src/Infrastructure/BotSharp.Core/Routing/RoutingService.InvokeFunction.cs @@ -49,7 +49,7 @@ public async Task InvokeFunction(string name, RoleDialogModel message) } // Set result to original message - message.Role = AgentRole.Function; + message.Role = clonedMessage.Role; message.PostbackFunctionName = clonedMessage.PostbackFunctionName; message.CurrentAgentId = clonedMessage.CurrentAgentId; message.Content = clonedMessage.Content; From 572205d6284d8ff220f0f2d504b4db6e387b293e Mon Sep 17 00:00:00 2001 From: Haiping Chen <101423@smsassist.com> Date: Tue, 11 Feb 2025 17:27:07 -0600 Subject: [PATCH 2/3] onInputAudioTranscriptionCompleted --- .../MLTasks/IRealTimeCompletion.cs | 9 +- .../Realtime/Models/RealtimeHubConnection.cs | 1 + .../BotSharp.Core/Realtime/RealtimeHub.cs | 63 ++++++----- .../Realtime/ConversationItemCreated.cs | 33 ++++++ .../Models/Realtime/RealtimeSessionBody.cs | 9 ++ .../Realtime/RealTimeCompletionProvider.cs | 101 +++++++++++++++--- .../Controllers/TwilioStreamController.cs | 32 +++--- .../Controllers/TwilioVoiceController.cs | 2 +- .../Functions/HandleOutboundPhoneCallFn.cs | 12 +-- .../Services/TwilioService.cs | 2 +- 10 files changed, 200 insertions(+), 64 deletions(-) create mode 100644 src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ConversationItemCreated.cs diff --git a/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs index c7134693b..8e1d11d55 100644 --- a/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs +++ b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs @@ -14,7 +14,9 @@ Task Connect(RealtimeHubConnection conn, Action onModelAudioDeltaReceived, Action onModelAudioResponseDone, Action onAudioTranscriptDone, - Action onModelResponseDone, + Action> onModelResponseDone, + Action onConversationItemCreated, + Action onInputAudioTranscriptionCompleted, Action onUserInterrupted); Task AppenAudioBuffer(string message); @@ -22,8 +24,9 @@ Task Connect(RealtimeHubConnection conn, Task Disconnect(); Task CreateSession(Agent agent, List conversations); - Task UpdateInitialSession(RealtimeHubConnection conn); - Task InsertConversationItem(RoleDialogModel message); + Task UpdateInitialSession(RealtimeHubConnection conn); + Task InsertConversationItem(RoleDialogModel message); Task TriggerModelInference(string? instructions = null); Task> OnResponsedDone(RealtimeHubConnection conn, string response); + Task OnConversationItemCreated(RealtimeHubConnection conn, string response); } diff --git a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs b/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs index 3e4a1f73e..60fec1dc9 100644 --- a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs +++ b/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs @@ -4,6 +4,7 @@ public class RealtimeHubConnection { public string Event { get; set; } = null!; public string StreamId { get; set; } = null!; + public string EntryAgentId { get; set; } = null!; public string ConversationId { get; set; } = null!; public string Data { get; set; } = string.Empty; public string Model { get; set; } = null!; diff --git a/src/Infrastructure/BotSharp.Core/Realtime/RealtimeHub.cs b/src/Infrastructure/BotSharp.Core/Realtime/RealtimeHub.cs index d311fd25b..c44f74d05 100644 --- a/src/Infrastructure/BotSharp.Core/Realtime/RealtimeHub.cs +++ b/src/Infrastructure/BotSharp.Core/Realtime/RealtimeHub.cs @@ -64,11 +64,15 @@ private async Task ConnectToModel(IRealTimeCompletion completer, WebSocket userW { var hookProvider = _services.GetRequiredService(); var storage = _services.GetRequiredService(); + var convService = _services.GetRequiredService(); convService.SetConversationId(conn.ConversationId, []); var conversation = await convService.GetConversation(conn.ConversationId); + var agentService = _services.GetRequiredService(); var agent = await agentService.LoadAgent(conversation.AgentId); + conn.EntryAgentId = agent.Id; + var routing = _services.GetRequiredService(); var dialogs = convService.GetDialogHistory(); routing.Context.SetDialogs(dialogs); @@ -77,19 +81,18 @@ await completer.Connect(conn, onModelReady: async () => { // Control initial session - var data = await completer.UpdateInitialSession(conn); - await completer.SendEventToModel(data); + await completer.UpdateInitialSession(conn); + // Add dialog history foreach (var item in dialogs) { - var dialogItem = await completer.InsertConversationItem(item); - await completer.SendEventToModel(data); + await completer.InsertConversationItem(item); } if (dialogs.LastOrDefault()?.Role == AgentRole.Assistant) { - await completer.TriggerModelInference($"Rephase your last response:\r\n{dialogs.LastOrDefault()?.Content}"); + // await completer.TriggerModelInference($"Rephase your last response:\r\n{dialogs.LastOrDefault()?.Content}"); } else { @@ -108,37 +111,49 @@ await completer.Connect(conn, }, onAudioTranscriptDone: async transcript => { - var message = new RoleDialogModel(AgentRole.Assistant, transcript); - - // append transcript to conversation - storage.Append(conn.ConversationId, message); - - foreach (var hook in hookProvider.HooksOrderByPriority) - { - hook.SetAgent(agent) - .SetConversation(conversation); - if (!string.IsNullOrEmpty(transcript)) - { - await hook.OnMessageReceived(message); - } - } }, - onModelResponseDone: async response => + onModelResponseDone: async messages => { - var messages = await completer.OnResponsedDone(conn, response); foreach (var message in messages) { // Invoke function - if (message.FunctionName != null) + if (message.MessageType == "function_call") { await routing.InvokeFunction(message.FunctionName, message); - var data = await completer.InsertConversationItem(message); - await completer.SendEventToModel(data); + message.Role = AgentRole.Function; + await completer.InsertConversationItem(message); await completer.TriggerModelInference("Reply based on the function's output."); } + else + { + // append transcript to conversation + storage.Append(conn.ConversationId, message); + dialogs.Add(message); + + foreach (var hook in hookProvider.HooksOrderByPriority) + { + hook.SetAgent(agent) + .SetConversation(conversation); + + if (!string.IsNullOrEmpty(message.Content)) + { + await hook.OnMessageReceived(message); + } + } + } } }, + onConversationItemCreated: async response => + { + + }, + onInputAudioTranscriptionCompleted: async message => + { + // append transcript to conversation + storage.Append(conn.ConversationId, message); + dialogs.Add(message); + }, onUserInterrupted: async () => { var data = conn.OnModelUserInterrupted(); diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ConversationItemCreated.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ConversationItemCreated.cs new file mode 100644 index 000000000..b46d8cc7b --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/ConversationItemCreated.cs @@ -0,0 +1,33 @@ +namespace BotSharp.Plugin.OpenAI.Models.Realtime; + +public class ConversationItemCreated : ServerEventResponse +{ + [JsonPropertyName("item")] + public ConversationItemBody Item { get; set; } = new(); +} + +public class ConversationItemBody +{ + [JsonPropertyName("id")] + public string Id { get; set; } = null!; + [JsonPropertyName("type")] + public string Type { get; set; } = null!; + + [JsonPropertyName("role")] + public string Role { get; set;} = null!; + + [JsonPropertyName("content")] + public ConversationItemContent[] Content { get; set; } = []; +} + +public class ConversationItemContent +{ + [JsonPropertyName("type")] + public string Type { get; set; } = null!; + + [JsonPropertyName("transcript")] + public string Transcript { get; set; } = null!; + + [JsonPropertyName("audio")] + public string Audio { get; set; } = null!; +} diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs index a5ede20f9..1aca181b9 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs @@ -28,6 +28,9 @@ public class RealtimeSessionBody [JsonPropertyName("output_audio_format")] public string OutputAudioFormat { get; set; } = "pcm16"; + [JsonPropertyName("input_audio_transcription")] + public InputAudioTranscription InputAudioTranscription { get; set; } = new(); + [JsonPropertyName("instructions")] public string Instructions { get; set; } = "You are a friendly assistant."; @@ -63,4 +66,10 @@ public class RealtimeSessionTurnDetection [JsonPropertyName("type")] public string Type { get; set; } = "server_vad"; +} + +public class InputAudioTranscription +{ + [JsonPropertyName("model")] + public string Model { get; set; } = null!; } \ No newline at end of file diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs index 2a59b4420..9b981cb17 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs @@ -40,7 +40,9 @@ public async Task Connect(RealtimeHubConnection conn, Action onModelAudioDeltaReceived, Action onModelAudioResponseDone, Action onAudioTranscriptDone, - Action onModelResponseDone, + Action> onModelResponseDone, + Action onConversationItemCreated, + Action onInputAudioTranscriptionCompleted, Action onUserInterrupted) { var settingsService = _services.GetRequiredService(); @@ -57,10 +59,13 @@ public async Task Connect(RealtimeHubConnection conn, onModelReady(); // Receive a message - _ = ReceiveMessage(onModelAudioDeltaReceived, + _ = ReceiveMessage(conn, + onModelAudioDeltaReceived, onModelAudioResponseDone, onAudioTranscriptDone, onModelResponseDone, + onConversationItemCreated, + onInputAudioTranscriptionCompleted, onUserInterrupted); } } @@ -94,10 +99,13 @@ await SendEventToModel(new }); } - private async Task ReceiveMessage(Action onModelAudioDeltaReceived, + private async Task ReceiveMessage(RealtimeHubConnection conn, + Action onModelAudioDeltaReceived, Action onModelAudioResponseDone, Action onAudioTranscriptDone, - Action onModelResponseDone, + Action> onModelResponseDone, + Action onConversationItemCreated, + Action onInputAudioTranscriptionCompleted, Action onUserInterrupted) { var buffer = new byte[1024 * 1024 * 1]; @@ -158,7 +166,20 @@ private async Task ReceiveMessage(Action onModelAudioDeltaReceived, else if (response.Type == "response.done") { _logger.LogInformation($"{response.Type}: {receivedText}"); - onModelResponseDone(receivedText); + await Task.Delay(1000); + var messages = await OnResponsedDone(conn, receivedText); + onModelResponseDone(messages); + } + else if (response.Type == "conversation.item.created") + { + _logger.LogInformation($"{response.Type}: {receivedText}"); + onConversationItemCreated(receivedText); + } + else if (response.Type == "conversation.item.input_audio_transcription.completed") + { + _logger.LogInformation($"{response.Type}: {receivedText}"); + var message = await OnInputAudioTranscriptionCompleted(conn, receivedText); + onInputAudioTranscriptionCompleted(message); } else if (response.Type == "input_audio_buffer.speech_started") { @@ -226,7 +247,7 @@ public async Task CreateSession(Agent agent, List UpdateInitialSession(RealtimeHubConnection conn) + public async Task UpdateInitialSession(RealtimeHubConnection conn) { var convService = _services.GetRequiredService(); var conv = await convService.GetConversation(conn.ConversationId); @@ -247,6 +268,10 @@ public async Task UpdateInitialSession(RealtimeHubConnection conn) { InputAudioFormat = "g711_ulaw", OutputAudioFormat = "g711_ulaw", + InputAudioTranscription = new InputAudioTranscription + { + Model = "whisper-1", + }, Voice = "alloy", Instructions = instruction, ToolChoice = "auto", @@ -265,10 +290,10 @@ public async Task UpdateInitialSession(RealtimeHubConnection conn) } }; - return JsonSerializer.Serialize(sessionUpdate); + await SendEventToModel(sessionUpdate); } - public async Task InsertConversationItem(RoleDialogModel message) + public async Task InsertConversationItem(RoleDialogModel message) { if (message.Role == AgentRole.Function) { @@ -282,10 +307,10 @@ public async Task InsertConversationItem(RoleDialogModel message) output = message.Content } }; - return JsonSerializer.Serialize(functionConversationItem); + + await SendEventToModel(functionConversationItem); } - else if (message.Role == AgentRole.User || - message.Role == AgentRole.Assistant) + else if (message.Role == AgentRole.Assistant) { var conversationItem = new { @@ -305,7 +330,29 @@ public async Task InsertConversationItem(RoleDialogModel message) } }; - return JsonSerializer.Serialize(conversationItem); + await SendEventToModel(conversationItem); + } + else if (message.Role == AgentRole.User) + { + var conversationItem = new + { + type = "conversation.item.create", + item = new + { + type = "message", + role = message.Role, + content = new object[] + { + new + { + type = "input_text", + text = message.Content + } + } + } + }; + + await SendEventToModel(conversationItem); } else { @@ -507,16 +554,42 @@ public async Task> OnResponsedDone(RealtimeHubConnection c { if (output.Type == "function_call") { - outputs.Add(new RoleDialogModel(AgentRole.Assistant, output.Arguments) + outputs.Add(new RoleDialogModel(output.Role, output.Arguments) { + CurrentAgentId = conn.EntryAgentId, FunctionName = output.Name, FunctionArgs = output.Arguments, - MessageType = output.Type, ToolCallId = output.CallId }); } + else if (output.Type == "message") + { + var content = output.Content.FirstOrDefault(); + + outputs.Add(new RoleDialogModel(output.Role, content.Transcript) + { + CurrentAgentId = conn.EntryAgentId + }); + } } return outputs; } + + public async Task OnInputAudioTranscriptionCompleted(RealtimeHubConnection conn, string response) + { + var data = JsonSerializer.Deserialize(response); + return new RoleDialogModel(AgentRole.User, data.Transcript) + { + CurrentAgentId = conn.EntryAgentId + }; + } + + public async Task OnConversationItemCreated(RealtimeHubConnection conn, string response) + { + var item = JsonSerializer.Deserialize(response).Item; + var message = new RoleDialogModel(item.Role, item.Content.FirstOrDefault()?.Transcript); + + return message; + } } diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioStreamController.cs b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioStreamController.cs index e398ff722..e6de9fc5e 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioStreamController.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioStreamController.cs @@ -39,10 +39,15 @@ public async Task InitiateStreamConversation(ConversationalVoiceReq VoiceResponse response = null; var instruction = new ConversationalVoiceResponse { - // SpeechPaths = ["twilio/welcome.mp3"], + SpeechPaths = [], ActionOnEmptyResult = true }; + if (_context.HttpContext.Request.Query.ContainsKey("init_audio_file")) + { + instruction.SpeechPaths.Add(_context.HttpContext.Request.Query["init_audio_file"]); + } + if (_context.HttpContext.Request.Query.ContainsKey("conversation_id")) { request.ConversationId = _context.HttpContext.Request.Query["conversation_id"]; @@ -81,9 +86,18 @@ private async Task InitConversation(ConversationalVoiceRequest request) { var convService = _services.GetRequiredService(); var conversation = await convService.GetConversation(request.ConversationId); - if (conversation != null) + if (conversation == null) { - return; + var conv = new Conversation + { + Id = request.CallSid, + AgentId = _settings.AgentId, + Channel = ConversationChannel.Phone, + Title = $"Phone call from {request.From}", + Tags = [], + }; + + conversation = await convService.NewConversation(conv); } var states = new List @@ -92,17 +106,7 @@ private async Task InitConversation(ConversationalVoiceRequest request) new("calling_phone", request.From) }; - var conv = new Conversation - { - Id = request.CallSid, - AgentId = _settings.AgentId, - Channel = ConversationChannel.Phone, - Title = $"Phone call from {request.From}", - Tags = [], - }; - - conv = await convService.NewConversation(conv); - convService.SetConversationId(conv.Id, states); + convService.SetConversationId(conversation.Id, states); convService.SaveStates(); } } diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs index 191446731..069b0d92c 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs @@ -34,7 +34,7 @@ public TwilioVoiceController(TwilioSetting settings, IServiceProvider services, /// /// /// - [ValidateRequest] + // [ValidateRequest] [HttpPost("twilio/voice/welcome")] public async Task InitiateConversation(ConversationalVoiceRequest request) { diff --git a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HandleOutboundPhoneCallFn.cs b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HandleOutboundPhoneCallFn.cs index ea9b08a1f..12b827aff 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HandleOutboundPhoneCallFn.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HandleOutboundPhoneCallFn.cs @@ -68,7 +68,7 @@ public async Task Execute(RoleDialogModel message) var conversationId = newConv.Id; convStorage.Append(conversationId, new List { - new RoleDialogModel(AgentRole.User, "Hi, I'm calling to check my work order quote status, please help me locate my work order number and let me know what to do next.") + new RoleDialogModel(AgentRole.User, "Hi") { CurrentAgentId = entryAgentId }, @@ -80,13 +80,13 @@ public async Task Execute(RoleDialogModel message) states.SetState(StateConst.SUB_CONVERSATION_ID, conversationId); // Generate audio - /*var completion = CompletionProvider.GetAudioCompletion(_services, "openai", "tts-1"); + var completion = CompletionProvider.GetAudioCompletion(_services, "openai", "tts-1"); var data = await completion.GenerateAudioFromTextAsync(args.InitialMessage); var fileName = $"intial.mp3"; fileStorage.SaveSpeechFile(conversationId, fileName, data); // Call phone number - await sessionManager.SetAssistantReplyAsync(conversationId, 0, new AssistantMessage + /*await sessionManager.SetAssistantReplyAsync(conversationId, 0, new AssistantMessage { Content = args.InitialMessage, SpeechFileName = fileName @@ -94,11 +94,9 @@ public async Task Execute(RoleDialogModel message) var call = await CallResource.CreateAsync( // url: new Uri($"{_twilioSetting.CallbackHost}/twilio/voice/init-call?conversationId={conversationId}"), - url: new Uri($"{_twilioSetting.CallbackHost}/twilio/stream?conversation_id={conversationId}"), + url: new Uri($"{_twilioSetting.CallbackHost}/twilio/stream?conversation_id={conversationId}&init_audio_file={fileName}"), to: new PhoneNumber(args.PhoneNumber), - from: new PhoneNumber(_twilioSetting.PhoneNumber), - asyncAmd: "true", - machineDetection: "DetectMessageEnd"); + from: new PhoneNumber(_twilioSetting.PhoneNumber)); message.Content = $"The generated phone message: {args.InitialMessage}." ?? message.Content; message.StopCompletion = true; diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs b/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs index 99595b5f0..1ba7120c4 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs @@ -189,7 +189,7 @@ public VoiceResponse ReturnBidirectionalMediaStreamsInstructions(string conversa { foreach (var speechPath in conversationalVoiceResponse.SpeechPaths) { - response.Play(new Uri($"{_settings.CallbackHost}/{speechPath}")); + response.Play(new Uri($"{_settings.CallbackHost}/twilio/voice/speeches/{conversationId}/{speechPath}")); } } var connect = new Connect(); From 7fa6bc18c33332c26f1be217afd2e3eaa688b667 Mon Sep 17 00:00:00 2001 From: Haiping Chen <101423@smsassist.com> Date: Wed, 12 Feb 2025 12:29:06 -0600 Subject: [PATCH 3/3] enable ValidateRequest --- .../BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs index 069b0d92c..191446731 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs @@ -34,7 +34,7 @@ public TwilioVoiceController(TwilioSetting settings, IServiceProvider services, /// /// /// - // [ValidateRequest] + [ValidateRequest] [HttpPost("twilio/voice/welcome")] public async Task InitiateConversation(ConversationalVoiceRequest request) {