Skip to content

Add audio transcript #880

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,19 @@ Task Connect(RealtimeHubConnection conn,
Action<string> onModelAudioDeltaReceived,
Action onModelAudioResponseDone,
Action<string> onAudioTranscriptDone,
Action<string> onModelResponseDone,
Action<List<RoleDialogModel>> onModelResponseDone,
Action<string> onConversationItemCreated,
Action<RoleDialogModel> onInputAudioTranscriptionCompleted,
Action onUserInterrupted);
Task AppenAudioBuffer(string message);

Task SendEventToModel(object message);
Task Disconnect();

Task<RealtimeSession> CreateSession(Agent agent, List<RoleDialogModel> conversations);
Task<string> UpdateInitialSession(RealtimeHubConnection conn);
Task<string> InsertConversationItem(RoleDialogModel message);
Task UpdateInitialSession(RealtimeHubConnection conn);
Task InsertConversationItem(RoleDialogModel message);
Task TriggerModelInference(string? instructions = null);
Task<List<RoleDialogModel>> OnResponsedDone(RealtimeHubConnection conn, string response);
Task<RoleDialogModel> OnConversationItemCreated(RealtimeHubConnection conn, string response);
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ public class RealtimeHubConnection
{
public string Event { get; set; } = null!;
public string StreamId { get; set; } = null!;
public string EntryAgentId { get; set; } = null!;
public string ConversationId { get; set; } = null!;
public string Data { get; set; } = string.Empty;
public string Model { get; set; } = null!;
Expand Down
63 changes: 39 additions & 24 deletions src/Infrastructure/BotSharp.Core/Realtime/RealtimeHub.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,15 @@ private async Task ConnectToModel(IRealTimeCompletion completer, WebSocket userW
{
var hookProvider = _services.GetRequiredService<ConversationHookProvider>();
var storage = _services.GetRequiredService<IConversationStorage>();

var convService = _services.GetRequiredService<IConversationService>();
convService.SetConversationId(conn.ConversationId, []);
var conversation = await convService.GetConversation(conn.ConversationId);

var agentService = _services.GetRequiredService<IAgentService>();
var agent = await agentService.LoadAgent(conversation.AgentId);
conn.EntryAgentId = agent.Id;

var routing = _services.GetRequiredService<IRoutingService>();
var dialogs = convService.GetDialogHistory();
routing.Context.SetDialogs(dialogs);
Expand All @@ -77,19 +81,18 @@ await completer.Connect(conn,
onModelReady: async () =>
{
// Control initial session
var data = await completer.UpdateInitialSession(conn);
await completer.SendEventToModel(data);
await completer.UpdateInitialSession(conn);


// Add dialog history
foreach (var item in dialogs)
{
var dialogItem = await completer.InsertConversationItem(item);
await completer.SendEventToModel(data);
await completer.InsertConversationItem(item);
}

if (dialogs.LastOrDefault()?.Role == AgentRole.Assistant)
{
await completer.TriggerModelInference($"Rephase your last response:\r\n{dialogs.LastOrDefault()?.Content}");
// await completer.TriggerModelInference($"Rephase your last response:\r\n{dialogs.LastOrDefault()?.Content}");
}
else
{
Expand All @@ -108,37 +111,49 @@ await completer.Connect(conn,
},
onAudioTranscriptDone: async transcript =>
{
var message = new RoleDialogModel(AgentRole.Assistant, transcript);

// append transcript to conversation
storage.Append(conn.ConversationId, message);

foreach (var hook in hookProvider.HooksOrderByPriority)
{
hook.SetAgent(agent)
.SetConversation(conversation);

if (!string.IsNullOrEmpty(transcript))
{
await hook.OnMessageReceived(message);
}
}
},
onModelResponseDone: async response =>
onModelResponseDone: async messages =>
{
var messages = await completer.OnResponsedDone(conn, response);
foreach (var message in messages)
{
// Invoke function
if (message.FunctionName != null)
if (message.MessageType == "function_call")
{
await routing.InvokeFunction(message.FunctionName, message);
var data = await completer.InsertConversationItem(message);
await completer.SendEventToModel(data);
message.Role = AgentRole.Function;
await completer.InsertConversationItem(message);
await completer.TriggerModelInference("Reply based on the function's output.");
}
else
{
// append transcript to conversation
storage.Append(conn.ConversationId, message);
dialogs.Add(message);

foreach (var hook in hookProvider.HooksOrderByPriority)
{
hook.SetAgent(agent)
.SetConversation(conversation);

if (!string.IsNullOrEmpty(message.Content))
{
await hook.OnMessageReceived(message);
}
}
}
}
},
onConversationItemCreated: async response =>
{

},
onInputAudioTranscriptionCompleted: async message =>
{
// append transcript to conversation
storage.Append(conn.ConversationId, message);
dialogs.Add(message);
},
onUserInterrupted: async () =>
{
var data = conn.OnModelUserInterrupted();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public async Task<bool> InvokeFunction(string name, RoleDialogModel message)
}

// Set result to original message
message.Role = AgentRole.Function;
message.Role = clonedMessage.Role;
message.PostbackFunctionName = clonedMessage.PostbackFunctionName;
message.CurrentAgentId = clonedMessage.CurrentAgentId;
message.Content = clonedMessage.Content;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
namespace BotSharp.Plugin.OpenAI.Models.Realtime;

/// <summary>
/// Server event payload that carries a single conversation item under the
/// JSON key <c>item</c>.
/// </summary>
public class ConversationItemCreated : ServerEventResponse
{
    /// <summary>
    /// The conversation item described by the event. Starts as an empty
    /// <see cref="ConversationItemBody"/> so the property itself is never
    /// null before deserialization assigns it.
    /// </summary>
    [JsonPropertyName("item")]
    public ConversationItemBody Item { get; set; } = new ConversationItemBody();
}

/// <summary>
/// Body of a conversation item: its identifier, item type, speaker role and
/// content parts, mapped from the JSON keys <c>id</c>, <c>type</c>,
/// <c>role</c> and <c>content</c>.
/// </summary>
public class ConversationItemBody
{
    /// <summary>Value of the JSON <c>id</c> field.</summary>
    [JsonPropertyName("id")]
    public string Id { get; set; } = null!;

    /// <summary>Value of the JSON <c>type</c> field.</summary>
    [JsonPropertyName("type")]
    public string Type { get; set; } = null!;

    /// <summary>Value of the JSON <c>role</c> field.</summary>
    [JsonPropertyName("role")]
    public string Role { get; set; } = null!;

    /// <summary>
    /// Content parts of the item; an empty array until populated.
    /// </summary>
    [JsonPropertyName("content")]
    public ConversationItemContent[] Content { get; set; } = [];
}

/// <summary>
/// One content part of a conversation item, mapped from the JSON keys
/// <c>type</c>, <c>transcript</c> and <c>audio</c>.
/// </summary>
public class ConversationItemContent
{
    /// <summary>Value of the JSON <c>type</c> field.</summary>
    [JsonPropertyName("type")]
    public string Type { get; set; } = null!;

    /// <summary>
    /// Value of the JSON <c>transcript</c> field. Remains null when the
    /// payload does not include it.
    /// </summary>
    [JsonPropertyName("transcript")]
    public string Transcript { get; set; } = null!;

    /// <summary>
    /// Value of the JSON <c>audio</c> field. Remains null when the payload
    /// does not include it.
    /// </summary>
    [JsonPropertyName("audio")]
    public string Audio { get; set; } = null!;
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ public class RealtimeSessionBody
[JsonPropertyName("output_audio_format")]
public string OutputAudioFormat { get; set; } = "pcm16";

[JsonPropertyName("input_audio_transcription")]
public InputAudioTranscription InputAudioTranscription { get; set; } = new();

[JsonPropertyName("instructions")]
public string Instructions { get; set; } = "You are a friendly assistant.";

Expand Down Expand Up @@ -63,4 +66,10 @@ public class RealtimeSessionTurnDetection

[JsonPropertyName("type")]
public string Type { get; set; } = "server_vad";
}

/// <summary>
/// Settings object serialized under the session's
/// <c>input_audio_transcription</c> key; it carries only the name of the
/// transcription model.
/// </summary>
public class InputAudioTranscription
{
    /// <summary>
    /// Transcription model name, written to the JSON <c>model</c> field.
    /// No default is supplied; the value is null until a caller sets it.
    /// </summary>
    [JsonPropertyName("model")]
    public string Model { get; set; } = null!;
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ public async Task Connect(RealtimeHubConnection conn,
Action<string> onModelAudioDeltaReceived,
Action onModelAudioResponseDone,
Action<string> onAudioTranscriptDone,
Action<string> onModelResponseDone,
Action<List<RoleDialogModel>> onModelResponseDone,
Action<string> onConversationItemCreated,
Action<RoleDialogModel> onInputAudioTranscriptionCompleted,
Action onUserInterrupted)
{
var settingsService = _services.GetRequiredService<ILlmProviderService>();
Expand All @@ -57,10 +59,13 @@ public async Task Connect(RealtimeHubConnection conn,
onModelReady();

// Receive a message
_ = ReceiveMessage(onModelAudioDeltaReceived,
_ = ReceiveMessage(conn,
onModelAudioDeltaReceived,
onModelAudioResponseDone,
onAudioTranscriptDone,
onModelResponseDone,
onConversationItemCreated,
onInputAudioTranscriptionCompleted,
onUserInterrupted);
}
}
Expand Down Expand Up @@ -94,10 +99,13 @@ await SendEventToModel(new
});
}

private async Task ReceiveMessage(Action<string> onModelAudioDeltaReceived,
private async Task ReceiveMessage(RealtimeHubConnection conn,
Action<string> onModelAudioDeltaReceived,
Action onModelAudioResponseDone,
Action<string> onAudioTranscriptDone,
Action<string> onModelResponseDone,
Action<List<RoleDialogModel>> onModelResponseDone,
Action<string> onConversationItemCreated,
Action<RoleDialogModel> onInputAudioTranscriptionCompleted,
Action onUserInterrupted)
{
var buffer = new byte[1024 * 1024 * 1];
Expand Down Expand Up @@ -158,7 +166,20 @@ private async Task ReceiveMessage(Action<string> onModelAudioDeltaReceived,
else if (response.Type == "response.done")
{
_logger.LogInformation($"{response.Type}: {receivedText}");
onModelResponseDone(receivedText);
await Task.Delay(1000);
var messages = await OnResponsedDone(conn, receivedText);
onModelResponseDone(messages);
}
else if (response.Type == "conversation.item.created")
{
_logger.LogInformation($"{response.Type}: {receivedText}");
onConversationItemCreated(receivedText);
}
else if (response.Type == "conversation.item.input_audio_transcription.completed")
{
_logger.LogInformation($"{response.Type}: {receivedText}");
var message = await OnInputAudioTranscriptionCompleted(conn, receivedText);
onInputAudioTranscriptionCompleted(message);
}
else if (response.Type == "input_audio_buffer.speech_started")
{
Expand Down Expand Up @@ -226,7 +247,7 @@ public async Task<RealtimeSession> CreateSession(Agent agent, List<RoleDialogMod
return session;
}

public async Task<string> UpdateInitialSession(RealtimeHubConnection conn)
public async Task UpdateInitialSession(RealtimeHubConnection conn)
{
var convService = _services.GetRequiredService<IConversationService>();
var conv = await convService.GetConversation(conn.ConversationId);
Expand All @@ -247,6 +268,10 @@ public async Task<string> UpdateInitialSession(RealtimeHubConnection conn)
{
InputAudioFormat = "g711_ulaw",
OutputAudioFormat = "g711_ulaw",
InputAudioTranscription = new InputAudioTranscription
{
Model = "whisper-1",
},
Voice = "alloy",
Instructions = instruction,
ToolChoice = "auto",
Expand All @@ -265,10 +290,10 @@ public async Task<string> UpdateInitialSession(RealtimeHubConnection conn)
}
};

return JsonSerializer.Serialize(sessionUpdate);
await SendEventToModel(sessionUpdate);
}

public async Task<string> InsertConversationItem(RoleDialogModel message)
public async Task InsertConversationItem(RoleDialogModel message)
{
if (message.Role == AgentRole.Function)
{
Expand All @@ -282,10 +307,10 @@ public async Task<string> InsertConversationItem(RoleDialogModel message)
output = message.Content
}
};
return JsonSerializer.Serialize(functionConversationItem);

await SendEventToModel(functionConversationItem);
}
else if (message.Role == AgentRole.User ||
message.Role == AgentRole.Assistant)
else if (message.Role == AgentRole.Assistant)
{
var conversationItem = new
{
Expand All @@ -305,7 +330,29 @@ public async Task<string> InsertConversationItem(RoleDialogModel message)
}
};

return JsonSerializer.Serialize(conversationItem);
await SendEventToModel(conversationItem);
}
else if (message.Role == AgentRole.User)
{
var conversationItem = new
{
type = "conversation.item.create",
item = new
{
type = "message",
role = message.Role,
content = new object[]
{
new
{
type = "input_text",
text = message.Content
}
}
}
};

await SendEventToModel(conversationItem);
}
else
{
Expand Down Expand Up @@ -507,16 +554,42 @@ public async Task<List<RoleDialogModel>> OnResponsedDone(RealtimeHubConnection c
{
if (output.Type == "function_call")
{
outputs.Add(new RoleDialogModel(AgentRole.Assistant, output.Arguments)
outputs.Add(new RoleDialogModel(output.Role, output.Arguments)
{
CurrentAgentId = conn.EntryAgentId,
FunctionName = output.Name,
FunctionArgs = output.Arguments,
MessageType = output.Type,
ToolCallId = output.CallId
});
}
else if (output.Type == "message")
{
var content = output.Content.FirstOrDefault();

outputs.Add(new RoleDialogModel(output.Role, content.Transcript)
{
CurrentAgentId = conn.EntryAgentId
});
}
}

return outputs;
}

/// <summary>
/// Builds the user-role dialog message for a completed input-audio
/// transcription event.
/// </summary>
/// <param name="conn">
/// Connection whose <c>EntryAgentId</c> is copied to the message's
/// <c>CurrentAgentId</c>.
/// </param>
/// <param name="response">
/// Raw JSON text of the event; deserialized as <c>ResponseAudioTranscript</c>.
/// </param>
/// <returns>
/// A message with role <c>AgentRole.User</c> whose content is the transcript,
/// or an empty string when the payload carries no transcript.
/// </returns>
public async Task<RoleDialogModel> OnInputAudioTranscriptionCompleted(RealtimeHubConnection conn, string response)
{
    // Deserialize returns null for a literal JSON "null", and Transcript stays
    // unset when the field is missing. Fall back to an empty transcript rather
    // than throwing a NullReferenceException.
    var data = JsonSerializer.Deserialize<ResponseAudioTranscript>(response);
    var transcript = data?.Transcript ?? string.Empty;

    return new RoleDialogModel(AgentRole.User, transcript)
    {
        CurrentAgentId = conn.EntryAgentId
    };
}

/// <summary>
/// Builds a dialog message from a conversation-item-created event payload.
/// </summary>
/// <param name="conn">
/// Connection whose <c>EntryAgentId</c> is copied to the message's
/// <c>CurrentAgentId</c>.
/// </param>
/// <param name="response">
/// Raw JSON text of the event; deserialized as <c>ConversationItemCreated</c>.
/// </param>
/// <returns>
/// A message carrying the item's role and the transcript of its first
/// content part, or an empty string when no transcript is present.
/// </returns>
public async Task<RoleDialogModel> OnConversationItemCreated(RealtimeHubConnection conn, string response)
{
    // Deserialize returns null for a literal JSON "null", and "item"/"content"
    // may themselves be null in the payload. Substitute an empty item so the
    // lookups below never dereference null.
    var item = JsonSerializer.Deserialize<ConversationItemCreated>(response)?.Item
        ?? new ConversationItemBody();
    var transcript = item.Content?.FirstOrDefault()?.Transcript ?? string.Empty;

    return new RoleDialogModel(item.Role, transcript)
    {
        // Stamp the entry agent, as the other message-building handlers do.
        CurrentAgentId = conn.EntryAgentId
    };
}
}
Loading