diff --git a/Assets/Prefabs/DemoIntegration.prefab b/Assets/Prefabs/DemoIntegration.prefab index ceafb60..9be27b0 100644 --- a/Assets/Prefabs/DemoIntegration.prefab +++ b/Assets/Prefabs/DemoIntegration.prefab @@ -134,6 +134,140 @@ MonoBehaviour: m_hasFontAssetChanged: 0 m_baseMaterial: {fileID: 0} m_maskOffset: {x: 0, y: 0, z: 0, w: 0} +--- !u!1 &348816848496444118 +GameObject: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + serializedVersion: 6 + m_Component: + - component: {fileID: 4243238183943022742} + - component: {fileID: 3953792458671070900} + - component: {fileID: 2621590784734090863} + m_Layer: 5 + m_Name: VADEnergyText + m_TagString: Untagged + m_Icon: {fileID: 0} + m_NavMeshLayer: 0 + m_StaticEditorFlags: 0 + m_IsActive: 1 +--- !u!224 &4243238183943022742 +RectTransform: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 348816848496444118} + m_LocalRotation: {x: -0, y: -0, z: -0, w: 1} + m_LocalPosition: {x: 0, y: 0, z: 0} + m_LocalScale: {x: 1, y: 1, z: 1} + m_ConstrainProportionsScale: 0 + m_Children: [] + m_Father: {fileID: 5337751238952555557} + m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} + m_AnchorMin: {x: 0, y: 1} + m_AnchorMax: {x: 0, y: 1} + m_AnchoredPosition: {x: 868, y: -209.6} + m_SizeDelta: {x: 60, y: 32} + m_Pivot: {x: 0.5, y: 0.5} +--- !u!222 &3953792458671070900 +CanvasRenderer: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 348816848496444118} + m_CullTransparentMesh: 1 +--- !u!114 &2621590784734090863 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 348816848496444118} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 11500000, 
guid: f4688fdb7df04437aeb418b961361dc5, type: 3} + m_Name: + m_EditorClassIdentifier: + m_Material: {fileID: 0} + m_Color: {r: 1, g: 1, b: 1, a: 1} + m_RaycastTarget: 1 + m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0} + m_Maskable: 1 + m_OnCullStateChanged: + m_PersistentCalls: + m_Calls: [] + m_text: + m_isRightToLeft: 0 + m_fontAsset: {fileID: 11400000, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2} + m_sharedMaterial: {fileID: 2180264, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2} + m_fontSharedMaterials: [] + m_fontMaterial: {fileID: 0} + m_fontMaterials: [] + m_fontColor32: + serializedVersion: 2 + rgba: 4294925056 + m_fontColor: {r: 0, g: 0.35566974, b: 1, a: 1} + m_enableVertexGradient: 0 + m_colorMode: 3 + m_fontColorGradient: + topLeft: {r: 1, g: 1, b: 1, a: 1} + topRight: {r: 1, g: 1, b: 1, a: 1} + bottomLeft: {r: 1, g: 1, b: 1, a: 1} + bottomRight: {r: 1, g: 1, b: 1, a: 1} + m_fontColorGradientPreset: {fileID: 0} + m_spriteAsset: {fileID: 0} + m_tintAllSprites: 0 + m_StyleSheet: {fileID: 0} + m_TextStyleHashCode: -1183493901 + m_overrideHtmlColors: 0 + m_faceColor: + serializedVersion: 2 + rgba: 4294967295 + m_fontSize: 12 + m_fontSizeBase: 12 + m_fontWeight: 400 + m_enableAutoSizing: 0 + m_fontSizeMin: 0 + m_fontSizeMax: 0 + m_fontStyle: 0 + m_HorizontalAlignment: 1 + m_VerticalAlignment: 512 + m_textAlignment: 65535 + m_characterSpacing: 0 + m_wordSpacing: 0 + m_lineSpacing: 0 + m_lineSpacingMax: 0 + m_paragraphSpacing: 0 + m_charWidthMaxAdj: 0 + m_enableWordWrapping: 0 + m_wordWrappingRatios: 0.4 + m_overflowMode: 0 + m_linkedTextComponent: {fileID: 0} + parentLinkedComponent: {fileID: 0} + m_enableKerning: 0 + m_enableExtraPadding: 0 + checkPaddingRequired: 0 + m_isRichText: 1 + m_parseCtrlCharacters: 1 + m_isOrthographic: 1 + m_isCullingEnabled: 0 + m_horizontalMapping: 0 + m_verticalMapping: 0 + m_uvLineOffset: 0 + m_geometrySortingOrder: 0 + m_IsTextObjectScaleStatic: 0 + m_VertexBufferAutoSizeReduction: 0 + m_useMaxVisibleDescender: 1 + 
m_pageToDisplay: 1 + m_margin: {x: 0, y: 0, z: 0, w: 0} + m_isUsingLegacyAnimationComponent: 0 + m_isVolumetricText: 0 + m_hasFontAssetChanged: 0 + m_baseMaterial: {fileID: 0} + m_maskOffset: {x: 0, y: 0, z: 0, w: 0} --- !u!1 &725768174149704672 GameObject: m_ObjectHideFlags: 0 @@ -2619,6 +2753,7 @@ RectTransform: - {fileID: 9136329686101699382} - {fileID: 3417785804709643581} - {fileID: 9059768842823625612} + - {fileID: 4243238183943022742} - {fileID: 5444497812387959711} - {fileID: 2959617767447010246} - {fileID: 5320398551607356139} @@ -2647,9 +2782,12 @@ MonoBehaviour: m_Script: {fileID: 11500000, guid: d24f73bbca3ae4c68b6ff8dde98d452c, type: 3} m_Name: m_EditorClassIdentifier: - audioController: {fileID: 0} + pushToTalkKey: 32 + audioRecorder: {fileID: 0} + audioPlayer: {fileID: 0} eventsText: {fileID: 4026597692773154623} conversationText: {fileID: 2825327695736519003} + vadEnergyText: {fileID: 2621590784734090863} pushToTalkButton: {fileID: 2178975479906841517} connectButton: {fileID: 8428551282407799376} pushToTalkButtonText: {fileID: 751159222626287672} diff --git a/Assets/Prefabs/RealtimeAPI.prefab b/Assets/Prefabs/RealtimeAPI.prefab index d620fdc..22b954a 100644 --- a/Assets/Prefabs/RealtimeAPI.prefab +++ b/Assets/Prefabs/RealtimeAPI.prefab @@ -10,9 +10,10 @@ GameObject: m_Component: - component: {fileID: 6875561144310920953} - component: {fileID: 7521549165741144525} - - component: {fileID: 68441443065667448} + - component: {fileID: 5778159951835551788} + - component: {fileID: 4277790966370773839} m_Layer: 0 - m_Name: AudioController + m_Name: Audio m_TagString: Untagged m_Icon: {fileID: 0} m_NavMeshLayer: 0 @@ -129,7 +130,7 @@ AudioSource: m_PreInfinity: 2 m_PostInfinity: 2 m_RotationOrder: 4 ---- !u!114 &68441443065667448 +--- !u!114 &5778159951835551788 MonoBehaviour: m_ObjectHideFlags: 0 m_CorrespondingSourceObject: {fileID: 0} @@ -138,15 +139,29 @@ MonoBehaviour: m_GameObject: {fileID: 5144007666192891506} m_Enabled: 1 m_EditorHideFlags: 0 - 
m_Script: {fileID: 11500000, guid: 9799235ff1b5f46148fca9921bdef04f, type: 3} + m_Script: {fileID: 11500000, guid: 158779b4cfa684eadbc49287121f2c45, type: 3} m_Name: m_EditorClassIdentifier: listeningMode: 0 sampleRate: 24000 - interruptResponseOnNewRecording: 1 - vadThreshold: 0.1 - vadSilenceDuration: 2 - currentVolumeLevel: 0 + interruptResponseOnNewRecording: 0 + vadEnergyThreshold: 2 + vadSilenceDuration: 1.5 + vadFreqThreshold: 1 + fftSampleSize: 1024 +--- !u!114 &4277790966370773839 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 5144007666192891506} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 11500000, guid: 10dcd261ab0cf4839a08d9c49ec28b50, type: 3} + m_Name: + m_EditorClassIdentifier: + sampleRate: 24000 fftSampleSize: 1024 --- !u!1 &8086852912749008722 GameObject: @@ -194,4 +209,5 @@ MonoBehaviour: m_Name: m_EditorClassIdentifier: apiKey: - audioController: {fileID: 68441443065667448} + audioPlayer: {fileID: 4277790966370773839} + audioRecorder: {fileID: 5778159951835551788} diff --git a/Assets/Scenes/DemoScene.unity b/Assets/Scenes/DemoScene.unity index b4d1e16..636b857 100644 --- a/Assets/Scenes/DemoScene.unity +++ b/Assets/Scenes/DemoScene.unity @@ -392,17 +392,6 @@ RectTransform: m_CorrespondingSourceObject: {fileID: 5337751238952555557, guid: fc531736d51444c8280c8d93c32c7c33, type: 3} m_PrefabInstance: {fileID: 6491429648409679553} m_PrefabAsset: {fileID: 0} ---- !u!114 &1157521912 stripped -MonoBehaviour: - m_CorrespondingSourceObject: {fileID: 68441443065667448, guid: 5ca84f8d7b82048e4ba92610cbf110e9, type: 3} - m_PrefabInstance: {fileID: 909518194653723852} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 0} - m_Enabled: 1 - m_EditorHideFlags: 0 - m_Script: {fileID: 11500000, guid: 9799235ff1b5f46148fca9921bdef04f, type: 3} - m_Name: - m_EditorClassIdentifier: --- !u!1 &1990000876 GameObject: 
m_ObjectHideFlags: 0 @@ -562,6 +551,28 @@ PrefabInstance: m_AddedGameObjects: [] m_AddedComponents: [] m_SourcePrefab: {fileID: 100100000, guid: 5ca84f8d7b82048e4ba92610cbf110e9, type: 3} +--- !u!114 &909518194653723853 stripped +MonoBehaviour: + m_CorrespondingSourceObject: {fileID: 5778159951835551788, guid: 5ca84f8d7b82048e4ba92610cbf110e9, type: 3} + m_PrefabInstance: {fileID: 909518194653723852} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 0} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 11500000, guid: 158779b4cfa684eadbc49287121f2c45, type: 3} + m_Name: + m_EditorClassIdentifier: +--- !u!114 &909518194653723854 stripped +MonoBehaviour: + m_CorrespondingSourceObject: {fileID: 4277790966370773839, guid: 5ca84f8d7b82048e4ba92610cbf110e9, type: 3} + m_PrefabInstance: {fileID: 909518194653723852} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 0} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 11500000, guid: 10dcd261ab0cf4839a08d9c49ec28b50, type: 3} + m_Name: + m_EditorClassIdentifier: --- !u!1001 &6491429648409679553 PrefabInstance: m_ObjectHideFlags: 0 @@ -859,9 +870,13 @@ PrefabInstance: value: 0 objectReference: {fileID: 0} - target: {fileID: 5488567229014774107, guid: fc531736d51444c8280c8d93c32c7c33, type: 3} - propertyPath: audioController + propertyPath: audioPlayer + value: + objectReference: {fileID: 909518194653723854} + - target: {fileID: 5488567229014774107, guid: fc531736d51444c8280c8d93c32c7c33, type: 3} + propertyPath: audioRecorder value: - objectReference: {fileID: 1157521912} + objectReference: {fileID: 909518194653723853} - target: {fileID: 5539276234762005852, guid: fc531736d51444c8280c8d93c32c7c33, type: 3} propertyPath: m_AnchorMax.y value: 0 diff --git a/Assets/Scripts/Audio.meta b/Assets/Scripts/Audio.meta new file mode 100644 index 0000000..733a130 --- /dev/null +++ b/Assets/Scripts/Audio.meta @@ -0,0 +1,8 @@ +fileFormatVersion: 2 +guid: 8cfabb7c8930b49a6817301df1cf4e1a +folderAsset: yes 
using System.Collections;
using System.Collections.Generic;
using UnityEngine;

/// <summary>
/// Buffers decoded PCM samples and plays them chunk by chunk through the
/// <see cref="AudioSource"/> on the same GameObject. While audio is playing, a spectrum
/// snapshot is published in <see cref="aiFrequencyData"/>.
/// </summary>
public class AudioPlayer : MonoBehaviour
{
    private AudioSource audioSource;
    private bool isPlayingAudio = false;
    private bool cancelPending = false;

    /// <summary>
    /// Spectrum of the audio currently being played, or null while nothing is playing.
    /// The same array instance is reused between frames.
    /// </summary>
    public float[] aiFrequencyData { get; private set; }

    private readonly List<float> audioBuffer = new List<float>();
    private AudioClip playbackClip;
    private float[] spectrumBuffer;

    /// <summary>Upper bound on the number of samples placed into a single playback clip.</summary>
    private const int BUFFER_SIZE = 48000;

    /// <summary>Buffered duration (seconds) above which a chunk is capped at BUFFER_SIZE.</summary>
    private const float MIN_BUFFER_TIME = 0.1f;

    public int sampleRate = 24000;
    public int fftSampleSize = 1024;

    private void Start()
    {
        audioSource = GetComponent<AudioSource>();
        audioSource.loop = false;
    }

    private void OnDestroy()
    {
        // Clips made with AudioClip.Create are Unity objects and are not garbage collected.
        ReleasePlaybackClip();
    }

    /// <summary>
    /// Enqueues PCM16 audio data for playback and starts the playback coroutine if it is idle.
    /// Data is dropped while a cancel is pending (see <see cref="ResetCancelPending"/>).
    /// </summary>
    /// <param name="pcmAudioData">Raw PCM16 bytes; null or empty input is ignored.</param>
    public void EnqueueAudioData(byte[] pcmAudioData)
    {
        if (cancelPending)
        {
            return;
        }

        if (pcmAudioData == null || pcmAudioData.Length == 0)
        {
            return;
        }

        float[] floatData = AudioProcessingUtils.ConvertPCM16ToFloat(pcmAudioData);
        audioBuffer.AddRange(floatData);

        if (!isPlayingAudio)
        {
            StartCoroutine(PlayAudioCoroutine());
        }
    }

    /// <summary>
    /// Drains the sample buffer into successive clips until the buffer stays empty and the
    /// source has stopped playing, then clears all playback state.
    /// </summary>
    private IEnumerator PlayAudioCoroutine()
    {
        isPlayingAudio = true;

        while (isPlayingAudio)
        {
            int buffered = audioBuffer.Count;

            if (buffered > 0 && buffered >= sampleRate * MIN_BUFFER_TIME)
            {
                // Enough audio is queued: play it, but never more than BUFFER_SIZE at once.
                yield return PlayChunk(Mathf.Min(BUFFER_SIZE, buffered));
            }
            else if (buffered > 0)
            {
                // Less than MIN_BUFFER_TIME is queued: flush the remainder as-is.
                yield return PlayChunk(buffered);
            }
            else if (!audioSource.isPlaying)
            {
                // Give late chunks a short grace period before declaring playback finished.
                yield return new WaitForSeconds(0.1f);
                if (audioBuffer.Count == 0)
                {
                    isPlayingAudio = false;
                }
            }
            else
            {
                yield return null;
            }
        }

        ClearAudioBuffer();
    }

    /// <summary>
    /// Moves the first <paramref name="sampleCount"/> buffered samples into a fresh clip,
    /// starts playing it and returns a wait instruction matching the clip's duration.
    /// </summary>
    private WaitForSeconds PlayChunk(int sampleCount)
    {
        float[] chunk = new float[sampleCount];
        audioBuffer.CopyTo(0, chunk, 0, sampleCount);
        audioBuffer.RemoveRange(0, sampleCount);

        // Destroy the previous clip first; otherwise every chunk leaks one AudioClip.
        ReleasePlaybackClip();

        playbackClip = AudioClip.Create("PlaybackClip", sampleCount, 1, sampleRate, false);
        playbackClip.SetData(chunk, 0);
        audioSource.clip = playbackClip;
        audioSource.Play();

        return new WaitForSeconds((float)sampleCount / sampleRate);
    }

    /// <summary>
    /// Detaches and destroys the clip created by the last <see cref="PlayChunk"/> call, if any.
    /// </summary>
    private void ReleasePlaybackClip()
    {
        if (playbackClip == null)
        {
            return;
        }

        if (audioSource != null && audioSource.clip == playbackClip)
        {
            audioSource.clip = null;
        }

        Destroy(playbackClip);
        playbackClip = null;
    }

    private void Update()
    {
        UpdateAIFrequencyData();
    }

    /// <summary>
    /// Refreshes <see cref="aiFrequencyData"/> from the audio source, or sets it to null
    /// when nothing is playing.
    /// </summary>
    private void UpdateAIFrequencyData()
    {
        if (audioSource == null || !audioSource.isPlaying)
        {
            aiFrequencyData = null;
            return;
        }

        // Reuse one array instead of allocating a new one every frame.
        if (spectrumBuffer == null || spectrumBuffer.Length != fftSampleSize)
        {
            spectrumBuffer = new float[fftSampleSize];
        }

        audioSource.GetSpectrumData(spectrumBuffer, 0, FFTWindow.BlackmanHarris);
        aiFrequencyData = spectrumBuffer;
    }

    /// <summary>
    /// Cancels audio playback: stops the playback coroutine, discards buffered samples and
    /// rejects further enqueued data until <see cref="ResetCancelPending"/> is called.
    /// </summary>
    public void CancelAudioPlayback()
    {
        cancelPending = true;
        StopAllCoroutines();
        ClearAudioBuffer();
    }

    /// <summary>
    /// Clears the audio buffer, stops the audio source and resets the playback state.
    /// </summary>
    public void ClearAudioBuffer()
    {
        audioBuffer.Clear();

        // The source is assigned in Start; tolerate calls that arrive before that.
        if (audioSource != null)
        {
            audioSource.Stop();
        }

        ReleasePlaybackClip();
        isPlayingAudio = false;
        aiFrequencyData = null;
    }

    /// <summary>
    /// Checks whether audio is playing or still waiting in the buffer.
    /// </summary>
    /// <returns>True if the source is playing or buffered samples remain.</returns>
    public bool IsAudioPlaying()
    {
        return (audioSource != null && audioSource.isPlaying) || audioBuffer.Count > 0;
    }

    /// <summary>
    /// Resets the cancel-pending flag so that newly enqueued audio is accepted again.
    /// </summary>
    public void ResetCancelPending()
    {
        cancelPending = false;
    }
}
// NOTE(review): nothing in this file appears to use Unity.VisualScripting; the directive is
// kept only so this change does not remove an existing import.
using Unity.VisualScripting;
using UnityEngine;

/// <summary>
/// Stateless audio helpers: PCM16 conversion, a radix-2 FFT, a simple energy-based voice
/// activity detector and a first-order high-pass filter.
/// </summary>
public static class AudioProcessingUtils
{
    /// <summary>
    /// Mean absolute amplitude of the most recent window, as computed by the last
    /// <see cref="SimpleVad"/> call that got past its early-return checks.
    /// </summary>
    public static float energyLast;

    /// <summary>
    /// Converts little-endian PCM16 audio data to floats in the range [-1, 1).
    /// </summary>
    /// <param name="pcmAudioData">Raw PCM16 bytes; a trailing odd byte is ignored.</param>
    /// <returns>One float per 16-bit sample; an empty array for null input.</returns>
    public static float[] ConvertPCM16ToFloat(byte[] pcmAudioData)
    {
        if (pcmAudioData == null)
        {
            return new float[0];
        }

        int sampleCount = pcmAudioData.Length / 2;
        float[] floatData = new float[sampleCount];

        for (int i = 0; i < sampleCount; i++)
        {
            // Assemble the sample explicitly as little-endian so decoding does not depend on
            // the host byte order and mirrors ConvertFloatToPCM16AndBase64.
            int low = pcmAudioData[i * 2];
            int high = pcmAudioData[i * 2 + 1];
            short sample = unchecked((short)(low | (high << 8)));
            floatData[i] = sample / 32768f;
        }

        return floatData;
    }

    /// <summary>
    /// Converts float audio data to a base64-encoded little-endian PCM16 string.
    /// </summary>
    /// <param name="audioData">Samples; values are clamped to [-1, 1] before scaling.</param>
    /// <returns>Base64 text of the PCM16 bytes; an empty string for null input.</returns>
    public static string ConvertFloatToPCM16AndBase64(float[] audioData)
    {
        if (audioData == null)
        {
            return string.Empty;
        }

        byte[] pcm16Audio = new byte[audioData.Length * 2];

        for (int i = 0; i < audioData.Length; i++)
        {
            short value = (short)(Mathf.Clamp(audioData[i], -1f, 1f) * short.MaxValue);
            pcm16Audio[i * 2] = (byte)(value & 0xFF);
            pcm16Audio[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
        }

        return System.Convert.ToBase64String(pcm16Audio);
    }

    /// <summary>
    /// Performs an in-place radix-2 FFT on real-valued audio data and writes the magnitude
    /// of the first half of the bins into <paramref name="spectrum"/>.
    /// </summary>
    /// <param name="data">
    /// Real input samples; the length must be a power of two. The array is used as scratch
    /// space and holds the real parts of the transform afterwards.
    /// </param>
    /// <param name="spectrum">
    /// Receives data.Length / 2 magnitudes; any remaining elements are left untouched.
    /// </param>
    /// <exception cref="System.ArgumentNullException">An argument is null.</exception>
    /// <exception cref="System.ArgumentException">
    /// The input length is not a power of two, or spectrum is shorter than data.Length / 2.
    /// </exception>
    public static void FFT(float[] data, float[] spectrum)
    {
        if (data == null)
        {
            throw new System.ArgumentNullException(nameof(data));
        }

        if (spectrum == null)
        {
            throw new System.ArgumentNullException(nameof(spectrum));
        }

        int n = data.Length;
        if (n == 0)
        {
            return;
        }

        if ((n & (n - 1)) != 0)
        {
            throw new System.ArgumentException("FFT input length must be a power of two.", nameof(data));
        }

        if (spectrum.Length < n / 2)
        {
            throw new System.ArgumentException("Spectrum buffer must hold at least data.Length / 2 values.", nameof(spectrum));
        }

        // The input is purely real, so the imaginary parts start at zero. They must still be
        // tracked through the butterflies for the magnitudes to be meaningful.
        float[] imag = new float[n];

        // Bit-reversal permutation (only the real parts need swapping at this point).
        int j = 0;
        for (int i = 0; i < n; i++)
        {
            if (i < j)
            {
                float swap = data[i];
                data[i] = data[j];
                data[j] = swap;
            }

            int k = n >> 1;
            while (k >= 1 && k <= j)
            {
                j -= k;
                k >>= 1;
            }

            j += k;
        }

        // Butterfly stages; integer doubling avoids rounding issues of a float log2.
        for (int size = 2; size <= n; size <<= 1)
        {
            int half = size >> 1;
            float stepRe = Mathf.Cos(Mathf.PI / half);
            float stepIm = -Mathf.Sin(Mathf.PI / half);
            float wRe = 1.0f;
            float wIm = 0.0f;

            for (int offset = 0; offset < half; offset++)
            {
                for (int i = offset; i < n; i += size)
                {
                    int ip = i + half;
                    float tRe = data[ip] * wRe - imag[ip] * wIm;
                    float tIm = data[ip] * wIm + imag[ip] * wRe;
                    data[ip] = data[i] - tRe;
                    imag[ip] = imag[i] - tIm;
                    data[i] += tRe;
                    imag[i] += tIm;
                }

                // Advance the twiddle factor by one step of the current stage.
                float nextRe = wRe * stepRe - wIm * stepIm;
                wIm = wRe * stepIm + wIm * stepRe;
                wRe = nextRe;
            }
        }

        for (int i = 0; i < n / 2; i++)
        {
            spectrum[i] = Mathf.Sqrt(data[i] * data[i] + imag[i] * imag[i]);
        }
    }

    /// <summary>
    /// Performs simple voice activity detection by comparing the mean absolute amplitude of
    /// the last <paramref name="lastSec"/> seconds with that of the whole buffer.
    ///
    /// credit @Macoron - source: https://raw.githubusercontent.com/Macoron/whisper.unity/275406258aca21fe7753cf0724a65f06fd464eea/Packages/com.whisper.unity/Runtime/Utils/AudioUtils.cs
    /// </summary>
    /// <param name="data">Samples to analyse; modified in place when freqThd is positive.</param>
    /// <param name="sampleRate">Sample rate of <paramref name="data"/> in Hz.</param>
    /// <param name="lastSec">Length in seconds of the trailing window to test.</param>
    /// <param name="vadThd">Factor the trailing energy must exceed relative to the overall energy.</param>
    /// <param name="freqThd">High-pass cutoff in Hz applied first; values &lt;= 0 skip the filter.</param>
    /// <returns>
    /// True when the trailing-window energy is greater than vadThd times the overall energy;
    /// false when data is null, the window is empty, or the window is not shorter than the buffer.
    /// </returns>
    public static bool SimpleVad(float[] data, int sampleRate, float lastSec, float vadThd, float freqThd)
    {
        if (data == null)
        {
            return false;
        }

        var nSamples = data.Length;
        var nSamplesLast = (int)(sampleRate * lastSec);

        // An empty window would divide by zero below; a window covering the whole buffer
        // leaves nothing to compare against.
        if (nSamplesLast <= 0 || nSamplesLast >= nSamples)
        {
            return false;
        }

        if (freqThd > 0.0f)
        {
            HighPassFilter(data, freqThd, sampleRate);
        }

        var energyAll = 0.0f;
        var windowEnergy = 0.0f;
        var windowStart = nSamples - nSamplesLast;

        for (var i = 0; i < nSamples; i++)
        {
            var magnitude = Mathf.Abs(data[i]);
            energyAll += magnitude;
            if (i >= windowStart)
            {
                windowEnergy += magnitude;
            }
        }

        energyAll /= nSamples;
        windowEnergy /= nSamplesLast;
        energyLast = windowEnergy;

        return windowEnergy > vadThd * energyAll;
    }

    /// <summary>
    /// Applies a first-order (RC) high-pass filter to audio data in place.
    ///
    /// credit @Macoron - source: https://raw.githubusercontent.com/Macoron/whisper.unity/275406258aca21fe7753cf0724a65f06fd464eea/Packages/com.whisper.unity/Runtime/Utils/AudioUtils.cs
    /// </summary>
    /// <param name="data">Samples to filter; null or empty input is left untouched.</param>
    /// <param name="cutoff">Cutoff frequency in Hz; values &lt;= 0 leave the data unchanged.</param>
    /// <param name="sampleRate">Sample rate in Hz; values &lt;= 0 leave the data unchanged.</param>
    public static void HighPassFilter(float[] data, float cutoff, int sampleRate)
    {
        if (data == null || data.Length == 0)
        {
            return;
        }

        if (cutoff <= 0.0f || sampleRate <= 0)
        {
            return;
        }

        var rc = 1.0f / (2.0f * Mathf.PI * cutoff);
        var dt = 1.0f / sampleRate;
        var alpha = rc / (rc + dt);

        // y[i] = alpha * (y[i-1] + x[i] - x[i-1]). The previous raw input is kept in a local
        // because data[i - 1] has already been overwritten with filtered output.
        var previousInput = data[0];
        var y = data[0];

        for (var i = 1; i < data.Length; i++)
        {
            var input = data[i];
            y = alpha * (y + input - previousInput);
            previousInput = input;
            data[i] = y;
        }
    }
}
using System;
using System.Collections.Generic;
using UnityEngine;

/// <summary>
/// Captures microphone audio either on demand (push-to-talk) or continuously with voice
/// activity detection, and publishes finished recordings as base64-encoded PCM16.
/// </summary>
public class AudioRecorder : MonoBehaviour
{
    public ListeningMode listeningMode = ListeningMode.PushToTalk;
    public int sampleRate = 24000;
    [SerializeField] private bool interruptResponseOnNewRecording = false;
    [SerializeField] private float vadEnergyThreshold = 0.5f;
    [SerializeField] private float vadSilenceDuration = 2f;
    private float vadLastSec = 1.0f;
    [SerializeField] private float vadFreqThreshold = 0.0f;
    private bool isVADRecording = false;
    private float silenceTimer = 0f;
    private int lastSamplePosition = 0;
    private AudioClip microphoneClip;
    private string microphoneDevice;

    /// <summary>Spectrum of the latest microphone samples, or null while not recording.</summary>
    public float[] frequencyData { get; private set; }

    public int fftSampleSize = 1024;
    private readonly List<float> audioDataBuffer = new List<float>();
    private int vadRecordingStartIndex = 0;

    /// <summary>Length in seconds of the rolling sample buffer used for VAD.</summary>
    private const int MAX_BUFFER_LENGTH_SEC = 10;

    /// <summary>Length in seconds of the clip requested from the microphone.</summary>
    private const int MIC_CLIP_LENGTH_SEC = 10;

    /// <summary>Raised with the base64-encoded PCM16 audio of a finished recording.</summary>
    public static event Action<string> OnAudioRecorded;
    public static event Action OnVADRecordingStarted;
    public static event Action OnVADRecordingEnded;
    private AudioPlayer audioPlayer;

    private void Start()
    {
        audioPlayer = GetComponent<AudioPlayer>();

        if (!TrySelectMicrophone())
        {
            return;
        }

        if (listeningMode == ListeningMode.VAD)
        {
            StartMicrophone();
        }
    }

    private void Update()
    {
        if (Microphone.IsRecording(microphoneDevice))
        {
            UpdateCurrentFrequency();
            if (listeningMode == ListeningMode.VAD)
            {
                PerformVAD();
            }
        }
        else
        {
            frequencyData = null;
        }
    }

    private void OnDestroy()
    {
        // Only release a device this component actually started.
        if (microphoneClip != null && Microphone.IsRecording(microphoneDevice))
        {
            Microphone.End(microphoneDevice);
        }
    }

    /// <summary>
    /// Selects the first available microphone device.
    /// </summary>
    /// <returns>False (after logging an error) when no microphone is connected.</returns>
    private bool TrySelectMicrophone()
    {
        string[] devices = Microphone.devices;
        if (devices.Length == 0)
        {
            Debug.LogError("No microphone devices found.");
            return false;
        }

        microphoneDevice = devices[0];
        return true;
    }

    /// <summary>
    /// Starts a one-shot (non-looping) recording, optionally interrupting current playback.
    /// Does nothing beyond the playback handling when no microphone is available.
    /// </summary>
    public void StartRecording()
    {
        if (audioPlayer != null)
        {
            if (interruptResponseOnNewRecording)
            {
                audioPlayer.CancelAudioPlayback();
            }

            audioPlayer.ResetCancelPending();
        }

        // Indexing Microphone.devices[0] without this check throws when no device exists.
        if (!TrySelectMicrophone())
        {
            return;
        }

        microphoneClip = Microphone.Start(microphoneDevice, false, MIC_CLIP_LENGTH_SEC, sampleRate);
        lastSamplePosition = 0;
    }

    /// <summary>
    /// Stops the current recording and raises <see cref="OnAudioRecorded"/> with the captured
    /// samples. The microphone is released even when nothing was captured.
    /// </summary>
    public void StopRecording()
    {
        if (Microphone.IsRecording(microphoneDevice))
        {
            // The position has to be read before the device is ended.
            int micPosition = Microphone.GetPosition(microphoneDevice);
            float[] audioData = null;

            if (microphoneClip != null && micPosition > 0)
            {
                audioData = new float[micPosition];
                microphoneClip.GetData(audioData, 0);
            }

            Microphone.End(microphoneDevice);

            if (audioData != null)
            {
                string base64AudioData = AudioProcessingUtils.ConvertFloatToPCM16AndBase64(audioData);
                OnAudioRecorded?.Invoke(base64AudioData);
            }
        }

        frequencyData = null;
    }

    /// <summary>
    /// Starts the microphone in loop mode (used for VAD). Does nothing when no microphone
    /// is available.
    /// </summary>
    public void StartMicrophone()
    {
        if (!TrySelectMicrophone())
        {
            return;
        }

        microphoneClip = Microphone.Start(microphoneDevice, true, MIC_CLIP_LENGTH_SEC, sampleRate);
        lastSamplePosition = 0;
    }

    /// <summary>
    /// Stops the microphone.
    /// </summary>
    public void StopMicrophone()
    {
        if (Microphone.IsRecording(microphoneDevice))
        {
            Microphone.End(microphoneDevice);
        }

        frequencyData = null;
    }

    /// <summary>
    /// Pulls the samples recorded since the previous call, updates the FFT frequency bands
    /// for VAD/visualization and appends the samples to the rolling buffer.
    /// </summary>
    private void UpdateCurrentFrequency()
    {
        if (microphoneClip == null)
        {
            return;
        }

        int micPosition = Microphone.GetPosition(microphoneDevice);
        int clipSamples = microphoneClip.samples;
        int sampleDiff = micPosition - lastSamplePosition;

        if (sampleDiff < 0)
        {
            // The write position wrapped around the end of the looping clip.
            sampleDiff += clipSamples;
        }

        if (sampleDiff == 0)
        {
            return;
        }

        float[] samples = new float[sampleDiff];
        int startPosition = lastSamplePosition;

        if (startPosition + sampleDiff <= clipSamples)
        {
            microphoneClip.GetData(samples, startPosition);
        }
        else
        {
            // The new data straddles the wrap point: read the tail and the head separately.
            int samplesToEnd = clipSamples - startPosition;
            int samplesFromStart = sampleDiff - samplesToEnd;
            float[] tail = new float[samplesToEnd];
            float[] head = new float[samplesFromStart];
            microphoneClip.GetData(tail, startPosition);
            microphoneClip.GetData(head, 0);
            Array.Copy(tail, 0, samples, 0, samplesToEnd);
            Array.Copy(head, 0, samples, samplesToEnd, samplesFromStart);
        }

        // Feed the most recent samples (zero-padded at the end if fewer are available) to the FFT.
        int fftSize = fftSampleSize;
        float[] fftSamples = new float[fftSize];
        int copyLength = Mathf.Min(samples.Length, fftSize);
        Array.Copy(samples, samples.Length - copyLength, fftSamples, 0, copyLength);
        frequencyData = new float[fftSize];
        AudioProcessingUtils.FFT(fftSamples, frequencyData);

        lastSamplePosition = micPosition;
        audioDataBuffer.AddRange(samples);

        // Keep only the newest MAX_BUFFER_LENGTH_SEC seconds and shift the VAD start marker.
        int maxBufferLengthSamples = sampleRate * MAX_BUFFER_LENGTH_SEC;
        if (audioDataBuffer.Count > maxBufferLengthSamples)
        {
            int excessSamples = audioDataBuffer.Count - maxBufferLengthSamples;
            audioDataBuffer.RemoveRange(0, excessSamples);
            vadRecordingStartIndex = Mathf.Max(0, vadRecordingStartIndex - excessSamples);
        }
    }

    /// <summary>
    /// Checks if speech is detected (AudioProcessingUtils.SimpleVad) and starts a VAD
    /// recording, or stops it once silence has lasted for vadSilenceDuration seconds.
    /// </summary>
    private void PerformVAD()
    {
        if (!Microphone.IsRecording(microphoneDevice))
        {
            return;
        }

        // SimpleVad may filter its input in place, so it is handed a copy of the buffer.
        bool hasSpeech = AudioProcessingUtils.SimpleVad(audioDataBuffer.ToArray(), sampleRate, vadLastSec, vadEnergyThreshold, vadFreqThreshold);

        if (hasSpeech)
        {
            silenceTimer = 0f;
            if (!isVADRecording)
            {
                StartVADRecording();
            }
        }
        else if (isVADRecording)
        {
            silenceTimer += Time.deltaTime;
            if (silenceTimer >= vadSilenceDuration)
            {
                StopVADRecording();
            }
        }
    }

    /// <summary>
    /// Starts a VAD recording: optionally cancels the current response playback and marks
    /// the current end of the buffer as the start of the recording.
    /// </summary>
    private void StartVADRecording()
    {
        if (audioPlayer != null)
        {
            if (interruptResponseOnNewRecording && !isVADRecording)
            {
                audioPlayer.CancelAudioPlayback();
            }

            audioPlayer.ResetCancelPending();
        }

        isVADRecording = true;
        silenceTimer = 0f;
        vadRecordingStartIndex = audioDataBuffer.Count;
        OnVADRecordingStarted?.Invoke();
    }

    /// <summary>
    /// Stops the VAD recording and publishes the samples buffered since it started.
    /// </summary>
    private void StopVADRecording()
    {
        if (isVADRecording)
        {
            int recordingLength = audioDataBuffer.Count - vadRecordingStartIndex;
            if (recordingLength > 0)
            {
                float[] audioData = audioDataBuffer.GetRange(vadRecordingStartIndex, recordingLength).ToArray();
                string base64AudioData = AudioProcessingUtils.ConvertFloatToPCM16AndBase64(audioData);
                OnAudioRecorded?.Invoke(base64AudioData);
            }
        }

        isVADRecording = false;
        silenceTimer = 0f;
        OnVADRecordingEnded?.Invoke();
    }
}
a/Assets/Scripts/AudioController.cs +++ /dev/null @@ -1,374 +0,0 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using UnityEngine; - -public class AudioController : MonoBehaviour -{ - public ListeningMode listeningMode = ListeningMode.PushToTalk; - public int sampleRate = 24000; - [SerializeField] private bool interruptResponseOnNewRecording = false; - [SerializeField] private float vadThreshold = 0.005f; - [SerializeField] private float vadSilenceDuration = 2f; - private bool isVADRecording = false; - private float silenceTimer = 0f; - private int lastSamplePosition = 0; - private AudioClip microphoneClip; - private AudioSource audioSource; - private bool isPlayingAudio = false; - private bool cancelPending = false; - private string microphoneDevice; - public float currentVolumeLevel = 0f; - public float[] frequencyData { get; private set; } - public int fftSampleSize = 1024; - public float[] aiFrequencyData { get; private set; } - public static event Action OnAudioRecorded; - public static event Action OnVADRecordingStarted; - public static event Action OnVADRecordingEnded; - private List audioBuffer = new List(); - private AudioClip playbackClip; - private const int BUFFER_SIZE = 48000; - private const float MIN_BUFFER_TIME = 0.1f; - - private void Start() - { - audioSource = GetComponent(); - audioSource.loop = false; - if (Microphone.devices.Length == 0) - { - Debug.LogError("No microphone devices found."); - return; - } - microphoneDevice = Microphone.devices[0]; - if (listeningMode == ListeningMode.VAD) - { - StartMicrophone(); - } - } - - private void Update() - { - if (Microphone.IsRecording(microphoneDevice)) - { - UpdateCurrentVolumeAndFrequency(); - if (listeningMode == ListeningMode.VAD) - { - PerformVAD(); - } - } - else - { - frequencyData = null; - } - UpdateAIFrequencyData(); - } - - public void StartRecording() - { - if (interruptResponseOnNewRecording) CancelAudioPlayback(); - if (Microphone.devices.Length == 0) 
return; - ResetCancelPending(); - microphoneDevice = Microphone.devices[0]; - microphoneClip = Microphone.Start(microphoneDevice, false, 10, sampleRate); - lastSamplePosition = 0; - } - - public void StopRecording() - { - if (Microphone.IsRecording(microphoneDevice)) - { - int micPosition = Microphone.GetPosition(microphoneDevice); - int samples = micPosition; - float[] audioData = new float[samples]; - if (microphoneClip != null && micPosition != 0) - { - microphoneClip.GetData(audioData, 0); - Microphone.End(microphoneDevice); - string base64AudioData = ConvertFloatToPCM16AndBase64(audioData); - OnAudioRecorded?.Invoke(base64AudioData); - } - } - frequencyData = null; - } - - public void StartMicrophone() - { - if (Microphone.devices.Length == 0) return; - microphoneDevice = Microphone.devices[0]; - microphoneClip = Microphone.Start(microphoneDevice, true, 10, sampleRate); - lastSamplePosition = 0; - } - - public void StopMicrophone() - { - if (Microphone.IsRecording(microphoneDevice)) Microphone.End(microphoneDevice); - frequencyData = null; - } - - private void UpdateCurrentVolumeAndFrequency() - { - int micPosition = Microphone.GetPosition(microphoneDevice); - int sampleDiff = micPosition - lastSamplePosition; - if (sampleDiff < 0) - { - sampleDiff += microphoneClip.samples; - } - if (sampleDiff == 0) - { - return; - } - float[] samples = new float[sampleDiff]; - int startPosition = lastSamplePosition; - if (startPosition + sampleDiff <= microphoneClip.samples) - { - microphoneClip.GetData(samples, startPosition); - } - else - { - int samplesToEnd = microphoneClip.samples - startPosition; - int samplesFromStart = sampleDiff - samplesToEnd; - float[] samplesPart1 = new float[samplesToEnd]; - float[] samplesPart2 = new float[samplesFromStart]; - microphoneClip.GetData(samplesPart1, startPosition); - microphoneClip.GetData(samplesPart2, 0); - Array.Copy(samplesPart1, 0, samples, 0, samplesToEnd); - Array.Copy(samplesPart2, 0, samples, samplesToEnd, 
samplesFromStart); - } - float maxVolume = 0f; - foreach (var sample in samples) - { - float absSample = Mathf.Abs(sample); - if (absSample > maxVolume) - { - maxVolume = absSample; - } - } - currentVolumeLevel = maxVolume; - int fftSize = fftSampleSize; - float[] fftSamples = new float[fftSize]; - int copyLength = Mathf.Min(samples.Length, fftSize); - Array.Copy(samples, samples.Length - copyLength, fftSamples, 0, copyLength); - frequencyData = new float[fftSize]; - FFT(fftSamples, frequencyData); - lastSamplePosition = micPosition; - } - - - private void PerformVAD() - { - if (!Microphone.IsRecording(microphoneDevice)) return; - - if (currentVolumeLevel > vadThreshold && !isVADRecording) - { - silenceTimer = 0f; - StartVADRecording(); - } - else if (isVADRecording) - { - silenceTimer += Time.deltaTime; - if (silenceTimer >= vadSilenceDuration) StopVADRecording(); - } - } - - private void StartVADRecording() - { - if (interruptResponseOnNewRecording && !isVADRecording) CancelAudioPlayback(); - ResetCancelPending(); - isVADRecording = true; - silenceTimer = 0f; - microphoneClip = Microphone.Start(microphoneDevice, false, 10, sampleRate); - OnVADRecordingStarted?.Invoke(); - } - - private void StopVADRecording() - { - if (Microphone.IsRecording(microphoneDevice)) - { - int micPosition = Microphone.GetPosition(microphoneDevice); - float[] audioData = new float[micPosition]; - microphoneClip.GetData(audioData, 0); - string base64AudioData = ConvertFloatToPCM16AndBase64(audioData); - OnAudioRecorded?.Invoke(base64AudioData); - } - - isVADRecording = false; - silenceTimer = 0f; - OnVADRecordingEnded?.Invoke(); - - Microphone.End(microphoneDevice); - StartMicrophone(); - } - - - public void EnqueueAudioData(byte[] pcmAudioData) - { - if (cancelPending) return; - - float[] floatData = ConvertPCM16ToFloat(pcmAudioData); - audioBuffer.AddRange(floatData); - - if (!isPlayingAudio) - { - StartCoroutine(PlayAudioCoroutine()); - } - } - - private IEnumerator 
PlayAudioCoroutine() - { - isPlayingAudio = true; - - while (isPlayingAudio) - { - if (audioBuffer.Count >= sampleRate * MIN_BUFFER_TIME) - { - int samplesToPlay = Mathf.Min(BUFFER_SIZE, audioBuffer.Count); - float[] audioChunk = new float[samplesToPlay]; - audioBuffer.CopyTo(0, audioChunk, 0, samplesToPlay); - audioBuffer.RemoveRange(0, samplesToPlay); - - playbackClip = AudioClip.Create("PlaybackClip", samplesToPlay, 1, sampleRate, false); - playbackClip.SetData(audioChunk, 0); - - audioSource.clip = playbackClip; - audioSource.Play(); - - yield return new WaitForSeconds((float)samplesToPlay / sampleRate); - } - else if (audioBuffer.Count > 0) - { - float[] audioChunk = audioBuffer.ToArray(); - audioBuffer.Clear(); - - playbackClip = AudioClip.Create("PlaybackClip", audioChunk.Length, 1, sampleRate, false); - playbackClip.SetData(audioChunk, 0); - - audioSource.clip = playbackClip; - audioSource.Play(); - - yield return new WaitForSeconds((float)audioChunk.Length / sampleRate); - } - else if (audioBuffer.Count == 0 && !audioSource.isPlaying) - { - yield return new WaitForSeconds(0.1f); - if (audioBuffer.Count == 0) isPlayingAudio = false; - } - else - { - yield return null; - } - } - - ClearAudioBuffer(); - } - - private void UpdateAIFrequencyData() - { - if (!audioSource.isPlaying) - { - aiFrequencyData = null; - return; - } - int fftSize = fftSampleSize; - aiFrequencyData = new float[fftSize]; - audioSource.GetSpectrumData(aiFrequencyData, 0, FFTWindow.BlackmanHarris); - } - - public void CancelAudioPlayback() - { - cancelPending = true; - StopAllCoroutines(); - ClearAudioBuffer(); - } - - private void ClearAudioBuffer() - { - audioBuffer.Clear(); - audioSource.Stop(); - isPlayingAudio = false; - aiFrequencyData = null; - } - - public bool IsAudioPlaying() - { - return audioSource.isPlaying || audioBuffer.Count > 0; - } - - private float[] ConvertPCM16ToFloat(byte[] pcmAudioData) - { - int length = pcmAudioData.Length / 2; - float[] floatData = new 
float[length]; - for (int i = 0; i < length; i++) - { - short sample = BitConverter.ToInt16(pcmAudioData, i * 2); - floatData[i] = sample / 32768f; - } - return floatData; - } - - private string ConvertFloatToPCM16AndBase64(float[] audioData) - { - byte[] pcm16Audio = new byte[audioData.Length * 2]; - for (int i = 0; i < audioData.Length; i++) - { - short value = (short)(Mathf.Clamp(audioData[i], -1f, 1f) * short.MaxValue); - pcm16Audio[i * 2] = (byte)(value & 0xFF); - pcm16Audio[i * 2 + 1] = (byte)((value >> 8) & 0xFF); - } - return Convert.ToBase64String(pcm16Audio); - } - - public void ResetCancelPending() - { - cancelPending = false; - } - - private void FFT(float[] data, float[] spectrum) - { - int n = data.Length; - int m = (int)Mathf.Log(n, 2); - int j = 0; - for (int i = 0; i < n; i++) - { - if (i < j) - { - float temp = data[i]; - data[i] = data[j]; - data[j] = temp; - } - int k = n >> 1; - while (k >= 1 && k <= j) - { - j -= k; - k >>= 1; - } - j += k; - } - for (int l = 1; l <= m; l++) - { - int le = 1 << l; - int le2 = le >> 1; - float ur = 1.0f; - float ui = 0.0f; - float sr = Mathf.Cos(Mathf.PI / le2); - float si = -Mathf.Sin(Mathf.PI / le2); - for (int j1 = 0; j1 < le2; j1++) - { - for (int i = j1; i < n; i += le) - { - int ip = i + le2; - float tr = data[ip] * ur - 0 * ui; - float ti = data[ip] * ui + 0 * ur; - data[ip] = data[i] - tr; - data[i] += tr; - } - float temp = ur; - ur = temp * sr - ui * si; - ui = temp * si + ui * sr; - } - } - for (int i = 0; i < n / 2; i++) - { - spectrum[i] = Mathf.Sqrt(data[i] * data[i] + data[n - i - 1] * data[n - i - 1]); - } - } -} diff --git a/Assets/Scripts/DemoIntegration.cs b/Assets/Scripts/DemoIntegration.cs index 87cdcbe..a6d7985 100644 --- a/Assets/Scripts/DemoIntegration.cs +++ b/Assets/Scripts/DemoIntegration.cs @@ -5,9 +5,12 @@ public class DemoIntegration : MonoBehaviour { - [SerializeField] private AudioController audioController; + [SerializeField] private KeyCode pushToTalkKey = KeyCode.Space; + 
[SerializeField] private AudioRecorder audioRecorder; + [SerializeField] private AudioPlayer audioPlayer; [SerializeField] private TextMeshProUGUI eventsText; [SerializeField] private TextMeshProUGUI conversationText; + [SerializeField] private TextMeshProUGUI vadEnergyText; [SerializeField] private Button pushToTalkButton; [SerializeField] private Button connectButton; [SerializeField] private TextMeshProUGUI pushToTalkButtonText; @@ -30,7 +33,6 @@ public class DemoIntegration : MonoBehaviour List conversationMessages = new List(); string currentConversationLine = ""; - float[] userBarAmplitudes; float[] aiBarAmplitudes; float barSmoothingSpeed = 5f; @@ -46,8 +48,8 @@ private void Start() RealtimeAPIWrapper.OnTranscriptReceived += OnTranscriptReceived; RealtimeAPIWrapper.OnResponseCreated += OnResponseCreated; - AudioController.OnVADRecordingStarted += OnVADRecordingStarted; - AudioController.OnVADRecordingEnded += OnVADRecordingEnded; + AudioRecorder.OnVADRecordingStarted += OnVADRecordingStarted; + AudioRecorder.OnVADRecordingEnded += OnVADRecordingEnded; manualListeningButton.onClick.AddListener(OnManualListeningMode); vadListeningButton.onClick.AddListener(OnVADListeningMode); @@ -61,27 +63,24 @@ private void Start() private void Update() { - if (audioController.listeningMode == ListeningMode.PushToTalk) + if (audioRecorder.listeningMode == ListeningMode.PushToTalk) { - if (Input.GetKeyDown(KeyCode.Space) && !isRecording) - { - StartRecording(); - } - if (Input.GetKeyUp(KeyCode.Space) && isRecording) - { - StopRecording(); - } + if (Input.GetKeyDown(pushToTalkKey) && !isRecording) StartRecording(); + if (Input.GetKeyUp(pushToTalkKey) && isRecording) StopRecording(); } UpdateFrequencyBars(); UpdateAIFrequencyBars(); } + /// + /// updates frequency bars for user audio visualization + /// private void UpdateFrequencyBars() { if (frequencyBars == null || frequencyBars.Length == 0) return; - if (!isRecording && audioController.listeningMode == 
ListeningMode.PushToTalk) + if (!isRecording && audioRecorder.listeningMode == ListeningMode.PushToTalk) { for (int i = 0; i < frequencyBars.Length; i++) { @@ -91,7 +90,7 @@ private void UpdateFrequencyBars() return; } - float[] spectrum = audioController.frequencyData; + float[] spectrum = audioRecorder.frequencyData; if (spectrum == null || spectrum.Length == 0) { for (int i = 0; i < frequencyBars.Length; i++) @@ -102,8 +101,8 @@ private void UpdateFrequencyBars() return; } - float sampleRate = audioController.sampleRate; - int fftSize = audioController.fftSampleSize; + float sampleRate = audioRecorder.sampleRate; + int fftSize = audioRecorder.fftSampleSize; float nyquist = sampleRate / 2f; float freqPerBin = nyquist / fftSize; float[] freqBands = new float[] { 85f, 160f, 255f, 350f, 500f, 1000f, 2000f, 3000f, 4000f, nyquist }; @@ -124,13 +123,20 @@ private void UpdateFrequencyBars() userBarAmplitudes[i] = Mathf.Lerp(userBarAmplitudes[i], amplitude, Time.deltaTime * barSmoothingSpeed); frequencyBars[i].fillAmount = userBarAmplitudes[i]; } + + + if (audioRecorder.listeningMode == ListeningMode.VAD) + vadEnergyText.text = "nrg: " + AudioProcessingUtils.energyLast.ToString("0.0000E+0"); } + /// + /// updates frequency bars for ai audio visualization + /// private void UpdateAIFrequencyBars() { if (aiFrequencyBars == null || aiFrequencyBars.Length == 0) return; - float[] spectrum = audioController.aiFrequencyData; + float[] spectrum = audioPlayer.aiFrequencyData; if (spectrum == null || spectrum.Length == 0) { for (int i = 0; i < aiFrequencyBars.Length; i++) @@ -141,8 +147,8 @@ private void UpdateAIFrequencyBars() return; } - float sampleRate = audioController.sampleRate; - int fftSize = audioController.fftSampleSize; + float sampleRate = audioPlayer.sampleRate; + int fftSize = audioPlayer.fftSampleSize; float nyquist = sampleRate / 2f; float freqPerBin = nyquist / fftSize; float[] freqBands = new float[] { 85f, 160f, 255f, 350f, 500f, 1000f, 2000f, 3000f, 4000f, 
nyquist }; @@ -165,43 +171,48 @@ private void UpdateAIFrequencyBars() } } + /// + /// handles push-to-talk button press + /// private void OnRecordButtonPressed() { - if (audioController.listeningMode == ListeningMode.PushToTalk) + if (audioRecorder.listeningMode == ListeningMode.PushToTalk) { - if (isRecording) - { - StopRecording(); - } - else - { - StartRecording(); - } + if (isRecording) StopRecording(); + else StartRecording(); } } + /// + /// starts audio recording + /// private void StartRecording() { - audioController.StartRecording(); + audioRecorder.StartRecording(); isRecording = true; AddLogMessage("recording..."); UpdateRecordButton(); } + /// + /// stops audio recording + /// private void StopRecording() { - audioController.StopRecording(); + audioRecorder.StopRecording(); isRecording = false; AddLogMessage("recording stopped. sending audio..."); UpdateRecordButton(); } - private void OnVADRecordingStarted() => AddLogMessage("VAD recording started..."); - private void OnVADRecordingEnded() => AddLogMessage("VAD recording ended."); + + /// + /// updates the record button UI + /// private void UpdateRecordButton() { - if (audioController.listeningMode == ListeningMode.PushToTalk) + if (audioRecorder.listeningMode == ListeningMode.PushToTalk) { pushToTalkButton.interactable = true; if (isRecording) @@ -225,46 +236,57 @@ private void UpdateRecordButton() } } + /// + /// activates manual listening mode + /// private void OnManualListeningMode() { - AddLogMessage("Manual listening mode activated (push to talk / spacebar)."); + AddLogMessage("manual listening mode activated (push to talk / spacebar)."); - audioController.listeningMode = ListeningMode.PushToTalk; - audioController.StopMicrophone(); + audioRecorder.listeningMode = ListeningMode.PushToTalk; + audioRecorder.StopMicrophone(); UpdateListeningModeButtons(); UpdateRecordButton(); + + vadEnergyText.text = ""; } + /// + /// activates VAD listening mode + /// private void OnVADListeningMode() { 
AddLogMessage("VAD listening mode activated (super basic client-side vad, threshold-based)."); - audioController.listeningMode = ListeningMode.VAD; - audioController.StartMicrophone(); - if (isRecording) - { - StopRecording(); - } + audioRecorder.listeningMode = ListeningMode.VAD; + audioRecorder.StartMicrophone(); + if (isRecording) StopRecording(); UpdateListeningModeButtons(); UpdateRecordButton(); } + /// + /// updates listening mode buttons UI + /// private void UpdateListeningModeButtons() { - if (audioController.listeningMode == ListeningMode.PushToTalk) + if (audioRecorder.listeningMode == ListeningMode.PushToTalk) { SetButtonActive(manualListeningButton, manualListeningButtonText); SetButtonInactive(vadListeningButton, vadListeningButtonText); } - else if (audioController.listeningMode == ListeningMode.VAD) + else if (audioRecorder.listeningMode == ListeningMode.VAD) { SetButtonActive(vadListeningButton, vadListeningButtonText); SetButtonInactive(manualListeningButton, manualListeningButtonText); } } + /// + /// sets a button to active state + /// private void SetButtonActive(Button button, TextMeshProUGUI buttonText) { buttonText.color = Color.white; @@ -275,6 +297,9 @@ private void SetButtonActive(Button button, TextMeshProUGUI buttonText) button.colors = cb; } + /// + /// sets a button to inactive state + /// private void SetButtonInactive(Button button, TextMeshProUGUI buttonText) { buttonText.color = new Color(50f / 255f, 50f / 255f, 50f / 255f); @@ -285,7 +310,9 @@ private void SetButtonInactive(Button button, TextMeshProUGUI buttonText) button.colors = cb; } - + /// + /// adds a message to the log + /// private void AddLogMessage(string message) { if (logMessages.Count >= logCountLimit) logMessages.RemoveAt(0); @@ -296,6 +323,9 @@ private void AddLogMessage(string message) UpdateEventsText(); } + /// + /// updates the events text UI (line-idx based color-fade) + /// private void UpdateEventsText() { eventsText.text = ""; @@ -307,6 +337,9 @@ private 
void UpdateEventsText() } } + /// + /// called when new websocket is connected - changes UI button states + /// private void OnWebSocketConnected() { AddLogMessage("connection established."); @@ -319,6 +352,9 @@ private void OnWebSocketConnected() connectButton.colors = cb; } + /// + /// called when new websocket is closed - changes UI button states + /// private void OnWebSocketClosed() { AddLogMessage("connection closed."); @@ -331,11 +367,11 @@ private void OnWebSocketClosed() if (connectButton) connectButton.colors = cb; } - private void OnSessionCreated() - { - AddLogMessage("session created."); - } + + /// + /// called when new conversation item is created - cleans current transcript line for new chunks + /// private void OnConversationItemCreated() { AddLogMessage("conversation item created."); @@ -351,11 +387,10 @@ private void OnConversationItemCreated() } - private void OnResponseDone() - { - AddLogMessage("response done."); - } + /// + /// called when new transcript chunk is received + /// private void OnTranscriptReceived(string transcriptPart) { if (string.IsNullOrEmpty(currentConversationLine)) @@ -369,7 +404,9 @@ private void OnTranscriptReceived(string transcriptPart) UpdateConversationTextInPlace(); } - + /// + /// updates the conversation text in place + /// private void UpdateConversationTextInPlace() { conversationText.text = ""; @@ -384,6 +421,9 @@ private void UpdateConversationTextInPlace() conversationText.text += $"{currentConversationLine}"; } + /// + /// updates the conversation text UI + /// private void UpdateConversationText() { conversationText.text = ""; @@ -396,8 +436,9 @@ private void UpdateConversationText() } } - private void OnResponseCreated() - { - AddLogMessage("response created."); - } + private void OnSessionCreated() => AddLogMessage("session created."); + private void OnResponseCreated() => AddLogMessage("response created."); + private void OnResponseDone() => AddLogMessage("response done."); + private void 
OnVADRecordingStarted() => AddLogMessage("VAD recording started..."); + private void OnVADRecordingEnded() => AddLogMessage("VAD recording ended."); } diff --git a/Assets/Scripts/RealtimeAPIWrapper.cs b/Assets/Scripts/RealtimeAPIWrapper.cs index a842b92..c95dda9 100644 --- a/Assets/Scripts/RealtimeAPIWrapper.cs +++ b/Assets/Scripts/RealtimeAPIWrapper.cs @@ -10,11 +10,13 @@ public class RealtimeAPIWrapper : MonoBehaviour { private ClientWebSocket ws; - public string apiKey = "YOUR_API_KEY"; - public AudioController audioController; + [SerializeField] string apiKey = "YOUR_API_KEY"; + public AudioPlayer audioPlayer; + public AudioRecorder audioRecorder; private StringBuilder messageBuffer = new StringBuilder(); private StringBuilder transcriptBuffer = new StringBuilder(); private bool isResponseInProgress = false; + public static event Action OnWebSocketConnected; public static event Action OnWebSocketClosed; public static event Action OnSessionCreated; @@ -31,8 +33,13 @@ public class RealtimeAPIWrapper : MonoBehaviour public static event Action OnResponseContentPartAdded; public static event Action OnResponseCancelled; - private void Start() => AudioController.OnAudioRecorded += SendAudioToAPI; + private void Start() => AudioRecorder.OnAudioRecorded += SendAudioToAPI; + private void OnApplicationQuit() => DisposeWebSocket(); + + /// + /// connects or disconnects websocket when button is pressed + /// public async void ConnectWebSocketButton() { if (ws != null) DisposeWebSocket(); @@ -43,6 +50,9 @@ public async void ConnectWebSocketButton() } } + /// + /// establishes websocket connection to the api + /// private async Task ConnectWebSocket() { try @@ -56,10 +66,13 @@ private async Task ConnectWebSocket() } catch (Exception e) { - Debug.LogError("WebSocket connection failed: " + e.Message); + Debug.LogError("websocket connection failed: " + e.Message); } } + /// + /// sends a cancel event to api if response is in progress + /// private async void SendCancelEvent() { 
if (ws.State == WebSocketState.Open && isResponseInProgress) @@ -76,9 +89,14 @@ private async void SendCancelEvent() } } + /// + /// sends recorded audio to the api + /// private async void SendAudioToAPI(string base64AudioData) { - if (isResponseInProgress) SendCancelEvent(); + if (isResponseInProgress) + SendCancelEvent(); + if (ws != null && ws.State == WebSocketState.Open) { var eventMessage = new @@ -114,6 +132,9 @@ private async void SendAudioToAPI(string base64AudioData) } } + /// + /// receives messages from websocket and handles them + /// private async Task ReceiveMessages() { var buffer = new byte[1024 * 128]; @@ -126,7 +147,7 @@ private async Task ReceiveMessages() if (ws.State == WebSocketState.CloseReceived) { - Debug.Log("WebSocket close received, disposing current WS instance."); + Debug.Log("websocket close received, disposing current ws instance."); DisposeWebSocket(); return; } @@ -143,24 +164,23 @@ private async Task ReceiveMessages() JObject eventMessage = JObject.Parse(fullMessage); string messageType = eventMessage["type"]?.ToString(); - if (messageHandlers.TryGetValue(messageType, out var handler)) - { - handler(eventMessage); - } - else - { - Debug.Log("Unhandled message type: " + messageType); - } + if (messageHandlers.TryGetValue(messageType, out var handler)) handler(eventMessage); + + else Debug.Log("unhandled message type: " + messageType); + } catch (Exception ex) { - Debug.LogError("Error parsing JSON: " + ex.Message); + Debug.LogError("error parsing json: " + ex.Message); } } } } } + /// + /// returns dictionary of message handlers for different message types + /// private Dictionary> GetMessageHandlers() { return new Dictionary> @@ -182,16 +202,22 @@ private Dictionary> GetMessageHandlers() }; } + /// + /// handles incoming audio delta messages from api + /// private void HandleAudioDelta(JObject eventMessage) { string base64AudioData = eventMessage["delta"]?.ToString(); if (!string.IsNullOrEmpty(base64AudioData)) { byte[] 
pcmAudioData = Convert.FromBase64String(base64AudioData); - audioController.EnqueueAudioData(pcmAudioData); + audioPlayer.EnqueueAudioData(pcmAudioData); } } + /// + /// handles incoming transcript delta messages from api + /// private void HandleTranscriptDelta(JObject eventMessage) { string transcriptPart = eventMessage["delta"]?.ToString(); @@ -202,15 +228,21 @@ private void HandleTranscriptDelta(JObject eventMessage) } } + /// + /// handles response.done message - checks if audio is still playing + /// private void HandleResponseDone(JObject eventMessage) { - if (!audioController.IsAudioPlaying()) + if (!audioPlayer.IsAudioPlaying()) { isResponseInProgress = false; } OnResponseDone?.Invoke(); } + /// + /// handles response.created message - resets transcript buffer + /// private void HandleResponseCreated(JObject eventMessage) { transcriptBuffer.Clear(); @@ -218,15 +250,21 @@ private void HandleResponseCreated(JObject eventMessage) OnResponseCreated?.Invoke(); } + /// + /// handles error messages from api + /// private void HandleError(JObject eventMessage) { string errorMessage = eventMessage["error"]?["message"]?.ToString(); if (!string.IsNullOrEmpty(errorMessage)) { - Debug.LogError("OpenAI error: " + errorMessage); + Debug.LogError("openai error: " + errorMessage); } } + /// + /// disposes the websocket connection + /// private async void DisposeWebSocket() { if (ws != null && (ws.State == WebSocketState.Open || ws.State == WebSocketState.CloseReceived)) @@ -238,5 +276,4 @@ private async void DisposeWebSocket() } } - private void OnApplicationQuit() => DisposeWebSocket(); }