Skip to content

Commit fd14bc5

Browse files
StephenHodgsonMylan719Milan Mikuštomkail
authored
com.rest.elevenlabs 3.4.0 (#100)
- com.utilities.rest -> 3.3.0 - com.utilities.encoder.ogg -> 4.0.2 - Added additional request properties for TextToSpeechRequest - `previous_text`, `next_text`, `previous_request_ids`, `next_request_ids`, `languageCode`, `withTimestamps` - `cacheFormat` which can be `None`, `Wav`, or `Ogg` - Added support for transcription timestamps by @tomkail - Added support for language code in TextToSpeechRequest @Mylan719 - Refactored `VoiceClip` - clip samples and data are now prioritized over the `AudioClip` - audioClip will not be created until you access the `VoiceClip.AudioClip` property - if an audio clip is not loaded, you can load it with `LoadCachedAudioClipAsync` - Refactored demo scene to use `OnAudioFilterRead` to better quality stream playback --------- Co-authored-by: Milan Mikuš <mylan719@gmail.com> Co-authored-by: Milan Mikuš <milan.mikus@riganti.cz> Co-authored-by: Tom Kail <thomas.kail@betterup.co> Co-authored-by: Tom Kail <tkail92@gmail.com>
1 parent 4600769 commit fd14bc5

26 files changed

+892
-421
lines changed

Documentation~/README.md

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ The recommended installation method is though the unity package manager and [Ope
4040
- [com.utilities.extensions](https://github.com/RageAgainstThePixel/com.utilities.extensions)
4141
- [com.utilities.audio](https://github.com/RageAgainstThePixel/com.utilities.audio)
4242
- [com.utilities.encoder.ogg](https://github.com/RageAgainstThePixel/com.utilities.encoder.ogg)
43+
- [com.utilities.encoder.wav](https://github.com/RageAgainstThePixel/com.utilities.encoder.wav)
4344
- [com.utilities.rest](https://github.com/RageAgainstThePixel/com.utilities.rest)
4445

4546
---
@@ -59,7 +60,7 @@ The recommended installation method is though the unity package manager and [Ope
5960
- [Text to Speech](#text-to-speech)
6061
- [Stream Text To Speech](#stream-text-to-speech)
6162
- [Voices](#voices)
62-
- [Get Shared Voices](#get-shared-voices) :new:
63+
- [Get Shared Voices](#get-shared-voices)
6364
- [Get All Voices](#get-all-voices)
6465
- [Get Default Voice Settings](#get-default-voice-settings)
6566
- [Get Voice](#get-voice)
@@ -70,13 +71,13 @@ The recommended installation method is though the unity package manager and [Ope
7071
- [Samples](#samples)
7172
- [Download Voice Sample](#download-voice-sample)
7273
- [Delete Voice Sample](#delete-voice-sample)
73-
- [Dubbing](#dubbing) :new:
74-
- [Dub](#dub) :new:
75-
- [Get Dubbing Metadata](#get-dubbing-metadata) :new:
76-
- [Get Transcript for Dub](#get-transcript-for-dub) :new:
77-
- [Get dubbed file](#get-dubbed-file) :new:
78-
- [Delete Dubbing Project](#delete-dubbing-project) :new:
79-
- [SFX Generation](#sfx-generation) :new:
74+
- [Dubbing](#dubbing)
75+
- [Dub](#dub)
76+
- [Get Dubbing Metadata](#get-dubbing-metadata)
77+
- [Get Transcript for Dub](#get-transcript-for-dub)
78+
- [Get dubbed file](#get-dubbed-file)
79+
- [Delete Dubbing Project](#delete-dubbing-project)
80+
- [SFX Generation](#sfx-generation)
8081
- [History](#history)
8182
- [Get History](#get-history)
8283
- [Get History Item](#get-history-item)
@@ -265,8 +266,8 @@ Convert text to speech.
265266
var api = new ElevenLabsClient();
266267
var text = "The quick brown fox jumps over the lazy dog.";
267268
var voice = (await api.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault();
268-
var defaultVoiceSettings = await api.VoicesEndpoint.GetDefaultVoiceSettingsAsync();
269-
var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(text, voice, defaultVoiceSettings);
269+
var request = new TextToSpeechRequest(voice, text);
270+
var voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(request);
270271
audioSource.PlayOneShot(voiceClip.AudioClip);
271272
```
272273

@@ -284,18 +285,14 @@ Stream text to speech.
284285
var api = new ElevenLabsClient();
285286
var text = "The quick brown fox jumps over the lazy dog.";
286287
var voice = (await api.VoicesEndpoint.GetAllVoicesAsync()).FirstOrDefault();
287-
var partialClips = new Queue<AudioClip>();
288-
var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync(
289-
text,
290-
voice,
291-
partialClip =>
292-
{
293-
// Note: Best to queue them and play them in update loop!
294-
// See TextToSpeech sample demo for details
295-
partialClips.Enqueue(partialClip);
296-
});
297-
// The full completed clip:
298-
audioSource.clip = voiceClip.AudioClip;
288+
var partialClips = new Queue<VoiceClip>();
289+
var request = new TextToSpeechRequest(voice, text, model: Model.EnglishTurboV2, outputFormat: OutputFormat.PCM_44100);
290+
var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync(request, partialClip =>
291+
{
292+
// Note: check demo scene for best practices
293+
// on how to handle playback with OnAudioFilterRead
294+
partialClips.Enqueue(partialClip);
295+
});
299296
```
300297

301298
### [Voices](https://docs.elevenlabs.io/api-reference/voices)

Editor/ElevenLabsDashboard.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,7 +1106,7 @@ private async void GenerateSynthesizedText()
11061106
Directory.CreateDirectory(downloadDir);
11071107
}
11081108

1109-
voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(speechSynthesisTextInput, currentVoiceOption, currentVoiceSettings, currentModelOption);
1109+
voiceClip = await api.TextToSpeechEndpoint.TextToSpeechAsync(new(currentVoiceOption, speechSynthesisTextInput, voiceSettings: currentVoiceSettings, model: currentModelOption));
11101110
voiceClip.CopyIntoProject(editorDownloadDirectory);
11111111
}
11121112
catch (Exception e)
@@ -1225,7 +1225,7 @@ private void RenderVoiceLab()
12251225
EditorGUILayout.Space(EndWidth);
12261226
EditorGUILayout.EndHorizontal();
12271227
EditorGUI.indentLevel++;
1228-
1228+
12291229
EditorGUILayout.BeginHorizontal();
12301230
{
12311231
EditorGUILayout.LabelField(voice.Id, EditorStyles.boldLabel);
@@ -1242,7 +1242,7 @@ private void RenderVoiceLab()
12421242
EditorGUILayout.Space(EndWidth);
12431243
EditorGUILayout.EndHorizontal();
12441244
EditorGUI.indentLevel++;
1245-
1245+
12461246
if (!voiceLabels.TryGetValue(voice.Id, out var cachedLabels))
12471247
{
12481248
cachedLabels = new Dictionary<string, string>();

Runtime/Common/CacheFormat.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Licensed under the MIT License. See LICENSE in the project root for license information.
2+
3+
namespace ElevenLabs
{
    /// <summary>
    /// The cache format options: <see cref="None"/>, <see cref="Ogg"/> or <see cref="Wav"/>.
    /// </summary>
    public enum CacheFormat
    {
        /// <summary>
        /// Default value (0).
        /// NOTE(review): presumably means "do not cache to disk" - confirm against the caller.
        /// </summary>
        None = 0,

        /// <summary>
        /// Ogg format.
        /// </summary>
        Ogg = 1,

        /// <summary>
        /// Wav format.
        /// </summary>
        Wav = 2
    }
}

Runtime/Common/OutputFormatExtensions.cs.meta renamed to Runtime/Common/CacheFormat.cs.meta

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Runtime/Common/GeneratedClip.cs

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,12 @@
22

33
using ElevenLabs.Extensions;
44
using System;
5+
using System.Threading;
6+
using System.Threading.Tasks;
57
using UnityEngine;
68
using UnityEngine.Scripting;
9+
using Utilities.Audio;
10+
using Utilities.WebRequestRest;
711

812
namespace ElevenLabs
913
{
@@ -12,16 +16,30 @@ namespace ElevenLabs
1216
public class GeneratedClip : ISerializationCallbackReceiver
1317
{
1418
/// <summary>
/// Creates a generated clip backed by an already loaded <see cref="AudioClip"/>.
/// </summary>
/// <param name="id">Identifier of the clip; combined with <paramref name="text"/> to build the text hash.</param>
/// <param name="text">Text associated with the clip.</param>
/// <param name="audioClip">The loaded clip. Must not be null; its frequency becomes <see cref="SampleRate"/>.</param>
/// <param name="cachedPath">Optional path of the cached audio file.</param>
/// <exception cref="ArgumentNullException"><paramref name="audioClip"/> is null.</exception>
[Preserve]
internal GeneratedClip(string id, string text, AudioClip audioClip, string cachedPath = null)
{
    // Fail with a descriptive exception instead of a bare NullReferenceException on audioClip.frequency.
    if (audioClip == null)
    {
        throw new ArgumentNullException(nameof(audioClip));
    }

    this.id = id;
    this.text = text;
    TextHash = $"{id}{text}".GenerateGuid();
    textHash = TextHash.ToString();
    // Keep the supplied clip: without this assignment the AudioClip property has nothing to
    // return for instances built through this constructor (they carry no ClipData to decode).
    this.audioClip = audioClip;
    this.cachedPath = cachedPath;
    SampleRate = audioClip.frequency;
}
2428

29+
/// <summary>
/// Creates a generated clip from raw clip data; no <see cref="AudioClip"/> is created here.
/// </summary>
/// <param name="id">Identifier of the clip; combined with <paramref name="text"/> to build the text hash.</param>
/// <param name="text">Text associated with the clip.</param>
/// <param name="clipData">Raw clip bytes, exposed through <see cref="ClipData"/>.</param>
/// <param name="sampleRate">Sample rate of <paramref name="clipData"/>, exposed through <see cref="SampleRate"/>.</param>
/// <param name="cachedPath">Optional path of the cached audio file.</param>
[Preserve]
internal GeneratedClip(string id, string text, ReadOnlyMemory<byte> clipData, int sampleRate, string cachedPath = null)
{
    // Audio payload first.
    ClipData = clipData;
    SampleRate = sampleRate;

    // Identity and serialized metadata.
    this.id = id;
    this.text = text;
    this.cachedPath = cachedPath;

    // The hash is derived from the id immediately followed by the text.
    TextHash = string.Concat(id, text).GenerateGuid();
    textHash = TextHash.ToString();
}
40+
41+
private readonly ReadOnlyMemory<byte> audioData;
42+
2543
[SerializeField]
2644
private string id;
2745

@@ -44,16 +62,73 @@ internal GeneratedClip(string id, string text, AudioClip audioClip, string cache
4462
private AudioClip audioClip;
4563

4664
/// <summary>
/// The Unity <see cref="UnityEngine.AudioClip"/> for this generated clip.
/// </summary>
/// <remarks>
/// When no clip exists yet and <see cref="ClipData"/> is not empty, the clip is created on first
/// access by decoding <see cref="ClipData"/> with <see cref="PCMEncoder"/> into a single channel
/// clip at <see cref="SampleRate"/>. When neither a clip nor clip data is available an error is
/// logged and null is returned.
/// </remarks>
[Preserve]
public AudioClip AudioClip
{
    get
    {
        // Decode from ClipData, which is what the constructors populate. The private
        // audioData field is never assigned, so testing it would never create a clip.
        if (audioClip == null && !ClipData.IsEmpty)
        {
            var pcmData = PCMEncoder.Decode(ClipData.ToArray());
            audioClip = AudioClip.Create(Id, pcmData.Length, 1, SampleRate, false);
            audioClip.SetData(pcmData, 0);
        }

        if (audioClip == null)
        {
            Debug.LogError($"{nameof(audioClip)} is null, try loading it with LoadCachedAudioClipAsync");
        }

        return audioClip;
    }
}
4884

4985
[SerializeField]
5086
private string cachedPath;
5187

5288
[Preserve]
5389
public string CachedPath => cachedPath;
5490

91+
public ReadOnlyMemory<byte> ClipData { get; }
92+
93+
private float[] clipSamples;
94+
95+
/// <summary>
/// The clip's samples as floats.
/// </summary>
/// <remarks>
/// <see cref="ClipData"/> takes priority: when it is not empty it is decoded once with
/// <see cref="PCMEncoder"/> and the result is cached. Otherwise the samples are read once from
/// the loaded audio clip and cached. Returns null when neither source is available.
/// </remarks>
public float[] ClipSamples
{
    get
    {
        if (!ClipData.IsEmpty)
        {
            clipSamples ??= PCMEncoder.Decode(ClipData.ToArray());
        }
        else if (clipSamples == null && audioClip != null)
        {
            // AudioClip.samples is the per-channel length, while GetData fills interleaved
            // samples for every channel, so the buffer must be samples * channels long.
            clipSamples = new float[audioClip.samples * audioClip.channels];
            audioClip.GetData(clipSamples, 0);
        }

        return clipSamples;
    }
}
112+
113+
public int SampleRate { get; }
114+
55115
public void OnBeforeSerialize() => textHash = TextHash.ToString();
56116

57117
/// <summary>
/// Restores <see cref="TextHash"/> from the serialized <c>textHash</c> string.
/// </summary>
/// <remarks>
/// Uses <see cref="Guid.TryParse(string, out Guid)"/> so that a missing or malformed serialized
/// value yields <see cref="Guid.Empty"/> instead of throwing from the deserialization callback.
/// </remarks>
public void OnAfterDeserialize()
    => TextHash = Guid.TryParse(textHash, out var parsedHash) ? parsedHash : Guid.Empty;
118+
119+
public static implicit operator AudioClip(GeneratedClip clip) => clip?.AudioClip;
120+
121+
/// <summary>
/// Loads the audio clip from the cached file at <see cref="CachedPath"/>.
/// </summary>
/// <param name="cancellationToken">Optional token used to cancel the load.</param>
/// <returns>The clip returned by <see cref="Rest.DownloadAudioClipAsync"/> for the cached file.</returns>
/// <exception cref="InvalidOperationException">No cached path is set for this clip.</exception>
public async Task<AudioClip> LoadCachedAudioClipAsync(CancellationToken cancellationToken = default)
{
    // Without a path there is nothing to load; report that explicitly rather than
    // failing with a NullReferenceException while inspecting the extension.
    if (string.IsNullOrWhiteSpace(cachedPath))
    {
        throw new InvalidOperationException($"{nameof(CachedPath)} is not set, there is no cached audio file to load.");
    }

    // Ordinal, case-insensitive comparison: file extensions are not linguistic text
    // and may be stored in upper case.
    var audioType = cachedPath switch
    {
        var path when path.EndsWith(".ogg", StringComparison.OrdinalIgnoreCase) => AudioType.OGGVORBIS,
        var path when path.EndsWith(".wav", StringComparison.OrdinalIgnoreCase) => AudioType.WAV,
        var path when path.EndsWith(".mp3", StringComparison.OrdinalIgnoreCase) => AudioType.MPEG,
        _ => AudioType.UNKNOWN
    };

    return await Rest.DownloadAudioClipAsync($"file://{cachedPath}", audioType, cancellationToken: cancellationToken);
}
58133
}
59134
}

Runtime/Common/OutputFormatExtensions.cs

Lines changed: 0 additions & 24 deletions
This file was deleted.
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Licensed under the MIT License. See LICENSE in the project root for license information.
2+
3+
using Newtonsoft.Json;
4+
using UnityEngine.Scripting;
5+
6+
namespace ElevenLabs
{
    /// <summary>
    /// Represents timing information for a single character in the transcript
    /// </summary>
    [Preserve]
    public class TimestampedTranscriptCharacter
    {
        /// <summary>
        /// Creates the timing entry for a single character.
        /// </summary>
        /// <param name="character">The character being spoken.</param>
        /// <param name="startTime">The time in seconds when the character starts being spoken.</param>
        /// <param name="endTime">The time in seconds when the character finishes being spoken.</param>
        /// <remarks>
        /// Each parameter carries the same JSON name as its property. The properties are get-only,
        /// so the constructor is the only way Json.NET can populate them, and it binds constructor
        /// parameters by JSON name: an unannotated <c>startTime</c> would not receive
        /// <c>character_start_times_seconds</c>.
        /// </remarks>
        [Preserve]
        [JsonConstructor]
        internal TimestampedTranscriptCharacter(
            [JsonProperty("character")] string character,
            [JsonProperty("character_start_times_seconds")] double startTime,
            [JsonProperty("character_end_times_seconds")] double endTime)
        {
            Character = character;
            StartTime = startTime;
            EndTime = endTime;
        }

        /// <summary>
        /// The character being spoken
        /// </summary>
        [Preserve]
        [JsonProperty("character")]
        public string Character { get; }

        /// <summary>
        /// The time in seconds when this character starts being spoken
        /// </summary>
        [Preserve]
        [JsonProperty("character_start_times_seconds")]
        public double StartTime { get; }

        /// <summary>
        /// The time in seconds when this character finishes being spoken
        /// </summary>
        [Preserve]
        [JsonProperty("character_end_times_seconds")]
        public double EndTime { get; }
    }
}

Runtime/Common/TimestampedTranscriptCharacter.cs.meta

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Runtime/Common/VoiceClip.cs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,26 @@ namespace ElevenLabs
1212
public sealed class VoiceClip : GeneratedClip
{
    [SerializeField]
    private Voice voice;

    /// <summary>
    /// Creates a voice clip backed by an already loaded <see cref="AudioClip"/>.
    /// </summary>
    /// <param name="id">Identifier of the clip.</param>
    /// <param name="text">Text associated with the clip.</param>
    /// <param name="voice">The voice stored on this clip.</param>
    /// <param name="audioClip">The loaded audio clip.</param>
    /// <param name="cachedPath">Optional path of the cached audio file.</param>
    [Preserve]
    internal VoiceClip(string id, string text, Voice voice, AudioClip audioClip, string cachedPath = null)
        : base(id, text, audioClip, cachedPath)
        => this.voice = voice;

    /// <summary>
    /// Creates a voice clip from raw clip data and its sample rate.
    /// </summary>
    /// <param name="id">Identifier of the clip.</param>
    /// <param name="text">Text associated with the clip.</param>
    /// <param name="voice">The voice stored on this clip.</param>
    /// <param name="clipData">Raw clip bytes.</param>
    /// <param name="sampleRate">Sample rate of <paramref name="clipData"/>.</param>
    /// <param name="cachedPath">Optional path of the cached audio file.</param>
    [Preserve]
    internal VoiceClip(string id, string text, Voice voice, ReadOnlyMemory<byte> clipData, int sampleRate, string cachedPath = null)
        : base(id, text, clipData, sampleRate, cachedPath)
        => this.voice = voice;

    /// <summary>
    /// The voice stored on this clip.
    /// </summary>
    [Preserve]
    public Voice Voice
    {
        get { return voice; }
    }

    /// <summary>
    /// Per-character transcript timing for this clip; settable only from within the assembly.
    /// </summary>
    [Preserve]
    public TimestampedTranscriptCharacter[] TimestampedTranscriptCharacters { get; internal set; }
}
2737
}

Runtime/Dubbing/DubbingEndpoint.cs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,9 @@
77
using System.Diagnostics;
88
using System.Globalization;
99
using System.IO;
10-
using System.Linq;
1110
using System.Threading;
1211
using System.Threading.Tasks;
1312
using UnityEngine;
14-
using UnityEngine.Networking;
1513
using Utilities.WebRequestRest;
1614
using Debug = UnityEngine.Debug;
1715

Runtime/Extensions/Extensions.cs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Licensed under the MIT License. See LICENSE in the project root for license information.
2+
3+
namespace ElevenLabs.Extensions
{
    /// <summary>
    /// Extension helpers for ElevenLabs types.
    /// </summary>
    public static class Extensions
    {
        /// <summary>
        /// Maps an <see cref="OutputFormat"/> to a sample rate in Hz.
        /// </summary>
        /// <param name="format">The output format to inspect.</param>
        /// <returns>
        /// 16000, 22050 or 24000 for the matching PCM formats; 44100 for
        /// <see cref="OutputFormat.PCM_44100"/> and for every other format.
        /// </returns>
        public static int GetSampleRate(this OutputFormat format)
        {
            switch (format)
            {
                case OutputFormat.PCM_16000:
                    return 16000;
                case OutputFormat.PCM_22050:
                    return 22050;
                case OutputFormat.PCM_24000:
                    return 24000;
                case OutputFormat.PCM_44100:
                default:
                    // Anything that is not one of the lower PCM rates is reported as 44100.
                    return 44100;
            }
        }
    }
}

Runtime/Extensions/Extensions.cs.meta

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)