Skip to content

Commit feffb50

Browse files
authored
Whisper Audio API Implementation (#29)
Added - Audio Transcription endpoint implemented - Audio translation endpoint implemented - Whisper sample project created Changed - Extension method for adding files merged into AddFile that takes contentType param - Sample project paths in package.json updated
1 parent a31a1c3 commit feffb50

21 files changed

+2850
-26
lines changed

Runtime/DataTypes.cs

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,31 @@ public struct ChatMessage
118118
}
119119
#endregion
120120

121+
#region Audio Transcriptions Data Types
122+
123+
/// <summary>
/// Fields shared by the audio transcription and translation requests.
/// Each property is sent as one multipart form field by the audio endpoints.
/// </summary>
public class CreateAudioRequestBase
124+
{
125+
/// <summary>Path of the audio file; it is read from disk and uploaded as the "file" form field.</summary>
public string File { get; set; }
126+
/// <summary>Value sent as the "model" form field.</summary>
public string Model { get; set; }
127+
/// <summary>Value sent as the "prompt" form field.</summary>
public string Prompt { get; set; }
128+
/// <summary>Value sent as the "response_format" form field; defaults to <see cref="AudioResponseFormat.Json"/>.</summary>
public string ResponseFormat { get; set; } = AudioResponseFormat.Json;
129+
/// <summary>Value sent as the "temperature" form field; defaults to 0.</summary>
public float? Temperature { get; set; } = 0;
130+
}
131+
132+
/// <summary>
/// Request for the audio transcription endpoint: the shared audio fields plus a language.
/// </summary>
public class CreateAudioTranscriptionsRequest: CreateAudioRequestBase
133+
{
134+
/// <summary>Value sent as the "language" form field.</summary>
public string Language { get; set; }
135+
}
136+
137+
/// <summary>
/// Request for the audio translation endpoint; it adds no fields to <see cref="CreateAudioRequestBase"/>.
/// </summary>
public class CreateAudioTranslationRequest: CreateAudioRequestBase { }
138+
139+
/// <summary>
/// Response type returned by both the audio transcription and the audio translation calls.
/// </summary>
public struct CreateAudioResponse: IResponse
140+
{
141+
/// <summary>Error details. NOTE(review): presumably only populated when the request fails; confirm against the dispatcher.</summary>
public ApiError Error { get; set; }
142+
/// <summary>Text payload of the response. NOTE(review): presumably the transcribed or translated text; confirm against the API.</summary>
public string Text { get; set; }
143+
}
144+
#endregion
145+
121146
#region Completions API Data Types
122147
public sealed class CreateCompletionRequest
123148
{
@@ -364,7 +389,16 @@ public static class ImageResponseFormat
364389
public const string Url = "url";
365390
public const string Base64Json = "b64_json";
366391
}
367-
392+
393+
/// <summary>
/// String constants for the response format of audio requests;
/// <see cref="Json"/> is the default of CreateAudioRequestBase.ResponseFormat.
/// </summary>
public static class AudioResponseFormat
394+
{
395+
public const string Json = "json";
396+
public const string Text = "text";
397+
public const string Srt = "srt";
398+
public const string VerboseJson = "verbose_json";
399+
public const string Vtt = "vtt";
400+
}
401+
368402
public static class ModerationModel
369403
{
370404
public const string Stable = "text-moderation-stable";

Runtime/OpenAIApi.cs

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Threading.Tasks;
66
using UnityEngine.Networking;
77
using System.Collections.Generic;
8+
using System.IO;
89
using Newtonsoft.Json.Serialization;
910

1011
namespace OpenAI
@@ -166,6 +167,11 @@ public async Task<CreateCompletionResponse> CreateCompletion(CreateCompletionReq
166167
return await DispatchRequest<CreateCompletionResponse>(path, UnityWebRequest.kHttpVerbPOST, payload);
167168
}
168169

170+
/// <summary>
171+
/// Creates a chat completion request as in ChatGPT.
172+
/// </summary>
173+
/// <param name="request">See <see cref="CreateChatCompletionRequest"/></param>
174+
/// <returns>See <see cref="CreateChatCompletionResponse"/></returns>
169175
public async Task<CreateChatCompletionResponse> CreateChatCompletion(CreateChatCompletionRequest request)
170176
{
171177
var path = $"{BASE_PATH}/chat/completions";
@@ -207,8 +213,8 @@ public async Task<CreateImageResponse> CreateImageEdit(CreateImageEditRequest re
207213
var path = $"{BASE_PATH}/images/edits";
208214

209215
var form = new List<IMultipartFormSection>();
210-
form.AddImage(request.Image, "image");
211-
form.AddImage(request.Mask, "mask");
216+
form.AddFile(request.Image, "image", "image/png");
217+
form.AddFile(request.Mask, "mask", "image/png");
212218
form.AddValue(request.Prompt, "prompt");
213219
form.AddValue(request.N, "n");
214220
form.AddValue(request.Size, "size");
@@ -227,7 +233,7 @@ public async Task<CreateImageResponse> CreateImageVariation(CreateImageVariation
227233
var path = $"{BASE_PATH}/images/variations";
228234

229235
var form = new List<IMultipartFormSection>();
230-
form.AddImage(request.Image, "image");
236+
form.AddFile(request.Image, "image", "image/png");
231237
form.AddValue(request.N, "n");
232238
form.AddValue(request.Size, "size");
233239
form.AddValue(request.ResponseFormat, "response_format");
@@ -247,6 +253,45 @@ public async Task<CreateEmbeddingsResponse> CreateEmbeddings(CreateEmbeddingsReq
247253
var payload = CreatePayload(request);
248254
return await DispatchRequest<CreateEmbeddingsResponse>(path, UnityWebRequest.kHttpVerbPOST, payload);
249255
}
256+
257+
/// <summary>
///     Transcribes audio into the input language.
/// </summary>
/// <remarks>
///     The audio file named by the request is read from disk and uploaded as a
///     multipart form together with the remaining request fields.
/// </remarks>
/// <param name="request">See <see cref="CreateAudioTranscriptionsRequest"/></param>
/// <returns>See <see cref="CreateAudioResponse"/></returns>
public async Task<CreateAudioResponse> CreateAudioTranscription(CreateAudioTranscriptionsRequest request)
{
    var path = $"{BASE_PATH}/audio/transcriptions";

    // Path.GetExtension keeps the leading period (".wav"), which would produce the
    // malformed content type "audio/.wav"; strip it before building the type.
    // It returns null for a null path, hence the null-conditional call.
    var extension = Path.GetExtension(request.File)?.TrimStart('.');

    var form = new List<IMultipartFormSection>();
    form.AddFile(request.File, "file", $"audio/{extension}");
    form.AddValue(request.Model, "model");
    form.AddValue(request.Prompt, "prompt");
    form.AddValue(request.ResponseFormat, "response_format");
    form.AddValue(request.Temperature, "temperature");
    form.AddValue(request.Language, "language");

    return await DispatchRequest<CreateAudioResponse>(path, form);
}
276+
277+
/// <summary>
///     Translates audio into English.
/// </summary>
/// <remarks>
///     The audio file named by the request is read from disk and uploaded as a
///     multipart form together with the remaining request fields.
/// </remarks>
/// <param name="request">See <see cref="CreateAudioTranslationRequest"/></param>
/// <returns>See <see cref="CreateAudioResponse"/></returns>
public async Task<CreateAudioResponse> CreateAudioTranslation(CreateAudioTranslationRequest request)
{
    var path = $"{BASE_PATH}/audio/translations";

    // Path.GetExtension keeps the leading period (".wav"), which would produce the
    // malformed content type "audio/.wav"; strip it before building the type.
    // It returns null for a null path, hence the null-conditional call.
    var extension = Path.GetExtension(request.File)?.TrimStart('.');

    var form = new List<IMultipartFormSection>();
    form.AddFile(request.File, "file", $"audio/{extension}");
    form.AddValue(request.Model, "model");
    form.AddValue(request.Prompt, "prompt");
    form.AddValue(request.ResponseFormat, "response_format");
    form.AddValue(request.Temperature, "temperature");

    return await DispatchRequest<CreateAudioResponse>(path, form);
}
250295

251296
/// <summary>
252297
/// Returns a list of files that belong to the user's organization.
@@ -270,7 +315,7 @@ public async Task<OpenAIFile> CreateFile(CreateFileRequest request)
270315
var path = $"{BASE_PATH}/files";
271316

272317
var form = new List<IMultipartFormSection>();
273-
form.AddJsonl(request.File, "file");
318+
form.AddFile(request.File, "file", "application/json");
274319
form.AddValue(request.Purpose, "purpose");
275320

276321
return await DispatchRequest<OpenAIFileResponse>(path, form);

Runtime/Utils/ExtensionMethods.cs

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,34 +7,19 @@ namespace OpenAI
77
public static class ExtensionMethods
88
{
99
/// <summary>
10-
/// Read a PNG file and add it to this form.
10+
/// Read a file and add it to this form.
1111
/// </summary>
1212
/// <param name="form">List of multipart form sections.</param>
1313
/// <param name="path">Path of the file to read.</param>
1414
/// <param name="name">Name of the form field.</param>
15-
public static void AddImage(this List<IMultipartFormSection> form, string path, string name)
15+
/// <param name="contentType">Content type of the file.</param>
16+
public static void AddFile(this List<IMultipartFormSection> form, string path, string name, string contentType)
1617
{
1718
if (path != null)
1819
{
1920
var data = File.ReadAllBytes(path);
2021
var fileName = Path.GetFileName(path);
21-
form.Add(new MultipartFormFileSection(name, data, fileName, "image/png"));
22-
}
23-
}
24-
25-
/// <summary>
26-
/// Read a JSONL file and add it to this form.
27-
/// </summary>
28-
/// <param name="form">List of multipart form sections.</param>
29-
/// <param name="path">Path of the file to read.</param>
30-
/// <param name="name">Name of the form field.</param>
31-
public static void AddJsonl(this List<IMultipartFormSection> form, string path, string name)
32-
{
33-
if (path != null)
34-
{
35-
var data = File.ReadAllBytes(path);
36-
var fileName = Path.GetFileName(path);
37-
form.Add(new MultipartFormFileSection(name, data, fileName, "application/json"));
22+
form.Add(new MultipartFormFileSection(name, data, fileName, contentType));
3823
}
3924
}
4025

Samples~/Whisper.meta

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Samples~/Whisper/SavWav.cs

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
// Copyright (c) 2012 Calvin Rien
2+
// http://the.darktable.com
3+
//
4+
// This software is provided 'as-is', without any express or implied warranty. In
5+
// no event will the authors be held liable for any damages arising from the use
6+
// of this software.
7+
//
8+
// Permission is granted to anyone to use this software for any purpose,
9+
// including commercial applications, and to alter it and redistribute it freely,
10+
// subject to the following restrictions:
11+
//
12+
// 1. The origin of this software must not be misrepresented; you must not claim
13+
// that you wrote the original software. If you use this software in a product,
14+
// an acknowledgment in the product documentation would be appreciated but is not
15+
// required.
16+
//
17+
// 2. Altered source versions must be plainly marked as such, and must not be
18+
// misrepresented as being the original software.
19+
//
20+
// 3. This notice may not be removed or altered from any source distribution.
21+
//
22+
// =============================================================================
23+
//
24+
// derived from Gregorio Zanon's script
25+
// http://forum.unity3d.com/threads/119295-Writing-AudioListener.GetOutputData-to-wav-problem?p=806734&viewfull=1#post806734
26+
27+
using System;
28+
using System.IO;
29+
using UnityEngine;
30+
using System.Collections.Generic;
31+
32+
/// <summary>
/// Helpers for writing a Unity <see cref="AudioClip"/> to a 16-bit PCM WAV file
/// and for trimming leading and trailing silence from sample data.
/// </summary>
public static class SavWav {

    // Size in bytes of the RIFF/WAVE header produced by WriteHeader.
    const int HEADER_SIZE = 44;

    // Every sample is stored as a signed 16-bit integer.
    const int BYTES_PER_SAMPLE = 2;

    /// <summary>
    /// Writes <paramref name="clip"/> as a WAV file below Application.persistentDataPath.
    /// </summary>
    /// <param name="filename">File name or relative path; ".wav" is appended when missing.</param>
    /// <param name="clip">Clip whose sample data is written.</param>
    /// <returns>Always true; failures surface as exceptions.</returns>
    public static bool Save(string filename, AudioClip clip) {
        // Ordinal comparison: the extension check must not depend on the current culture.
        if (!filename.EndsWith(".wav", StringComparison.OrdinalIgnoreCase)) {
            filename += ".wav";
        }

        var filepath = Path.Combine(Application.persistentDataPath, filename);

        // Make sure directory exists if user is saving to sub dir.
        Directory.CreateDirectory(Path.GetDirectoryName(filepath));

        using (var fileStream = CreateEmpty(filepath)) {
            // The sample data is written first; the header is filled in afterwards
            // because it records the final length of the file.
            ConvertAndWrite(fileStream, clip);
            WriteHeader(fileStream, clip);
        }

        return true; // TODO: return false if there's a failure saving the file
    }

    /// <summary>
    /// Returns a new clip holding <paramref name="clip"/> without leading and trailing silence.
    /// </summary>
    /// <param name="clip">Clip to read the samples from.</param>
    /// <param name="min">Absolute amplitude a sample must exceed to count as audible.</param>
    /// <returns>The trimmed clip, or null when no sample exceeds <paramref name="min"/>.</returns>
    public static AudioClip TrimSilence(AudioClip clip, float min) {
        // clip.samples counts frames per channel, whereas GetData fills interleaved
        // samples, so the buffer needs samples * channels entries.
        var samples = new float[clip.samples * clip.channels];

        clip.GetData(samples, 0);

        return TrimSilence(new List<float>(samples), min, clip.channels, clip.frequency);
    }

    /// <summary>
    /// Same as the six-argument overload with <c>_3D</c> and <c>stream</c> set to false.
    /// </summary>
    public static AudioClip TrimSilence(List<float> samples, float min, int channels, int hz) {
        return TrimSilence(samples, min, channels, hz, false, false);
    }

    /// <summary>
    /// Removes leading and trailing silence from interleaved sample data and wraps
    /// the remainder in a new clip named "TempClip".
    /// </summary>
    /// <param name="samples">Interleaved samples; the list is trimmed in place.</param>
    /// <param name="min">Absolute amplitude a sample must exceed to count as audible.</param>
    /// <param name="channels">Number of interleaved channels.</param>
    /// <param name="hz">Sample frequency of the data.</param>
    /// <param name="_3D">Forwarded to AudioClip.Create.</param>
    /// <param name="stream">Forwarded to AudioClip.Create.</param>
    /// <returns>The trimmed clip, or null when no sample exceeds <paramref name="min"/>.</returns>
    public static AudioClip TrimSilence(List<float> samples, float min, int channels, int hz, bool _3D, bool stream) {
        // Trim whole frames (one sample per channel) so the channels stay aligned.
        int frameSize = Mathf.Max(1, channels);
        int frameCount = samples.Count / frameSize;

        int firstFrame = 0;
        while (firstFrame < frameCount && !IsFrameAudible(samples, firstFrame, frameSize, min)) {
            firstFrame++;
        }

        if (firstFrame == frameCount) {
            // Nothing audible (or no data at all): there is no clip to build.
            samples.Clear();
            return null;
        }

        // firstFrame is known to be audible, so the scan can stop there.
        int lastFrame = frameCount - 1;
        while (lastFrame > firstFrame && !IsFrameAudible(samples, lastFrame, frameSize, min)) {
            lastFrame--;
        }

        // Cut the tail first so the head indices are still valid, and keep the
        // last audible frame itself.
        int keepEnd = (lastFrame + 1) * frameSize;
        samples.RemoveRange(keepEnd, samples.Count - keepEnd);
        samples.RemoveRange(0, firstFrame * frameSize);

        // AudioClip.Create takes the length in frames, not in interleaved samples.
        var clip = AudioClip.Create("TempClip", samples.Count / frameSize, channels, hz, _3D, stream);

        clip.SetData(samples.ToArray(), 0);

        return clip;
    }

    // True when any channel of the given frame exceeds the silence threshold.
    static bool IsFrameAudible(List<float> samples, int frame, int frameSize, float min) {
        int start = frame * frameSize;

        for (int offset = 0; offset < frameSize; offset++) {
            if (Mathf.Abs(samples[start + offset]) > min) {
                return true;
            }
        }

        return false;
    }

    // Creates (or truncates) the file and reserves zeroed space for the header.
    static FileStream CreateEmpty(string filepath) {
        var fileStream = new FileStream(filepath, FileMode.Create);

        fileStream.Write(new byte[HEADER_SIZE], 0, HEADER_SIZE);

        return fileStream;
    }

    // Converts the clip's float samples to 16-bit PCM and writes them at the
    // stream's current position.
    static void ConvertAndWrite(FileStream fileStream, AudioClip clip) {
        // clip.samples counts frames per channel, whereas GetData fills interleaved
        // samples, so the buffer needs samples * channels entries.
        var samples = new float[clip.samples * clip.channels];

        clip.GetData(samples, 0);

        var bytesData = new byte[samples.Length * BYTES_PER_SAMPLE];

        const float rescaleFactor = 32767f; // to convert float to Int16

        for (int i = 0; i < samples.Length; i++) {
            // Clamp first so an out-of-range sample saturates instead of wrapping around.
            short value = (short) (Mathf.Clamp(samples[i], -1f, 1f) * rescaleFactor);

            // Low byte first: WAV sample data is little-endian.
            bytesData[i * BYTES_PER_SAMPLE] = (byte) (value & 0xFF);
            bytesData[i * BYTES_PER_SAMPLE + 1] = (byte) ((value >> 8) & 0xFF);
        }

        fileStream.Write(bytesData, 0, bytesData.Length);
    }

    // Fills in the 44-byte RIFF/WAVE header at the start of the stream. Must run
    // after the sample data has been written because it records the stream length.
    static void WriteHeader(FileStream fileStream, AudioClip clip) {
        int hz = clip.frequency;
        int channels = clip.channels;
        int totalLength = (int) fileStream.Length;

        fileStream.Seek(0, SeekOrigin.Begin);

        WriteTag(fileStream, "RIFF");
        WriteInt32(fileStream, totalLength - 8); // chunk size: everything after this field
        WriteTag(fileStream, "WAVE");

        WriteTag(fileStream, "fmt ");
        WriteInt32(fileStream, 16); // size of the fmt sub-chunk
        WriteUInt16(fileStream, 1); // audio format 1
        WriteUInt16(fileStream, (ushort) channels);
        WriteInt32(fileStream, hz);
        WriteInt32(fileStream, hz * channels * BYTES_PER_SAMPLE); // byte rate
        WriteUInt16(fileStream, (ushort) (channels * BYTES_PER_SAMPLE)); // block align
        WriteUInt16(fileStream, 16); // bits per sample

        WriteTag(fileStream, "data");
        WriteInt32(fileStream, totalLength - HEADER_SIZE); // bytes of sample data actually written
    }

    // Writes a four-character chunk identifier.
    static void WriteTag(FileStream fileStream, string tag) {
        byte[] bytes = System.Text.Encoding.UTF8.GetBytes(tag);
        fileStream.Write(bytes, 0, 4);
    }

    // Writes a 32-bit header field.
    static void WriteInt32(FileStream fileStream, int value) {
        WriteLittleEndian(fileStream, BitConverter.GetBytes(value));
    }

    // Writes a 16-bit header field.
    static void WriteUInt16(FileStream fileStream, ushort value) {
        WriteLittleEndian(fileStream, BitConverter.GetBytes(value));
    }

    // BitConverter uses the platform byte order; flip it where that is big-endian.
    static void WriteLittleEndian(FileStream fileStream, byte[] bytes) {
        if (!BitConverter.IsLittleEndian) {
            Array.Reverse(bytes);
        }

        fileStream.Write(bytes, 0, bytes.Length);
    }
}

Samples~/Whisper/SavWav.cs.meta

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)