
Commit 6e979a3

Merge pull request #792 from transformerlab/add/audio-batched-generation
Add batched generation button and modal for audio
2 parents 1549ac8 + d3e30a9

File tree

4 files changed, +237 -6 lines

src/renderer/components/Experiment/Audio/Audio.tsx

Lines changed: 58 additions & 6 deletions
@@ -10,12 +10,9 @@ import {
   Box,
   Select,
   Option,
-  Textarea,
   Stack,
   Slider,
   FormLabel,
-  Switch,
-  Input,
   Modal,
   ModalDialog,
   ModalClose,
@@ -24,6 +21,7 @@ import {
   Card,
   Alert,
 } from '@mui/joy';
+import BatchedAudioModal from './BatchedAudioModal';
 import { useAPI } from '../../../lib/transformerlab-api-sdk';
 import AudioHistory from './AudioHistory';

@@ -175,6 +173,7 @@ export default function Audio() {
   const [topP, setTopP] = React.useState(1.0);
   const [selectedLanguage, setSelectedLanguage] = React.useState('');
   const [selectedVoice, setSelectedVoice] = React.useState('');
+  const [showBatchModal, setShowBatchModal] = React.useState(false);

   const [showSettingsModal, setShowSettingsModal] = React.useState(false);

@@ -194,7 +193,7 @@ export default function Audio() {
     setAudioUrl(null);
     setErrorMessage(null);

-    const result = await sendAndReceiveAudioPath(
+    const result: any = await sendAndReceiveAudioPath(
       experimentInfo?.id,
       currentModel,
       adaptor,
@@ -283,7 +282,14 @@ export default function Audio() {
        }}
      >
        <Typography level="h2">Text to Speech</Typography>
-        <Box sx={{ textAlign: 'right' }}>
+        <Box
+          sx={{
+            textAlign: 'right',
+            display: 'flex',
+            alignItems: 'center',
+            gap: 1,
+          }}
+        >
          <Typography level="body-sm">{currentModel}</Typography>
          {adaptor && (
            <Typography level="body-xs" color="neutral">
@@ -483,12 +489,13 @@ export default function Audio() {
            width: '100%',
          }}
        >
+          {/* Large text input area at the top */}
          {/* Large text input area at the top */}
          <FormControl sx={{ mt: 1 }}>
            <textarea
              value={text}
              onChange={(e) => setText(e.target.value)}
-              placeholder="Enter your text here for speech generation..."
+              placeholder={'Enter your text here for speech generation...'}
              style={{
                minHeight: '100px',
                padding: '16px',
@@ -519,6 +526,12 @@ export default function Audio() {
            >
              Generate Speech
            </Button>
+            <Button
+              variant="outlined"
+              onClick={() => setShowBatchModal(true)}
+            >
+              Create Prompt Batch
+            </Button>
          </Stack>

          {errorMessage && (
@@ -547,6 +560,45 @@ export default function Audio() {
        </ModalDialog>
      </Modal>

+      <BatchedAudioModal
+        open={showBatchModal}
+        onClose={() => setShowBatchModal(false)}
+        isLoading={isLoading}
+        onSubmit={async (lines: string[]) => {
+          setIsLoading(true);
+          setErrorMessage(null);
+          try {
+            const result = await chatAPI.sendBatchedAudio(
+              experimentInfo?.id,
+              currentModel,
+              adaptor,
+              lines,
+              filePrefix,
+              sampleRate,
+              temperature,
+              speed,
+              topP,
+              selectedVoice || undefined,
+              uploadedAudioPath || undefined,
+            );
+            const anyOk = Array.isArray(result)
+              ? result.some((r) => r && r.message)
+              : false;
+            if (!anyOk) {
+              setErrorMessage('Batched generation failed.');
+            }
+            setShowBatchModal(false);
+            handleClearUpload();
+            mutateHistory();
+            if (audioHistoryRef.current) {
+              audioHistoryRef.current.scrollTop = 0;
+            }
+          } finally {
+            setIsLoading(false);
+          }
+        }}
+      />
+
      {/* No Model Running Modal */}
      <Sheet
        sx={{
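
The success check in the new onSubmit handler (result.some((r) => r && r.message)) implies the batch endpoint is expected to return an array of per-prompt result objects, each carrying at least a message field on success. A minimal sketch of that assumed shape in TypeScript (the type name and helper are illustrative, not code from this PR):

// Hypothetical type inferred from the success check above; the backend's
// actual response schema is not defined in this diff.
type BatchedAudioResult = {
  message?: string; // present when a prompt was synthesized successfully
  [key: string]: unknown; // any other fields the backend may return
};

// Mirrors the anyOk check used in Audio.tsx's onSubmit handler.
function anyPromptSucceeded(result: unknown): boolean {
  return (
    Array.isArray(result) &&
    result.some((r) => Boolean(r && (r as BatchedAudioResult).message))
  );
}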

src/renderer/components/Experiment/Audio/BatchedAudioModal.tsx

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
+import * as React from 'react';
+import {
+  Modal,
+  ModalDialog,
+  ModalClose,
+  DialogTitle,
+  Divider,
+  Stack,
+  Button,
+  Typography,
+  FormControl,
+} from '@mui/joy';
+
+type BatchedAudioModalProps = {
+  open: boolean;
+  onClose: () => void;
+  isLoading?: boolean;
+  onSubmit: (prompts: string[]) => Promise<void> | void;
+};
+
+export default function BatchedAudioModal({
+  open,
+  onClose,
+  isLoading = false,
+  onSubmit,
+}: BatchedAudioModalProps) {
+  const [prompts, setPrompts] = React.useState<string[]>(['']);
+
+  function updatePrompt(index: number, value: string) {
+    const next = [...prompts];
+    next[index] = value;
+    setPrompts(next);
+  }
+
+  function addPrompt() {
+    setPrompts((prev) => [...prev, '']);
+  }
+
+  function removePrompt(index: number) {
+    setPrompts((prev) => prev.filter((_, i) => i !== index));
+  }
+
+  function resetPrompts() {
+    setPrompts(['']);
+  }
+
+  async function handleSubmit() {
+    const cleaned = prompts.map((p) => p.trim()).filter((p) => p.length > 0);
+    if (cleaned.length === 0) return;
+    await onSubmit(cleaned);
+  }
+
+  return (
+    <Modal open={open} onClose={onClose}>
+      <ModalDialog variant="outlined" sx={{ minWidth: 600, maxWidth: 900 }}>
+        <ModalClose />
+        <DialogTitle>Send Batched Prompts</DialogTitle>
+        <Divider />
+        <Stack spacing={2} sx={{ mt: 1 }}>
+          <Typography level="body-sm" color="neutral">
+            Add one or more prompts. Each prompt can be multi-line. A separate
+            audio file will be generated for each prompt.
+          </Typography>
+
+          <Stack spacing={1} sx={{ maxHeight: 360, overflowY: 'auto' }}>
+            {prompts.map((value, idx) => (
+              <FormControl key={idx} sx={{ gap: 0.5 }}>
+                <textarea
+                  value={value}
+                  onChange={(e) => updatePrompt(idx, e.target.value)}
+                  placeholder={`Prompt ${idx + 1}`}
+                  style={{
+                    minHeight: '100px',
+                    padding: '12px',
+                    borderRadius: '8px',
+                    fontSize: '14px',
+                    lineHeight: '1.5',
+                    overflowY: 'auto',
+                    width: '100%',
+                  }}
+                />
+                <Stack
+                  direction="row"
+                  spacing={1}
+                  sx={{ alignSelf: 'flex-end' }}
+                >
+                  {prompts.length > 1 && (
+                    <Button
+                      size="sm"
+                      variant="plain"
+                      color="danger"
+                      onClick={() => removePrompt(idx)}
+                    >
+                      Remove
+                    </Button>
+                  )}
+                </Stack>
+              </FormControl>
+            ))}
+          </Stack>
+
+          <Stack direction="row" spacing={1}>
+            <Button variant="outlined" onClick={addPrompt}>
+              + Add Prompt
+            </Button>
+          </Stack>
+
+          <Stack direction="row" spacing={1} justifyContent="flex-end">
+            <Button variant="plain" onClick={onClose}>
+              Cancel
+            </Button>
+            <Button variant="outlined" onClick={resetPrompts}>
+              Reset
+            </Button>
+            <Button
+              variant="solid"
+              disabled={isLoading}
+              loading={isLoading}
+              onClick={handleSubmit}
+            >
+              Send Batch
+            </Button>
+          </Stack>
+        </Stack>
+      </ModalDialog>
+    </Modal>
+  );
+}
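
The modal is fully controlled by its parent: it only collects and trims prompt strings, then hands the non-empty ones to onSubmit, so the caller decides how to run the batch. A minimal standalone usage sketch, assuming only the props defined above (the handleBatch handler and its body are illustrative placeholders, not code from this PR):

import * as React from 'react';
import { Button } from '@mui/joy';
import BatchedAudioModal from './BatchedAudioModal';

export function BatchedAudioExample() {
  const [open, setOpen] = React.useState(false);

  // Illustrative handler: the real app forwards prompts to sendBatchedAudio.
  const handleBatch = async (prompts: string[]) => {
    console.log('Prompts to synthesize:', prompts);
    setOpen(false);
  };

  return (
    <>
      <Button variant="outlined" onClick={() => setOpen(true)}>
        Create Prompt Batch
      </Button>
      <BatchedAudioModal
        open={open}
        onClose={() => setOpen(false)}
        onSubmit={handleBatch}
      />
    </>
  );
}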

src/renderer/lib/api-client/chat.ts

Lines changed: 50 additions & 0 deletions
@@ -764,6 +764,56 @@ export async function sendBatchedChat(
   return results;
 }

+// Batched Text-to-Speech: send multiple texts to generate multiple audios
+export async function sendBatchedAudio(
+  experimentId: number,
+  currentModel: string,
+  adaptor: string,
+  texts: string[],
+  filePrefix: string,
+  sampleRate: number,
+  temperature: number,
+  speed: number,
+  topP: number,
+  voice?: string,
+  audioPath?: string,
+  batchSize: number = 64,
+): Promise<any[] | null> {
+  const data: any = {
+    experiment_id: experimentId,
+    model: currentModel,
+    adaptor: adaptor,
+    texts: texts,
+    file_prefix: filePrefix,
+    sample_rate: sampleRate,
+    temperature: temperature,
+    speed: speed,
+    top_p: topP,
+    batch_size: batchSize,
+    inference_url: `${INFERENCE_SERVER_URL()}v1/audio/speech`,
+  };
+
+  if (voice) data.voice = voice;
+  if (audioPath) data.audio_path = audioPath;
+
+  try {
+    const response = await fetch(`${API_URL()}batch/audio/speech`, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        accept: 'application/json',
+      },
+      body: JSON.stringify(data),
+    });
+    if (!response.ok) return null;
+    const results = await response.json();
+    return results;
+  } catch (err) {
+    console.log('Error in sendBatchedAudio:', err);
+    return null;
+  }
+}
+
 export async function callTool(
   function_name: String,
   function_args: Object = {},
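
For a sense of how the new client helper is called and what it posts to the batch/audio/speech route, here is a hedged usage sketch; the argument values are invented for the example, and the response handling follows only what the code above guarantees (a parsed JSON array on success, null on any HTTP or network error):

import { sendBatchedAudio } from './chat';

// Illustrative values only; in the app these come from the Audio screen's state.
async function demoBatch() {
  const results = await sendBatchedAudio(
    1, // experimentId
    'some-tts-model', // currentModel (hypothetical model name)
    '', // adaptor
    ['Hello there.', 'A second prompt.'], // texts: one audio file per entry
    'batch_demo', // filePrefix
    24000, // sampleRate
    0.7, // temperature
    1.0, // speed
    1.0, // topP
    // voice and audioPath omitted; batchSize defaults to 64
  );

  // sendBatchedAudio never throws: it resolves to null on failure.
  if (results === null) {
    console.log('Batched audio request failed');
  }
}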

src/renderer/lib/transformerlab-api-sdk.ts

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ export {
   sendCompletionReactWay,
   sendBatchedCompletion,
   sendBatchedChat,
+  sendBatchedAudio,
   callTool,
   getToolsForCompletions,
   getEmbeddings,
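
With this re-export, UI code can import the new helper from the SDK entry point instead of reaching into chat.ts directly. Audio.tsx calls it as chatAPI.sendBatchedAudio, which presumably corresponds to a namespace import along these lines (the relative path and alias are assumptions based on the other imports shown above, not confirmed by this diff):

// From a component under src/renderer/components/Experiment/Audio/
import * as chatAPI from '../../../lib/transformerlab-api-sdk';

// ...later, inside an async handler:
// await chatAPI.sendBatchedAudio(experimentId, model, adaptor, texts, prefix, 24000, 0.7, 1.0, 1.0);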
