Durable video generation (golem:video-generation) #51
Conversation
@jdegoes Hi, I need help with some clarifications and WIT changes. I have also proposed a WIT with the changes in the next comment.

Current State of PR (completed parts)

WIT Changes

Config
This enum does not align with any of the APIs. This config is from Runway's text-to-image; since I am doing text-to-image as part of text-to-video, I can fit this in, but it feels out of place and would sit better elsewhere.
Minor changes -
Avatar
This matches Kling's lip-sync; maybe they supported avatars in the past, but Kling can now do lip-sync on any input video (polling returns a failed(face-detection) error if there is no face).
voice-id matches how Kling supports audio: in the speak function it is a choice of either [voice-id, text, speed] or [input audio file], with no background audio in either case.

Effects
Both style-guide and background removal are for image-to-image (supported by Runway and Stability).

Suggested Replacements -
Others
Template

I did not understand this at all; I could not find any API references. Am I meant to pre-create a template with an already existing prompt/image so it can be used as a test?

I am fairly confident in my proposed changes, as I am familiar with the API now that I have implemented text-to-video and image-to-video. Official documentation.
This is my proposed WIT. It mirrors the features available from providers while remaining consistent with the original WIT. It doesn't include Kling's advanced camera and mask options.

package golem:video-generation;
interface types {
  variant video-error {
    invalid-input(string),
    unsupported-feature(string),
    quota-exceeded,
    generation-failed(string),
    cancelled,
    internal-error(string),
  }

  variant media-input {
    text(string),
    image(reference),
  }

  // Added prompt
  record reference {
    data: input-image,
    prompt: option<string>,
    role: option<image-role>,
  }

  // Changed to first and last
  enum image-role {
    first,
    last,
  }

  record input-image {
    data: media-data,
  }

  record base-video {
    data: media-data,
  }

  record narration {
    data: media-data,
  }

  variant media-data {
    url(string),
    bytes(list<u8>),
  }
  record generation-config {
    negative-prompt: option<string>,
    seed: option<u64>,
    scheduler: option<string>,
    guidance-scale: option<f32>,
    aspect-ratio: option<aspect-ratio>,
    duration-seconds: option<f32>,
    resolution: option<resolution>,
    enable-audio: option<bool>,
    enhance-prompt: option<bool>,
    provider-options: list<kv>,
    // Added model and lastframe (Kling only)
    model: option<string>,
    lastframe: option<input-image>,
  }
  enum aspect-ratio {
    square,
    portrait,
    landscape,
    cinema,
  }

  enum resolution {
    sd,
    hd,
    fhd,
    uhd,
  }

  record kv {
    key: string,
    value: string,
  }

  record video {
    uri: option<string>,
    base64-bytes: option<list<u8>>,
    mime-type: string,
    width: option<u32>,
    height: option<u32>,
    fps: option<f32>,
    duration-seconds: option<f32>,
  }

  variant job-status {
    pending,
    running,
    succeeded,
    failed(string),
  }

  record video-result {
    status: job-status,
    videos: option<list<video>>,
    metadata: option<list<kv>>,
  }
}
interface video-generation {
  use types.{media-input, generation-config, video-result, video-error};

  // Changed output from string to result<string, video-error>:
  // easier to pass invalid-input and generation errors.
  // Applies to all generate funcs.
  generate: func(input: media-input, config: generation-config) -> result<string, video-error>;
  poll: func(job-id: string) -> result<video-result, video-error>;
  cancel: func(job-id: string) -> result<string, video-error>;
}
interface lip-sync {
  use types.{video-error, media-data, base-video};

  // The two possible audio sources: synthesize speech from text
  // (optionally with a voice-id and a speed), or supply an input audio file.
  // WIT variant cases carry a single payload type, so the text fields
  // are wrapped in a record.
  record text-to-speech {
    text: string,
    voice-id: option<string>,
    speed: u32,
  }

  variant audio-source {
    from-text(text-to-speech),
    from-audio(media-data),
  }

  generate: func(
    input: base-video,
    audio: audio-source,
  ) -> result<string, video-error>;

  record voice-info {
    voice-id: string,
    name: string,
    language: string,
    gender: option<string>,
    preview-url: option<string>,
  }

  list-voices: func(language: option<string>) -> result<list<voice-info>, video-error>;
}
interface advanced {
  use types.{video-error, base-video, input-image, generation-config};

  // Supported by Kling and Veo
  extend-video: func(
    input: base-video,
    prompt: option<string>,
    duration: option<f32>,
  ) -> result<string, video-error>;

  // Supported by Runway
  upscale-video: func(
    input: base-video,
  ) -> result<string, video-error>;

  // Supported by Kling only
  video-effects: func(
    input: input-image,
    second-image: option<input-image>,
    effect: string,
  ) -> result<string, video-error>;

  // Multi-image generation, Kling only
  multi-image-generation: func(
    input: input-image,
    other-images: list<input-image>, // up to 3 more
    config: generation-config,
  ) -> result<string, video-error>;
}
// I have left this as is; I would like a clarification for this.
// I also don't get why there is no introspection.
interface templates {
  use types.{video-error, kv};

  generate-from-template: func(
    template-id: string,
    variables: list<kv>,
  ) -> string;
}
// Worlds and interfaces share a namespace within a package, so the world
// needs a name distinct from the video-generation interface; the interfaces
// are exported directly.
world video-generation-world {
  export video-generation;
  export lip-sync;
  export advanced;
  export templates;
}
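
To make the ergonomics concrete, here is a rough sketch of Rust guest code in a component that imports the proposed interfaces, showing why generate returning result<string, video-error> is nicer to consume. It assumes wit-bindgen-generated bindings; every module path, type name, and signature below is an assumption derived from the WIT above, not an existing golem API.

use std::{thread, time::Duration};

// Hypothetical wit-bindgen output for package golem:video-generation.
use bindings::golem::video_generation::lip_sync::{self, AudioSource, TextToSpeech};
use bindings::golem::video_generation::types::{
    BaseVideo, GenerationConfig, JobStatus, MediaData, MediaInput,
};
use bindings::golem::video_generation::video_generation as video;

fn generate_clip() -> Result<(), String> {
    let config = GenerationConfig {
        negative_prompt: None,
        seed: None,
        scheduler: None,
        guidance_scale: None,
        aspect_ratio: None,
        duration_seconds: Some(5.0),
        resolution: None,
        enable_audio: Some(false),
        enhance_prompt: None,
        provider_options: vec![],
        model: None,
        lastframe: None,
    };

    // generate returns result<string, video-error>, so invalid input is
    // rejected here instead of surfacing later as a failed job-status.
    let job_id = video::generate(&MediaInput::Text("a red fox at dawn".into()), &config)
        .map_err(|_| "submit failed".to_string())?;

    // Poll the durable job until it leaves pending/running.
    loop {
        let status = video::poll(&job_id)
            .map_err(|_| "poll failed".to_string())?
            .status;
        match status {
            JobStatus::Pending | JobStatus::Running => thread::sleep(Duration::from_secs(2)),
            JobStatus::Succeeded => break,
            JobStatus::Failed(msg) => return Err(msg),
        }
    }

    // Lip-sync: the audio-source variant makes Kling's either/or explicit —
    // synthesize speech from text (+ optional voice-id), or pass audio.
    let tts = TextToSpeech { text: "hello".into(), voice_id: None, speed: 1 };
    lip_sync::generate(
        &BaseVideo { data: MediaData::Url("https://example.com/base.mp4".into()) },
        &AudioSource::FromText(tts),
    )
    .map_err(|_| "lip-sync submit failed".to_string())?;

    Ok(())
}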
I did not spend much time on this WIT so I am glad you took a closer look. I like your proposed revisions and would suggest a few more:
/closes #44
/claim #44
Runway Test Video