Skip to content

Text to voice feature #1735

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
3 changes: 3 additions & 0 deletions submodules/AccountContext/Sources/MediaManager.swift
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ public func peerMessageMediaPlayerType(_ message: EngineMessage) -> MediaManager
break
}
}
if let attribute = message.attributes.first(where: { $0 is TextTranscriptionMessageAttribute }) as? TextTranscriptionMessageAttribute {
file = attribute.file
}
return file
}

Expand Down
1 change: 1 addition & 0 deletions submodules/AttachmentUI/Sources/AttachmentPanel.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1247,6 +1247,7 @@ final class AttachmentPanel: ASDisplayNode, ASScrollViewDelegate {
}, updateHistoryFilter: { _ in
}, updateDisplayHistoryFilterAsList: { _ in
}, requestLayout: { _ in
}, startTranscribingText: { _ in
}, chatController: {
return nil
}, statuses: nil)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ public final class ChatPanelInterfaceInteraction {
public let openBoostToUnrestrict: () -> Void
public let updateVideoTrimRange: (Double, Double, Bool, Bool) -> Void
public let requestLayout: (ContainedViewLayoutTransition) -> Void
public let startTranscribingText: (Message) -> Void
public let chatController: () -> ViewController?
public let statuses: ChatPanelInterfaceInteractionStatuses?

Expand Down Expand Up @@ -292,6 +293,7 @@ public final class ChatPanelInterfaceInteraction {
updateHistoryFilter: @escaping ((ChatPresentationInterfaceState.HistoryFilter?) -> ChatPresentationInterfaceState.HistoryFilter?) -> Void,
updateDisplayHistoryFilterAsList: @escaping (Bool) -> Void,
requestLayout: @escaping (ContainedViewLayoutTransition) -> Void,
startTranscribingText: @escaping (Message) -> Void,
chatController: @escaping () -> ViewController?,
statuses: ChatPanelInterfaceInteractionStatuses?
) {
Expand Down Expand Up @@ -408,7 +410,7 @@ public final class ChatPanelInterfaceInteraction {
self.updateHistoryFilter = updateHistoryFilter
self.updateDisplayHistoryFilterAsList = updateDisplayHistoryFilterAsList
self.requestLayout = requestLayout

self.startTranscribingText = startTranscribingText
self.chatController = chatController
self.statuses = statuses
}
Expand Down Expand Up @@ -532,6 +534,7 @@ public final class ChatPanelInterfaceInteraction {
}, updateHistoryFilter: { _ in
}, updateDisplayHistoryFilterAsList: { _ in
}, requestLayout: { _ in
}, startTranscribingText: { _ in
}, chatController: {
return nil
}, statuses: nil)
Expand Down
4 changes: 4 additions & 0 deletions submodules/GalleryData/Sources/GalleryData.swift
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ public func chatMessageGalleryControllerData(context: AccountContext, chatLocati
}
}

if let attribute = message.attributes.first(where: { $0 is TextTranscriptionMessageAttribute }) as? TextTranscriptionMessageAttribute {
galleryMedia = attribute.file
}

var stream = false
var autoplayingVideo = false
var landscape = false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ private var declaredEncodables: Void = {
declareEncodable(SendAsMessageAttribute.self, f: { SendAsMessageAttribute(decoder: $0) })
declareEncodable(ForwardVideoTimestampAttribute.self, f: { ForwardVideoTimestampAttribute(decoder: $0) })
declareEncodable(AudioTranscriptionMessageAttribute.self, f: { AudioTranscriptionMessageAttribute(decoder: $0) })
declareEncodable(TextTranscriptionMessageAttribute.self, f: { TextTranscriptionMessageAttribute(decoder: $0) })
declareEncodable(NonPremiumMessageAttribute.self, f: { NonPremiumMessageAttribute(decoder: $0) })
declareEncodable(TelegramExtendedMedia.self, f: { TelegramExtendedMedia(decoder: $0) })
declareEncodable(TelegramPeerUsername.self, f: { TelegramPeerUsername(decoder: $0) })
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3926,6 +3926,9 @@ func replayFinalState(
updatedAttributes.append(translation)
}
}
if let transcription = previousMessage.attributes.first(where: { $0 is TextTranscriptionMessageAttribute }) as? TextTranscriptionMessageAttribute {
updatedAttributes.append(transcription)
}
}

if let previousFactCheckAttribute = previousMessage.attributes.first(where: { $0 is FactCheckMessageAttribute }) as? FactCheckMessageAttribute, let updatedFactCheckAttribute = message.attributes.first(where: { $0 is FactCheckMessageAttribute }) as? FactCheckMessageAttribute {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
//
// SyncCore_TextTranscriptionMessageAttribute.swift
// Telegram
//
// Created by Dmitry Bolonikov on 7.04.25.
//

import Postbox

/// Message attribute carrying the audio file attached to a message by the text transcription flow.
///
/// The attribute is persisted with `PostboxEncoder`/`PostboxDecoder` under the keys
/// `"id"`, `"visible"`, `"downloading"` and `"file"`.
public class TextTranscriptionMessageAttribute: MessageAttribute, Equatable {
    /// Identifier of this transcription (decoded as `0` when the key is absent).
    public let id: Int64
    /// Visibility flag (decoded as `false` when the key is absent).
    // NOTE(review): presumably controls whether the generated audio is shown in the bubble — confirm against the UI code.
    public let visible: Bool
    /// `true` while the audio is still pending, `false` once `file` refers to stored data.
    public let downloading: Bool
    /// Audio file associated with the message.
    public let file: TelegramMediaFile

    /// This attribute references no peers.
    public var associatedPeerIds: [PeerId] { [] }

    /// Creates an attribute from explicit field values.
    public init(id: Int64, visible: Bool, downloading: Bool, file: TelegramMediaFile) {
        self.id = id
        self.visible = visible
        self.downloading = downloading
        self.file = file
    }

    /// Restores the attribute from its persisted representation.
    ///
    /// Scalar fields fall back to `0` / `false` when missing.
    required public init(decoder: PostboxDecoder) {
        // `file` is non-optional, so the forced cast traps when no `TelegramMediaFile` is stored under "file".
        self.file = decoder.decodeObjectForKey("file") as! TelegramMediaFile
        self.downloading = decoder.decodeBoolForKey("downloading", orElse: false)
        self.visible = decoder.decodeBoolForKey("visible", orElse: false)
        self.id = decoder.decodeInt64ForKey("id", orElse: 0)
    }

    /// Writes every field under the same key that `init(decoder:)` reads.
    public func encode(_ encoder: PostboxEncoder) {
        encoder.encodeInt64(self.id, forKey: "id")
        encoder.encodeBool(self.visible, forKey: "visible")
        encoder.encodeBool(self.downloading, forKey: "downloading")
        encoder.encodeObject(self.file, forKey: "file")
    }

    /// Two attributes are equal when all four stored fields are equal.
    public static func ==(lhs: TextTranscriptionMessageAttribute, rhs: TextTranscriptionMessageAttribute) -> Bool {
        return lhs.id == rhs.id
            && lhs.visible == rhs.visible
            && lhs.file == rhs.file
            && lhs.downloading == rhs.downloading
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,10 @@ public extension TelegramEngine {
|> ignoreValues
}

/// Starts text transcription for the message identified by `messageId`.
///
/// Forwards to `_internal_transcribeText` using this engine's account postbox and network.
public func transcribeText(messageId: MessageId) -> Signal<EngineTextTranscriptionResult, NoError> {
    let account = self.account
    return _internal_transcribeText(postbox: account.postbox, network: account.network, messageId: messageId)
}

public func storeLocallyDerivedData(messageId: MessageId, data: [String: CodableEntry]) -> Signal<Never, NoError> {
return self.account.postbox.transaction { transaction -> Void in
transaction.updateMessage(messageId, update: { currentMessage in
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
//
// TextTranscription.swift
// Telegram
//
// Created by Dmitry Bolonikov on 7.04.25.
//

import Foundation
import Postbox
import SwiftSignalKit
import TelegramApi
import MtProtoKit

/// Progress values emitted by `_internal_transcribeText` to its subscribers.
public enum EngineTextTranscriptionResult {
/// Emitted after the in-progress placeholder attribute has been processed for the message.
case transcribing
/// Emitted for every other outcome, including failures and already-transcribed messages.
case finished
}

/// Intermediate outcomes passed from the transcription step to the step that persists them on the message.
private enum InternalTextTranscriptionResult {
/// The message already carried a `TextTranscriptionMessageAttribute` when it was read.
case alreadyTranscribed(TextTranscriptionMessageAttribute)
/// Transcription has begun; carries the placeholder attribute (`downloading: true`).
case startTranscribing(TextTranscriptionMessageAttribute)
/// Transcription completed; carries the final attribute (`downloading: false`).
case transcribed(TextTranscriptionMessageAttribute)
/// The message could not be found, or the audio data could not be loaded.
case error
}

/// Attaches an audio rendition to the message `messageId` and records the progress on the message itself.
///
/// Flow:
/// 1. The message is read from `postbox`. A missing message produces `.error`; a message that already
///    carries a `TextTranscriptionMessageAttribute` produces `.alreadyTranscribed`.
/// 2. Otherwise a placeholder attribute (`downloading: true`) is emitted, and after a 5 second delay the
///    bundled `TextToVoiceFeature.ogg` is loaded, stored in the media box and emitted as the final
///    attribute (`downloading: false`).
/// 3. Every intermediate result is applied to the message attributes in its own transaction. On `.error`
///    a placeholder that is still marked as downloading is removed, so the message does not stay in the
///    in-progress state and can be transcribed again.
///
/// - Parameters:
///   - postbox: Storage used to read/update the message and to store the audio data.
///   - network: Currently unused by this implementation.
///   - messageId: Identifier of the message to transcribe.
/// - Returns: A signal emitting `.transcribing` when the placeholder was processed and `.finished` for
///   every other outcome (including failures).
func _internal_transcribeText(postbox: Postbox, network: Network, messageId: MessageId) -> Signal<EngineTextTranscriptionResult, NoError> {
    return postbox.transaction { transaction -> Message? in
        transaction.getMessage(messageId)
    }
    |> mapToSignal { message -> Signal<InternalTextTranscriptionResult, NoError> in
        guard let message else {
            return .single(.error)
        }

        if let attribute = message.attributes.first(where: { $0 is TextTranscriptionMessageAttribute }) as? TextTranscriptionMessageAttribute {
            return .single(.alreadyTranscribed(attribute))
        }

        return Signal { subscriber in
            let fileId = Int64.random(in: Int64.min...Int64.max)
            let resource = LocalFileMediaResource(fileId: fileId)
            let mediaId = MediaId(namespace: Namespaces.Media.LocalFile, id: Int64.random(in: Int64.min...Int64.max))

            // Builds the attribute for the generated voice file; shared by the placeholder and the final result
            // so both describe the same media id and resource.
            let makeAttribute: (Int64, Data?, Bool) -> TextTranscriptionMessageAttribute = { size, waveform, downloading in
                let file = TelegramMediaFile(
                    fileId: mediaId,
                    partialReference: nil,
                    resource: resource,
                    previewRepresentations: [],
                    videoThumbnails: [],
                    immediateThumbnailData: nil,
                    mimeType: "audio/ogg",
                    size: size,
                    attributes: [.Audio(isVoice: true, duration: 0, title: nil, performer: nil, waveform: waveform)],
                    alternativeRepresentations: []
                )
                return TextTranscriptionMessageAttribute(
                    id: Int64.random(in: Int64.min...Int64.max),
                    visible: true,
                    downloading: downloading,
                    file: file
                )
            }

            // Placeholder: no data yet, so the size is a dummy value and there is no waveform.
            subscriber.putNext(.startTranscribing(makeAttribute(1, nil, true)))

            DispatchQueue.global(qos: .userInitiated).asyncAfter(deadline: .now() + 5) {
                guard let fileUrl = Bundle.main.url(forResource: "TextToVoiceFeature", withExtension: "ogg"),
                      let data = try? Data(contentsOf: fileUrl) else {
                    subscriber.putNext(.error)
                    subscriber.putCompletion()
                    return
                }

                postbox.mediaBox.storeResourceData(resource.id, data: data)

                // TODO: Fetch duration, waveform from response
                let waveformBase64 = "DAAOAAkACQAGAAwADwAMABAADQAPABsAGAALAA0AGAAfABoAHgATABgAGQAYABQADAAVABEAHwANAA0ACQAWABkACQAOAAwACQAfAAAAGQAVAAAAEwATAAAACAAfAAAAHAAAABwAHwAAABcAGQAAABQADgAAABQAHwAAAB8AHwAAAAwADwAAAB8AEwAAABoAFwAAAB8AFAAAAAAAHwAAAAAAHgAAAAAAHwAAAAAAHwAAAAAAHwAAAAAAHwAAAAAAHwAAAAAAAAA="

                // The waveform is optional, so an undecodable string degrades to "no waveform" instead of trapping.
                let waveform = Data(base64Encoded: waveformBase64)

                subscriber.putNext(.transcribed(makeAttribute(Int64(data.count), waveform, false)))
                subscriber.putCompletion()
            }

            // The delayed work is intentionally not cancellable: once the placeholder has been emitted,
            // a follow-up result is needed to replace or remove it.
            return EmptyDisposable
        }
    }
    |> mapToSignal { result -> Signal<EngineTextTranscriptionResult, NoError> in
        return postbox.transaction { transaction -> EngineTextTranscriptionResult in
            transaction.updateMessage(messageId, update: { currentMessage in
                // Start from the attributes without any previous transcription attribute.
                var attributes = currentMessage.attributes.filter { !($0 is TextTranscriptionMessageAttribute) }

                switch result {
                case .startTranscribing(let attribute), .transcribed(let attribute):
                    attributes.append(attribute)
                case .alreadyTranscribed(let attribute):
                    let updatedAttribute = TextTranscriptionMessageAttribute(id: attribute.id, visible: true, downloading: attribute.downloading, file: attribute.file)
                    guard updatedAttribute != attribute else {
                        return .skip
                    }
                    attributes.append(updatedAttribute)
                case .error:
                    // A failure after `.startTranscribing` would otherwise leave the placeholder on the message,
                    // showing it as downloading indefinitely and blocking later attempts.
                    let hasPendingPlaceholder = currentMessage.attributes.contains(where: { attribute in
                        (attribute as? TextTranscriptionMessageAttribute)?.downloading == true
                    })
                    guard hasPendingPlaceholder else {
                        return .skip
                    }
                }

                let storeForwardInfo = currentMessage.forwardInfo.flatMap(StoreMessageForwardInfo.init)

                return .update(StoreMessage(
                    id: currentMessage.id,
                    globallyUniqueId: currentMessage.globallyUniqueId,
                    groupingKey: currentMessage.groupingKey,
                    threadId: currentMessage.threadId,
                    timestamp: currentMessage.timestamp,
                    flags: StoreMessageFlags(currentMessage.flags),
                    tags: currentMessage.tags,
                    globalTags: currentMessage.globalTags,
                    localTags: currentMessage.localTags,
                    forwardInfo: storeForwardInfo,
                    authorId: currentMessage.author?.id,
                    text: currentMessage.text,
                    attributes: attributes,
                    media: currentMessage.media))
            })

            switch result {
            case .startTranscribing:
                return .transcribing
            case .alreadyTranscribed, .transcribed, .error:
                return .finished
            }
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ public final class ChatMessageInteractiveFileNode: ASDisplayNode {
public let layoutConstants: ChatMessageItemLayoutConstants
public let constrainedSize: CGSize
public let controllerInteraction: ChatControllerInteraction
public let alwaysDisplayTranscriptionButton: Bool
public let transcriptionState: AudioTranscriptionButtonComponent.TranscriptionState?
public let transcriptionButtonTapped: (() -> Void)?

public init(
context: AccountContext,
Expand All @@ -88,7 +91,10 @@ public final class ChatMessageInteractiveFileNode: ASDisplayNode {
isAttachedContentBlock: Bool,
layoutConstants: ChatMessageItemLayoutConstants,
constrainedSize: CGSize,
controllerInteraction: ChatControllerInteraction
controllerInteraction: ChatControllerInteraction,
alwaysDisplayTranscriptionButton: Bool = false,
transcriptionState: AudioTranscriptionButtonComponent.TranscriptionState? = nil,
transcriptionButtonTapped: (() -> Void)? = nil
) {
self.context = context
self.presentationData = presentationData
Expand All @@ -112,6 +118,9 @@ public final class ChatMessageInteractiveFileNode: ASDisplayNode {
self.layoutConstants = layoutConstants
self.constrainedSize = constrainedSize
self.controllerInteraction = controllerInteraction
self.alwaysDisplayTranscriptionButton = alwaysDisplayTranscriptionButton
self.transcriptionState = transcriptionState
self.transcriptionButtonTapped = transcriptionButtonTapped
}
}

Expand Down Expand Up @@ -789,6 +798,10 @@ public final class ChatMessageInteractiveFileNode: ASDisplayNode {
}
}

if arguments.alwaysDisplayTranscriptionButton {
displayTranscribe = true
}

let transcribedText = forcedAudioTranscriptionText ?? transcribedText(message: arguments.message)

switch audioTranscriptionState {
Expand All @@ -805,7 +818,7 @@ public final class ChatMessageInteractiveFileNode: ASDisplayNode {
updatedAudioTranscriptionState = .locked
}

let effectiveAudioTranscriptionState = updatedAudioTranscriptionState ?? audioTranscriptionState
let effectiveAudioTranscriptionState = updatedAudioTranscriptionState ?? arguments.transcriptionState ?? audioTranscriptionState

var displayTrailingAnimatedDots = false

Expand Down Expand Up @@ -1356,7 +1369,11 @@ public final class ChatMessageInteractiveFileNode: ASDisplayNode {
guard let strongSelf = self else {
return
}
strongSelf.transcribe()
if let action = arguments.transcriptionButtonTapped {
action()
} else {
strongSelf.transcribe()
}
}
)),
environment: {},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ swift_library(
"//submodules/TelegramUI/Components/Chat/ShimmeringLinkNode",
"//submodules/TelegramUI/Components/Chat/ChatMessageItemCommon",
"//submodules/TelegramUI/Components/Chat/MessageQuoteComponent",
"//submodules/TelegramUI/Components/Chat/ChatMessageInteractiveFileNode",
"//submodules/TelegramUI/Components/TextLoadingEffect",
"//submodules/TelegramUI/Components/ChatControllerInteraction",
"//submodules/TelegramUI/Components/InteractiveTextComponent",
Expand Down
Loading