Skip to content

Commit e80da05

Browse files
refactor(speech): (set-5) Simplified next set of Speech Samples (#12533)
* The last set of refactored speech samples * Minor changes in naming * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 8bb38a1 commit e80da05

13 files changed

+139
-172
lines changed

speech/snippets/transcribe_streaming.py

Lines changed: 8 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -13,28 +13,26 @@
1313
# limitations under the License.
1414

1515
"""Google Cloud Speech API sample application using the streaming API.
16-
17-
Example usage:
18-
python transcribe_streaming.py resources/audio.raw
1916
"""
2017

21-
import argparse
22-
2318
from google.cloud import speech
2419

2520

2621
# [START speech_transcribe_streaming]
2722
def transcribe_streaming(stream_file: str) -> speech.RecognitionConfig:
28-
"""Streams transcription of the given audio file."""
29-
23+
"""Streams transcription of the given audio file using Google Cloud Speech-to-Text API.
24+
Args:
25+
stream_file (str): Path to the local audio file to be transcribed.
26+
Example: "resources/audio.raw"
27+
"""
3028
client = speech.SpeechClient()
3129

3230
# [START speech_python_migration_streaming_request]
3331
with open(stream_file, "rb") as audio_file:
34-
content = audio_file.read()
32+
audio_content = audio_file.read()
3533

3634
# In practice, stream should be a generator yielding chunks of audio data.
37-
stream = [content]
35+
stream = [audio_content]
3836

3937
requests = (
4038
speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in stream
@@ -76,9 +74,4 @@ def transcribe_streaming(stream_file: str) -> speech.RecognitionConfig:
7674

7775

7876
if __name__ == "__main__":
79-
parser = argparse.ArgumentParser(
80-
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
81-
)
82-
parser.add_argument("stream", help="File to stream to the API")
83-
args = parser.parse_args()
84-
transcribe_streaming(args.stream)
77+
transcribe_streaming("resources/audio.raw")

speech/snippets/transcribe_streaming_v2.py

Lines changed: 17 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -12,39 +12,38 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
16-
import argparse
17-
1815
# [START speech_transcribe_streaming_v2]
16+
import os
17+
1918
from google.cloud.speech_v2 import SpeechClient
2019
from google.cloud.speech_v2.types import cloud_speech as cloud_speech_types
2120

21+
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
22+
2223

2324
def transcribe_streaming_v2(
24-
project_id: str,
25-
audio_file: str,
25+
stream_file: str,
2626
) -> cloud_speech_types.StreamingRecognizeResponse:
27-
"""Transcribes audio from audio file stream.
28-
27+
"""Transcribes audio from an audio file stream using Google Cloud Speech-to-Text API.
2928
Args:
30-
project_id: The GCP project ID.
31-
audio_file: The path to the audio file to transcribe.
32-
29+
stream_file (str): Path to the local audio file to be transcribed.
30+
Example: "resources/audio.wav"
3331
Returns:
34-
The response from the transcribe method.
32+
list[cloud_speech_types.StreamingRecognizeResponse]: A list of objects.
33+
Each response includes the transcription results for the corresponding audio segment.
3534
"""
3635
# Instantiates a client
3736
client = SpeechClient()
3837

3938
# Reads a file as bytes
40-
with open(audio_file, "rb") as f:
41-
content = f.read()
39+
with open(stream_file, "rb") as f:
40+
audio_content = f.read()
4241

4342
# In practice, stream should be a generator yielding chunks of audio data
44-
chunk_length = len(content) // 5
43+
chunk_length = len(audio_content) // 5
4544
stream = [
46-
content[start : start + chunk_length]
47-
for start in range(0, len(content), chunk_length)
45+
audio_content[start : start + chunk_length]
46+
for start in range(0, len(audio_content), chunk_length)
4847
]
4948
audio_requests = (
5049
cloud_speech_types.StreamingRecognizeRequest(audio=audio) for audio in stream
@@ -59,7 +58,7 @@ def transcribe_streaming_v2(
5958
config=recognition_config
6059
)
6160
config_request = cloud_speech_types.StreamingRecognizeRequest(
62-
recognizer=f"projects/{project_id}/locations/global/recognizers/_",
61+
recognizer=f"projects/{PROJECT_ID}/locations/global/recognizers/_",
6362
streaming_config=streaming_config,
6463
)
6564

@@ -84,10 +83,4 @@ def requests(config: cloud_speech_types.RecognitionConfig, audio: list) -> list:
8483

8584

8685
if __name__ == "__main__":
87-
parser = argparse.ArgumentParser(
88-
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
89-
)
90-
parser.add_argument("project_id", help="GCP Project ID")
91-
parser.add_argument("audio_file", help="Audio file to stream")
92-
args = parser.parse_args()
93-
transcribe_streaming_v2(args.project_id, args.audio_file)
86+
transcribe_streaming_v2("resources/audio.wav")

speech/snippets/transcribe_streaming_v2_test.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -25,10 +25,8 @@
2525

2626
@Retry()
2727
def test_transcribe_streaming_v2(capsys: pytest.CaptureFixture) -> None:
28-
project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
29-
3028
responses = transcribe_streaming_v2.transcribe_streaming_v2(
31-
project_id, os.path.join(_RESOURCES, "audio.wav")
29+
os.path.join(_RESOURCES, "audio.wav")
3230
)
3331

3432
transcript = ""

speech/snippets/transcribe_streaming_voice_activity_events.py

Lines changed: 17 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -12,38 +12,38 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
16-
import argparse
17-
1815
# [START speech_transcribe_streaming_voice_activity_events]
16+
import os
17+
1918
from google.cloud.speech_v2 import SpeechClient
2019
from google.cloud.speech_v2.types import cloud_speech
2120

21+
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
22+
2223

2324
def transcribe_streaming_voice_activity_events(
24-
project_id: str, audio_file: str
25+
audio_file: str,
2526
) -> cloud_speech.StreamingRecognizeResponse:
26-
"""Transcribes audio from a file into text.
27-
27+
"""Transcribes audio from a file into text and detects voice activity
28+
events using Google Cloud Speech-to-Text API.
2829
Args:
29-
project_id: The GCP project ID to use.
30-
audio_file: The path to the audio file to transcribe.
31-
30+
audio_file (str): Path to the local audio file to be transcribed.
31+
Example: "resources/audio.wav"
3232
Returns:
33-
The streaming response containing the transcript.
33+
list[cloud_speech.StreamingRecognizeResponse]: A list of `StreamingRecognizeResponse` objects.
3434
"""
3535
# Instantiates a client
3636
client = SpeechClient()
3737

3838
# Reads a file as bytes
39-
with open(audio_file, "rb") as f:
40-
content = f.read()
39+
with open(audio_file, "rb") as file:
40+
audio_content = file.read()
4141

4242
# In practice, stream should be a generator yielding chunks of audio data
43-
chunk_length = len(content) // 5
43+
chunk_length = len(audio_content) // 5
4444
stream = [
45-
content[start : start + chunk_length]
46-
for start in range(0, len(content), chunk_length)
45+
audio_content[start : start + chunk_length]
46+
for start in range(0, len(audio_content), chunk_length)
4747
]
4848
audio_requests = (
4949
cloud_speech.StreamingRecognizeRequest(audio=audio) for audio in stream
@@ -64,7 +64,7 @@ def transcribe_streaming_voice_activity_events(
6464
)
6565

6666
config_request = cloud_speech.StreamingRecognizeRequest(
67-
recognizer=f"projects/{project_id}/locations/global/recognizers/_",
67+
recognizer=f"projects/{PROJECT_ID}/locations/global/recognizers/_",
6868
streaming_config=streaming_config,
6969
)
7070

@@ -99,10 +99,4 @@ def requests(config: cloud_speech.RecognitionConfig, audio: list) -> list:
9999

100100

101101
if __name__ == "__main__":
102-
parser = argparse.ArgumentParser(
103-
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
104-
)
105-
parser.add_argument("project_id", help="GCP Project ID")
106-
parser.add_argument("audio_file", help="Audio file to stream")
107-
args = parser.parse_args()
108-
transcribe_streaming_voice_activity_events(args.project_id, args.audio_file)
102+
transcribe_streaming_voice_activity_events("resources/audio.wav")

speech/snippets/transcribe_streaming_voice_activity_events_test.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,10 +28,8 @@
2828
def test_transcribe_streaming_voice_activity_events(
2929
capsys: pytest.CaptureFixture,
3030
) -> None:
31-
project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
32-
3331
responses = transcribe_streaming_voice_activity_events.transcribe_streaming_voice_activity_events(
34-
project_id, os.path.join(_RESOURCES, "audio.wav")
32+
os.path.join(_RESOURCES, "audio.wav")
3533
)
3634

3735
transcript = ""

speech/snippets/transcribe_streaming_voice_activity_timeouts.py

Lines changed: 22 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -12,46 +12,43 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
16-
import argparse
17-
1815
# [START speech_transcribe_streaming_voice_activity_timeouts]
16+
import os
1917
from time import sleep
2018

2119
from google.cloud.speech_v2 import SpeechClient
2220
from google.cloud.speech_v2.types import cloud_speech
2321
from google.protobuf import duration_pb2 # type: ignore
2422

23+
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
24+
2525

2626
def transcribe_streaming_voice_activity_timeouts(
27-
project_id: str,
2827
speech_start_timeout: int,
2928
speech_end_timeout: int,
3029
audio_file: str,
3130
) -> cloud_speech.StreamingRecognizeResponse:
3231
"""Transcribes audio from audio file to text.
33-
3432
Args:
35-
project_id: The GCP project ID to use.
3633
speech_start_timeout: The timeout in seconds for speech start.
3734
speech_end_timeout: The timeout in seconds for speech end.
38-
audio_file: The audio file to transcribe.
39-
35+
audio_file: Path to the local audio file to be transcribed.
36+
Example: "resources/audio_silence_padding.wav"
4037
Returns:
4138
The streaming response containing the transcript.
4239
"""
4340
# Instantiates a client
4441
client = SpeechClient()
4542

4643
# Reads a file as bytes
47-
with open(audio_file, "rb") as f:
48-
content = f.read()
44+
with open(audio_file, "rb") as file:
45+
audio_content = file.read()
4946

5047
# In practice, stream should be a generator yielding chunks of audio data
51-
chunk_length = len(content) // 20
48+
chunk_length = len(audio_content) // 20
5249
stream = [
53-
content[start : start + chunk_length]
54-
for start in range(0, len(content), chunk_length)
50+
audio_content[start : start + chunk_length]
51+
for start in range(0, len(audio_content), chunk_length)
5552
]
5653
audio_requests = (
5754
cloud_speech.StreamingRecognizeRequest(audio=audio) for audio in stream
@@ -81,7 +78,7 @@ def transcribe_streaming_voice_activity_timeouts(
8178
)
8279

8380
config_request = cloud_speech.StreamingRecognizeRequest(
84-
recognizer=f"projects/{project_id}/locations/global/recognizers/_",
81+
recognizer=f"projects/{PROJECT_ID}/locations/global/recognizers/_",
8582
streaming_config=streaming_config,
8683
)
8784

@@ -119,19 +116,16 @@ def requests(config: cloud_speech.RecognitionConfig, audio: list) -> list:
119116

120117

121118
if __name__ == "__main__":
122-
parser = argparse.ArgumentParser(
123-
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
124-
)
125-
parser.add_argument("project_id", help="GCP Project ID")
126-
parser.add_argument(
127-
"speech_start_timeout", help="Timeout in seconds for speech start"
128-
)
129-
parser.add_argument("speech_end_timeout", help="Timeout in seconds for speech end")
130-
parser.add_argument("audio_file", help="Audio file to stream")
131-
args = parser.parse_args()
119+
# Define the timeout duration for detecting the start of speech
120+
# In this case this means the function will wait for up to 5 seconds to determine if speech has started
121+
# before it begins processing the audio stream.
122+
speech_start_timeout = 5
123+
# Define the timeout duration for detecting the end of speech
124+
# This indicates that the function will continue to listen for up to 1 second
125+
# after the last detected speech segment to determine if speech has ended.
126+
speech_end_timeout = 1
132127
transcribe_streaming_voice_activity_timeouts(
133-
args.project_id,
134-
args.speech_start_timeout,
135-
args.speech_end_timeout,
136-
args.audio_file,
128+
speech_start_timeout=speech_start_timeout,
129+
speech_end_timeout=speech_end_timeout,
130+
audio_file="resources/audio_silence_padding.wav",
137131
)

speech/snippets/transcribe_streaming_voice_activity_timeouts_test.py

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -27,10 +27,7 @@
2727

2828
@flaky(max_runs=3, min_passes=1)
2929
def test_transcribe_silence_padding_timeouts(capsys: pytest.CaptureFixture) -> None:
30-
project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
31-
3230
responses = transcribe_streaming_voice_activity_timeouts.transcribe_streaming_voice_activity_timeouts(
33-
project_id,
3431
1,
3532
5,
3633
os.path.join(_RESOURCES, "audio_silence_padding.wav"),
@@ -44,10 +41,7 @@ def test_transcribe_silence_padding_timeouts(capsys: pytest.CaptureFixture) -> N
4441
def test_transcribe_streaming_voice_activity_timeouts(
4542
capsys: pytest.CaptureFixture,
4643
) -> None:
47-
project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
48-
4944
responses = transcribe_streaming_voice_activity_timeouts.transcribe_streaming_voice_activity_timeouts(
50-
project_id,
5145
5,
5246
1,
5347
os.path.join(_RESOURCES, "audio_silence_padding.wav"),

speech/snippets/transcribe_word_level_confidence_gcs_beta.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -17,14 +17,13 @@
1717
from google.cloud import speech_v1p1beta1 as speech
1818

1919

20-
def transcribe_file_with_word_level_confidence(gcs_uri: str) -> str:
20+
def transcribe_file_with_word_level_confidence(audio_uri: str) -> str:
2121
"""Transcribe a remote audio file with word level confidence.
22-
2322
Args:
24-
gcs_uri: The Google Cloud Storage path to an audio file.
25-
23+
audio_uri (str): The Cloud Storage URI of the input audio.
24+
E.g., gs://[BUCKET]/[FILE]
2625
Returns:
27-
The generated transcript from the audio file provided.
26+
The generated transcript from the audio file provided with word level confidence.
2827
"""
2928

3029
client = speech.SpeechClient()
@@ -34,11 +33,11 @@ def transcribe_file_with_word_level_confidence(gcs_uri: str) -> str:
3433
encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
3534
sample_rate_hertz=44100,
3635
language_code="en-US",
37-
enable_word_confidence=True,
36+
enable_word_confidence=True, # Enable word level confidence
3837
)
3938

4039
# Set the remote path for the audio file
41-
audio = speech.RecognitionAudio(uri=gcs_uri)
40+
audio = speech.RecognitionAudio(uri=audio_uri)
4241

4342
# Use non-blocking call for getting file transcription
4443
response = client.long_running_recognize(config=config, audio=audio).result(
@@ -64,3 +63,8 @@ def transcribe_file_with_word_level_confidence(gcs_uri: str) -> str:
6463

6564

6665
# [END speech_transcribe_word_level_confidence_gcs_beta]
66+
67+
if __name__ == "__main__":
68+
transcribe_file_with_word_level_confidence(
69+
"gs://cloud-samples-data/speech/brooklyn_bridge.flac"
70+
)

0 commit comments

Comments (0)