Skip to content

Commit de54ad5

Browse files
committed
Updated and simplified for new hw config
1 parent 81b2f97 commit de54ad5

File tree

1 file changed

+92
-94
lines changed

1 file changed

+92
-94
lines changed

ChatGPT_Voice_Assistant/assistant.py

Lines changed: 92 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,44 @@
66

77
import argparse
import os
import random
import subprocess
import time
from datetime import datetime, timedelta
from queue import Queue
from tempfile import NamedTemporaryFile

import openai
import speech_recognition as sr

19-
import board
import digitalio
from adafruit_motorkit import MotorKit

# SECURITY: a previous revision hard-coded an OpenAI API key here. Any key
# committed to source control is compromised and must be rotated. Read the
# key from the environment instead; openai will raise a clear auth error at
# call time if it is missing.
openai.api_key = os.environ.get("OPENAI_API_KEY")

SYSTEM_ROLE = (
    "You are a helpful voice assistant that answers questions and gives information"
)
CHATGPT_MODEL = "gpt-3.5-turbo"
WHISPER_MODEL = "whisper-1"
ARM_MOVEMENT_TIME = 0.5
BASE_MOUTH_DURATION = 0.2  # A higher number means slower mouth movement
SPEECH_VARIANCE = 0.03  # A higher number means more variance in the mouth movement
RECORD_TIMEOUT = 30

# Setup Motors
kit = MotorKit(i2c=board.I2C())
arms_motor = kit.motor1
mouth_motor = kit.motor2

# Setup Foot Button (active-low: pulled up, pressed reads False)
foot_button = digitalio.DigitalInOut(board.D16)
foot_button.direction = digitalio.Direction.INPUT
foot_button.pull = digitalio.Pull.UP
2843

2944
def sendchat(prompt):
3045
completion = openai.ChatCompletion.create(
31-
model="gpt-3.5-turbo",
46+
model=CHATGPT_MODEL,
3247
messages=[
3348
{"role": "system", "content": SYSTEM_ROLE},
3449
{"role": "user", "content": prompt},
@@ -37,84 +52,80 @@ def sendchat(prompt):
3752
# Send the heard text to ChatGPT and return the result
3853
return completion.choices[0].message.content
3954

55+
def move_arms_motor(dir_up=True, speed=1.0):
    """Pulse the arm motor for ARM_MOVEMENT_TIME seconds, then stop.

    A positive throttle raises the arms; a negative one lowers them.
    """
    arms_motor.throttle = speed if dir_up else -speed
    time.sleep(ARM_MOVEMENT_TIME)
    arms_motor.throttle = 0
60+
61+
def move_mouth_motor(dir_open=True, duration=0.5, speed=1.0):
    """Drive the mouth motor for *duration* seconds, then stop.

    Positive throttle opens the mouth, negative closes it.
    """
    mouth_motor.throttle = speed if dir_open else -speed
    time.sleep(duration)
    mouth_motor.throttle = 0
66+
67+
def move_mouth():
    """Perform one chatter cycle: open then close the mouth, each for a
    slightly randomized duration so the movement looks natural."""
    for opening in (True, False):
        move_mouth_motor(dir_open=opening, duration=random_mouth_duration())
70+
71+
def random_mouth_duration():
    """Return BASE_MOUTH_DURATION jittered uniformly by +/- SPEECH_VARIANCE / 2."""
    jitter = random.random() * SPEECH_VARIANCE - (SPEECH_VARIANCE / 2)
    return BASE_MOUTH_DURATION + jitter
73+
74+
def move_arms(hide=True):
    """Lower the arms when *hide* is True, raise them otherwise."""
    move_arms_motor(dir_up=not hide)
76+
77+
def speak(text):
    """Speak *text* aloud via espeak-ng, flapping the mouth while it runs.

    Fix: the previous argument list included a literal "&". With subprocess
    (no shell involved) "&" is not a background operator -- it was passed to
    espeak-ng as extra text to speak. Popen already runs the process
    asynchronously, so the token is simply dropped.
    """
    with subprocess.Popen(["espeak-ng", text]) as proc:
        # Keep the mouth moving for as long as the synthesizer is alive.
        while proc.poll() is None:
            move_mouth()
4082

4183
def transcribe(wav_data):
    """Send recorded WAV bytes to the Whisper API and return the text.

    Announces that it is thinking, hides the arms, then retries up to three
    times on transient API errors (3 s back-off between attempts). If all
    attempts fail, returns a canned apology string instead of raising.
    """
    print("Transcribing...")
    speak("Let me think about that")
    move_arms(hide=True)
    # NOTE(review): Audio.translate_raw translates speech into English rather
    # than transcribing it verbatim -- confirm that is intended here.
    for _ in range(3):
        try:
            with NamedTemporaryFile(suffix=".wav") as temp_file:
                result = openai.Audio.translate_raw(WHISPER_MODEL, wav_data, temp_file.name)
                return result["text"].strip()
        except (openai.error.ServiceUnavailableError, openai.error.APIError):
            time.sleep(3)  # brief back-off before the next attempt
    return "I wasn't able to understand you. Please repeat that."
48101

49102
class Listener:
50103
def __init__(
    self, energy_threshold, phrase_timeout
):
    """Initialize the Listener (method of the Listener class).

    energy_threshold: mic energy level above which audio counts as speech.
    phrase_timeout: seconds of silence treated as the end of a phrase.
    """
    self.listener_handle = None
    # Recognizer with a fixed (non-adaptive) energy threshold and a 1 s
    # pause threshold for detecting the end of an utterance.
    self.recognizer = sr.Recognizer()
    self.recognizer.energy_threshold = energy_threshold
    self.recognizer.dynamic_energy_threshold = False
    self.recognizer.pause_threshold = 1
    # Rolling state for assembling phrases from raw audio chunks.
    self.phrase_time = datetime.utcnow()
    self.phrase_timeout = phrase_timeout
    self.phrase_complete = False
    self.last_sample = b""
    # Thread safe Queue for passing data from the threaded recording callback.
    self.data_queue = Queue()
    self.mic_dev_index = None
93118

94119
def listen(self):
    """Record one utterance from the default microphone and queue its bytes.

    Does nothing when a background listener handle is already active.
    Blocks for up to RECORD_TIMEOUT seconds waiting for speech.

    Fix: removed the leftover debug `print(source.stream)` that polluted
    stdout on every recording.
    """
    if self.listener_handle:
        return
    with sr.Microphone() as source:
        self.recognizer.adjust_for_ambient_noise(source)
        audio = self.recognizer.listen(source, timeout=RECORD_TIMEOUT)
    data = audio.get_raw_data()
    self.data_queue.put(data)
100127

101-
def start(self):
102-
if not self.listener_handle:
103-
self.listener_handle = self.recorder.listen_in_background(
104-
self._get_microphone(),
105-
self.record_callback,
106-
phrase_time_limit=self.record_timeout,
107-
)
108-
109-
def stop(self, wait_for_stop: bool = False):
110-
self.listener_handle(wait_for_stop=wait_for_stop)
111-
self.listener_handle = None
112-
113128
def record_callback(self, _, audio: sr.AudioData) -> None:
    """Threaded callback invoked when a background recording finishes.

    Pushes the recording's raw bytes onto the thread-safe queue; the unused
    first parameter is the recognizer instance supplied by the library.
    """
    # Grab the raw bytes and push it into the thread safe queue.
    self.data_queue.put(audio.get_raw_data())
@@ -143,14 +154,14 @@ def get_audio_data(self):
143154
data = self.get_speech()
144155
self.last_sample += data
145156

146-
source = self._get_microphone()
147-
148157
# Use AudioData to convert the raw data to wav data.
149-
return sr.AudioData(
150-
self.last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH
151-
)
152-
return None
158+
with sr.Microphone() as source:
159+
audio_data = sr.AudioData(
160+
self.last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH
161+
)
162+
return audio_data
153163

164+
return None
154165

155166
def main():
156167
parser = argparse.ArgumentParser()
@@ -160,44 +171,32 @@ def main():
160171
help="Energy level for mic to detect.",
161172
type=int,
162173
)
163-
parser.add_argument(
164-
"--record_timeout",
165-
default=2,
166-
help="How real time the recording is in seconds.",
167-
type=float,
168-
)
169174
parser.add_argument(
170175
"--phrase_timeout",
171176
default=3,
172177
help="How much empty space between recordings before we "
173178
"consider it a new line in the transcription.",
174179
type=float,
175180
)
176-
parser.add_argument(
177-
"--default_microphone",
178-
default="pulse",
179-
help="Default microphone name for SpeechRecognition. "
180-
"Run this with 'list' to view available Microphones.",
181-
type=str,
182-
)
181+
183182
args = parser.parse_args()
184183

185184
listener = Listener(
186-
args.default_microphone,
187-
args.record_timeout,
188185
args.energy_threshold,
189186
args.phrase_timeout,
190187
)
191188

192189
transcription = [""]
193190

194-
print("How may I help you?")
195-
speak("How may I help you?")
196-
197191
while True:
198192
try:
193+
# If button is pressed, start listening
194+
if not foot_button.value:
195+
print("How may I help you?")
196+
speak("How may I help you?")
197+
listener.listen()
198+
199199
# Pull raw recorded audio from the queue.
200-
listener.listen()
201200
if listener.speech_waiting():
202201
audio_data = listener.get_audio_data()
203202
text = transcribe(audio_data.get_wav_data())
@@ -209,6 +208,7 @@ def main():
209208
chat_response = sendchat(text)
210209
transcription.append(f"> {chat_response}")
211210
print("Got response from ChatGPT. Beginning speech synthesis.")
211+
move_arms(hide=False)
212212
speak(chat_response)
213213
print("Done speaking.")
214214
else:
@@ -219,12 +219,10 @@ def main():
219219
for line in transcription:
220220
print(line)
221221
print("", end="", flush=True)
222-
sleep(0.25)
223-
except (AssertionError, AttributeError):
224-
pass
222+
time.sleep(0.25)
225223
except KeyboardInterrupt:
226224
break
227-
225+
move_arms(hide=False)
228226
print("\n\nTranscription:")
229227
for line in transcription:
230228
print(line)

0 commit comments

Comments
 (0)