Rewrote listener for Magic Storybook and added desktop icon

makermelissa · makermelissa · commit 494cf6ba37fe · 2023-05-19T10:48:56.000-07:00
diff --git a/Magic_AI_Storybook/images/magic_book_icon.png b/Magic_AI_Storybook/images/magic_book_icon.png
diff --git a/Magic_AI_Storybook/listener.py b/Magic_AI_Storybook/listener.py
@@ -2,56 +2,40 @@
 #
 # SPDX-License-Identifier: MIT
 
-from datetime import datetime, timedelta
-from queue import Queue
+import time
 
 import speech_recognition as sr
 
 
 class Listener:
-    def __init__(self, energy_threshold=1000, phrase_timeout=3.0, record_timeout=30):
+    def __init__(self, api_key, energy_threshold=300, record_timeout=30):
         self.listener_handle = None
+        self.microphone = sr.Microphone()
         self.recognizer = sr.Recognizer()
         self.recognizer.energy_threshold = energy_threshold
-        self.recognizer.dynamic_energy_threshold = False
-        self.recognizer.pause_threshold = 1
-        self.last_sample = bytes()
-        self.phrase_time = datetime.utcnow()
-        self.phrase_timeout = phrase_timeout
+        with self.microphone as source:
+            self.recognizer.adjust_for_ambient_noise(source)  # we only need to calibrate once, before we start listening
         self.record_timeout = record_timeout
-        self.phrase_complete = False
-        # Thread safe Queue for passing data from the threaded recording callback.
-        self.data_queue = Queue()
-        self.mic_dev_index = None
+        self.listener_handle = None
+        self.audio = None
+        self.api_key = api_key
 
     def listen(self, ready_callback=None):
-        self.phrase_complete = False
-        start = datetime.utcnow()
-        self.start_listening()
+        self._start_listening()
         if ready_callback:
             ready_callback()
         while (
             self.listener_handle
-            and not self.speech_waiting()
-            or not self.phrase_complete
+            and self.audio is None
         ):
-            if self.phrase_time and start - self.phrase_time > timedelta(
-                seconds=self.phrase_timeout
-            ):
-                self.last_sample = bytes()
-                self.phrase_complete = True
-            self.phrase_time = start
+            time.sleep(0.1)
         self.stop_listening()
 
-    def start_listening(self):
-        if not self.listener_handle:
-            with sr.Microphone() as source:
-                self.recognizer.adjust_for_ambient_noise(source)
-            self.listener_handle = self.recognizer.listen_in_background(
-                sr.Microphone(),
-                self.record_callback,
-                phrase_time_limit=self.record_timeout,
-            )
+    def _save_audio_callback(self, _recognizer, audio):
+        self.audio = audio
+
+    def _start_listening(self):
+        self.listener_handle = self.recognizer.listen_in_background(self.microphone, self._save_audio_callback)
 
     def stop_listening(self, wait_for_stop=False):
         if self.listener_handle:
@@ -61,40 +45,24 @@ def stop_listening(self, wait_for_stop=False):
     def is_listening(self):
         return self.listener_handle is not None
 
-    def record_callback(self, _, audio: sr.AudioData) -> None:
-        # Grab the raw bytes and push it into the thread safe queue.
-        data = audio.get_raw_data()
-        self.data_queue.put(data)
-
     def speech_waiting(self):
-        return not self.data_queue.empty()
-
-    def get_speech(self):
-        if self.speech_waiting():
-            return self.data_queue.get()
-        return None
-
-    def get_audio_data(self):
-        now = datetime.utcnow()
-        if self.speech_waiting():
-            self.phrase_complete = False
-            if self.phrase_time and now - self.phrase_time > timedelta(
-                seconds=self.phrase_timeout
-            ):
-                self.last_sample = bytes()
-                self.phrase_complete = True
-            self.phrase_time = now
-
-            # Concatenate our current audio data with the latest audio data.
-            while self.speech_waiting():
-                data = self.get_speech()
-                self.last_sample += data
+        return self.audio is not None
 
-            # Use AudioData to convert the raw data to wav data.
-            with sr.Microphone() as source:
-                audio_data = sr.AudioData(
-                    self.last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH
-                )
-            return audio_data
+    def recognize(self):
+        if self.audio:
+            # Transcribe the audio data to text using Whisper
+            print("Recognizing...")
+            attempts = 0
+            while attempts < 3:
+                try:
+                    result = self.recognizer.recognize_whisper_api(
+                        self.audio, api_key=self.api_key
+                    )
 
-        return None
+                    return result.strip()
+                except sr.RequestError as e:
+                    time.sleep(3)
+                attempts += 1
+                print("I wasn't able to understand you. Please repeat that.")
+            return None
+        return None
diff --git a/Magic_AI_Storybook/story.py b/Magic_AI_Storybook/story.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2023 Melissa LeBlanc-Williams for Adafruit Industries
 #
 # SPDX-License-Identifier: MIT
+# Desktop icon: "Book icons created by Freepik - Flaticon" (https://www.flaticon.com/free-icons/book)
 
 import threading
 import sys
@@ -26,7 +27,7 @@
 STORY_WORD_LENGTH = 800
 REED_SWITCH_PIN = board.D17
 NEOPIXEL_PIN = board.D18
-API_KEYS_FILE = "/home/pi/keys.txt"
+API_KEYS_FILE = "~/keys.txt"
 PROMPT_FILE = "/boot/bookprompt.txt"
 
 # Neopixel Settings
@@ -47,8 +48,11 @@
 BUTTON_NEW_IMAGE = "button_new.png"
 
 # Asset Paths
-IMAGES_PATH = os.path.dirname(sys.argv[0]) + "images/"
-FONTS_PATH = os.path.dirname(sys.argv[0]) + "fonts/"
+BASE_PATH = os.path.dirname(sys.argv[0])
+if BASE_PATH != "":
+    BASE_PATH += "/"
+IMAGES_PATH = BASE_PATH + "images/"
+FONTS_PATH = BASE_PATH + "fonts/"
 
 # Font Path, Size
 TITLE_FONT = (FONTS_PATH + "Desdemona Black Regular.otf", 48)
@@ -58,7 +62,7 @@
 
 # Delays to control the speed of the text
 WORD_DELAY = 0.1
-WELCOME_IMAGE_DELAY = 3
+WELCOME_IMAGE_DELAY = 0
 TITLE_FADE_TIME = 0.05
 TITLE_FADE_STEPS = 25
 TEXT_FADE_TIME = 0.25
@@ -84,6 +88,12 @@
 
 # Do some checks and Import API keys from API_KEYS_FILE
 config = configparser.ConfigParser()
+
+username = os.environ["SUDO_USER"]
+user_homedir = os.path.expanduser(f"~{username}")
+API_KEYS_FILE = API_KEYS_FILE.replace("~", user_homedir)
+
+print(os.path.expanduser(API_KEYS_FILE))
 config.read(os.path.expanduser(API_KEYS_FILE))
 if not config.has_section("openai"):
     print("Please make sure API_KEYS_FILE points to a valid file.")
@@ -186,7 +196,7 @@ def __init__(self, rotation=0):
         self._busy = False
         # Use a cursor to keep track of where we are in the text area
         self.cursor = {"x": 0, "y": 0}
-        self.listener = Listener(ENERGY_THRESHOLD, PHRASE_TIMEOUT, RECORD_TIMEOUT)
+        self.listener = None
         self.backlight = Backlight()
         self.pixels = neopixel.NeoPixel(
             NEOPIXEL_PIN,
@@ -202,7 +212,7 @@ def __init__(self, rotation=0):
 
     def start(self):
         # Output to the LCD instead of the console
-        os.putenv("DISPLAY", ":0")
+        #os.putenv("DISPLAY", ":0")
 
         # Initialize the display
         pygame.init()
@@ -217,6 +227,9 @@ def start(self):
         self.display_welcome()
         start_time = time.monotonic()
 
+        # Initialize the Listener
+        self.listener = Listener(openai.api_key, ENERGY_THRESHOLD, RECORD_TIMEOUT)
+
         # Preload remaining images
         self._load_image("background", BACKGROUND_IMAGE)
         self._load_image("loading", LOADING_IMAGE)
@@ -585,9 +598,7 @@ def show_waiting():
             # No response from user, so return
             return
 
-        audio_data = self.listener.get_audio_data()
-
-        story_request = self._transcribe(audio_data.get_wav_data())
+        story_request = self.listener.recognize()
 
         story_prompt = self._make_story_prompt(story_request)
         self.display_loading()
@@ -636,23 +647,6 @@ def _make_story_prompt(self, request):
             STORY_WORD_LENGTH=STORY_WORD_LENGTH, STORY_REQUEST=request
         )
 
-    @staticmethod
-    def _transcribe(wav_data):
-        # Transcribe the audio data to text using Whisper
-        print("Transcribing...")
-        attempts = 0
-        while attempts < 3:
-            try:
-                with NamedTemporaryFile(suffix=".wav") as temp_file:
-                    result = openai.Audio.translate_raw(
-                        WHISPER_MODEL, wav_data, temp_file.name
-                    )
-                    return result["text"].strip()
-            except (openai.error.ServiceUnavailableError, openai.error.APIError):
-                time.sleep(3)
-            attempts += 1
-        return "I wasn't able to understand you. Please repeat that."
-
     def _sendchat(self, prompt):
         response = ""
         print("Sending to chatGPT")
diff --git a/Magic_AI_Storybook/storybook.desktop b/Magic_AI_Storybook/storybook.desktop
@@ -0,0 +1,7 @@
+[Desktop Entry]
+Comment=Run Magic Storybook
+Terminal=true
+Name=Magic Storybook
+Exec=sudo python -E /home/pi/Magic_AI_Storybook/story.py
+Type=Application
+Icon=/home/pi/Magic_AI_Storybook/images/magic_book_icon.png