feat: Add profanity filter (#69)

hugobloem · web-flow · commit 35a8a251751b · 2025-02-23T16:04:23.000Z
* refactor: use Pydantic model for Microsoft STT configuration

* feat: add profanity filter configuration to Microsoft STT
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 wyoming==1.6.0
 azure-cognitiveservices-speech==1.42.0
-ruff
+ruff
+pydantic>=2,<3
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,14 +1,14 @@
 """Fixtures for tests."""
 
-from types import SimpleNamespace
+from wyoming_microsoft_stt import SpeechConfig
 import pytest
 import os
 
 
 @pytest.fixture
 def microsoft_stt_args():
     """Return MicrosoftSTT instance."""
-    args = SimpleNamespace(
+    args = SpeechConfig(
         subscription_key=os.environ.get("SPEECH_KEY"),
         service_region=os.environ.get("SPEECH_REGION"),
     )
diff --git a/tests/test_microsoft_stt.py b/tests/test_microsoft_stt.py
@@ -18,3 +18,13 @@ def test_transcribe(microsoft_stt_args):
 
     result = microsoft_stt.transcribe(filename, language)
     assert "hello world" in result.lower()
+
+
+def test_set_profanity(microsoft_stt_args):
+    """Check that a valid profanity level can be set without raising."""
+    stt = MicrosoftSTT(microsoft_stt_args)
+    assert stt.speech_config is not None
+
+    # Smoke test only: there is currently no way to check the set profanity
+    # level, so nothing is asserted after the call.
+    stt.set_profanity("masked")
diff --git a/wyoming_microsoft_stt/__init__.py b/wyoming_microsoft_stt/__init__.py
@@ -1 +1,13 @@
 """Wyoming server for Microsoft STT."""
+
+from typing import Literal
+from pydantic import BaseModel
+
+
+class SpeechConfig(BaseModel):
+    """Validated settings used to configure the Microsoft speech recognizer."""
+
+    subscription_key: str  # Microsoft Azure subscription key
+    service_region: str  # Microsoft Azure region (e.g., westus2)
+    profanity: Literal["off", "masked", "removed"] = "masked"  # filter level
+    language: str = "en-US"  # default transcription language
diff --git a/wyoming_microsoft_stt/__main__.py b/wyoming_microsoft_stt/__main__.py
@@ -14,41 +14,83 @@
 from .microsoft_stt import MicrosoftSTT
 from .handler import MicrosoftEventHandler
 from .version import __version__
+from . import SpeechConfig
 
 _LOGGER = logging.getLogger(__name__)
 
 stop_event = asyncio.Event()
 
+
 def handle_stop_signal(*args):
     """Handle shutdown signal and set the stop event."""
     _LOGGER.info("Received stop signal. Shutting down...")
     stop_event.set()
 
+
 def parse_arguments():
     """Parse command-line arguments."""
     parser = argparse.ArgumentParser()
-    parser.add_argument("--service-region", default=os.getenv("AZURE_SERVICE_REGION"), help="Microsoft Azure region (e.g., westus2)")
-    parser.add_argument("--subscription-key", default=os.getenv("AZURE_SUBSCRIPTION_KEY"), help="Microsoft Azure subscription key")
-    parser.add_argument("--uri", default="tcp://0.0.0.0:10300", help="unix:// or tcp://")
-    parser.add_argument("--download-dir", default="/tmp/", help="Directory to download languages.json into (default: /tmp/)")
-    parser.add_argument("--language", default="en-US", help="Default language to set for transcription")
-    parser.add_argument("--update-languages", action="store_true", help="Download latest languages.json during startup")
+    parser.add_argument(
+        "--service-region",
+        default=os.getenv("AZURE_SERVICE_REGION"),
+        help="Microsoft Azure region (e.g., westus2)",
+    )
+    parser.add_argument(
+        "--subscription-key",
+        default=os.getenv("AZURE_SUBSCRIPTION_KEY"),
+        help="Microsoft Azure subscription key",
+    )
+    parser.add_argument(
+        "--uri", default="tcp://0.0.0.0:10300", help="unix:// or tcp://"
+    )
+    parser.add_argument(
+        "--download-dir",
+        default="/tmp/",
+        help="Directory to download languages.json into (default: /tmp/)",
+    )
+    parser.add_argument(
+        "--language", default="en-US", help="Default language to set for transcription"
+    )
+    parser.add_argument(
+        "--update-languages",
+        action="store_true",
+        help="Download latest languages.json during startup",
+    )
+    parser.add_argument(
+        "--profanity",
+        default="masked",
+        choices=["off", "masked", "removed"],  # keep in sync with SpeechConfig.profanity
+        help="Profanity setting for speech recognition",
+    )
     parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
     return parser.parse_args()
 
+
 def validate_args(args):
     """Validate command-line arguments."""
     if not args.service_region or not args.subscription_key:
-        raise ValueError("Both --service-region and --subscription-key must be provided either as command-line arguments or environment variables.")
+        raise ValueError(
+            "Both --service-region and --subscription-key must be provided either as command-line arguments or environment variables."
+        )
     # Reinstate key validation with more flexibility to accommodate complex keys
-    if not re.match(r'^[A-Za-z0-9\-_]{40,}$', args.subscription_key):
-        _LOGGER.warning("The subscription key does not match the expected format but will attempt to initialize.")
+    if not re.match(r"^[A-Za-z0-9\-_]{40,}$", args.subscription_key):
+        _LOGGER.warning(
+            "The subscription key does not match the expected format but will attempt to initialize."
+        )
+
 
 async def main() -> None:
     """Start Wyoming Microsoft STT server."""
     args = parse_arguments()
     validate_args(args)
 
+    speech_config = SpeechConfig(
+        subscription_key=args.subscription_key,
+        service_region=args.service_region,
+        profanity=args.profanity,
+        language=args.language,
+    )
+
     # Set up logging
     logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
     _LOGGER.debug("Arguments parsed successfully.")
@@ -98,7 +140,7 @@ async def main() -> None:
     # Load Microsoft STT model
     try:
         _LOGGER.debug("Loading Microsoft STT")
-        stt_model = MicrosoftSTT(args)
+        stt_model = MicrosoftSTT(speech_config)
         _LOGGER.info("Microsoft STT model loaded successfully.")
     except Exception as e:
         _LOGGER.error(f"Failed to load Microsoft STT model: {e}")
@@ -121,6 +163,7 @@ async def main() -> None:
     except Exception as e:
         _LOGGER.error(f"An error occurred while running the server: {e}")
 
+
 if __name__ == "__main__":
     # Set up signal handling for graceful shutdown
     signal.signal(signal.SIGTERM, handle_stop_signal)
diff --git a/wyoming_microsoft_stt/microsoft_stt.py b/wyoming_microsoft_stt/microsoft_stt.py
@@ -1,25 +1,29 @@
 import azure.cognitiveservices.speech as speechsdk  # noqa: D100
 import logging
+from . import SpeechConfig
 
 _LOGGER = logging.getLogger(__name__)
 
 
 class MicrosoftSTT:
     """Class to handle Microsoft STT."""
 
-    def __init__(self, args) -> None:
+    def __init__(self, speechconfig: SpeechConfig) -> None:
         """Initialize."""
-        self.args = args
+        self.args = speechconfig
+
         try:
             # Initialize the speech configuration with the provided subscription key and region
             self.speech_config = speechsdk.SpeechConfig(
-                subscription=args.subscription_key, region=args.service_region
+                subscription=self.args.subscription_key, region=self.args.service_region
             )
             _LOGGER.info("Microsoft SpeechConfig initialized successfully.")
         except Exception as e:
             _LOGGER.error(f"Failed to initialize Microsoft SpeechConfig: {e}")
             raise
 
+        self.set_profanity(self.args.profanity)
+
     def transcribe(self, filename: str, language=None):
         """Transcribe a file."""
         # Use the default language from args if no language is provided
@@ -48,10 +52,29 @@ def transcribe(self, filename: str, language=None):
                 return ""
             elif result.reason == speechsdk.ResultReason.Canceled:
                 cancellation_details = result.cancellation_details
-                _LOGGER.warning(f"Speech Recognition canceled: {cancellation_details.reason}")
+                _LOGGER.warning(
+                    f"Speech Recognition canceled: {cancellation_details.reason}"
+                )
                 if cancellation_details.reason == speechsdk.CancellationReason.Error:
-                    _LOGGER.error(f"Error details: {cancellation_details.error_details}")
+                    _LOGGER.error(
+                        f"Error details: {cancellation_details.error_details}"
+                    )
                 return ""
         except Exception as e:
             _LOGGER.error(f"Failed to transcribe audio file {filename}: {e}")
             return ""
+
+    def set_profanity(self, profanity: str):
+        """Apply the named profanity filter level to the speech configuration."""
+        # Map each accepted level name onto the SDK's ProfanityOption value.
+        levels = {
+            "off": speechsdk.ProfanityOption.Raw,
+            "masked": speechsdk.ProfanityOption.Masked,
+            "removed": speechsdk.ProfanityOption.Removed,
+        }
+        if profanity not in levels:
+            _LOGGER.error(f"Invalid profanity level: {profanity}")
+            return
+
+        self.speech_config.set_profanity(levels[profanity])
+        _LOGGER.debug(f"Profanity filter set to {profanity}")