From 5b7ebabb106ea7ed87b16e83e943cbd9a17efa32 Mon Sep 17 00:00:00 2001
From: Bill Shaw <bill@freespoke.com>
Date: Thu, 20 Feb 2025 14:45:35 -0500
Subject: [PATCH 1/2] add word count checker

---
 .vscode/settings.json |  5 +++++
 main.py               | 15 +++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..a8c2003
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda",
+    "python-envs.pythonProjects": []
+}
\ No newline at end of file
diff --git a/main.py b/main.py
index 04daeae..07fe4b0 100644
--- a/main.py
+++ b/main.py
@@ -23,10 +23,13 @@ def main():
         end_time = perf_counter()
         contiguity_errors = check_contiguity(sections)
         bad_start_errors = check_start_idx(sections)
-        if contiguity_errors + bad_start_errors == 0:
+        word_count_errors = check_word_counts(sections, transcript)
+        if contiguity_errors + bad_start_errors + word_count_errors == 0:
             print("OK")
         else:
-            print(f"{contiguity_errors} contiguity errors, {bad_start_errors} start_errors")
+            print(f"{contiguity_errors} contiguity errors")
+            print(f"{bad_start_errors} start_errors")
+            print(f"{word_count_errors} word count errors")
         print(f"Segmented in {end_time - start_time:.1f}")
 
 def check_contiguity(sections) -> int:
@@ -48,6 +51,14 @@ def check_start_idx(sections):
             errcount += 1
     return errcount
 
+def check_word_counts(sections, transcript):
+    """Return 1 (printing a message) if section word counts don't sum to the transcript's, else 0."""
+    errcount = 0
+    expected, actual = len(transcript.split()), sum(s["word_count"] for s in sections)
+    if expected != actual:
+        print(f"Word count mismatch. Expected {expected}, got {actual}.")
+        errcount += 1
+    return errcount
 
 if __name__ == "__main__":
     main()

From b33ad2d3ddf45c9a3c967731bbc186b7c6268033 Mon Sep 17 00:00:00 2001
From: Bill Shaw <bill@freespoke.com>
Date: Tue, 25 Feb 2025 12:03:00 -0500
Subject: [PATCH 2/2] Add check to validate transcript lines match section
 content. Also check for out of range index values.

---
 main.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 07fe4b0..80fdeca 100644
--- a/main.py
+++ b/main.py
@@ -24,7 +24,8 @@ def main():
         contiguity_errors = check_contiguity(sections)
         bad_start_errors = check_start_idx(sections)
         word_count_errors = check_word_counts(sections, transcript)
-        if contiguity_errors + bad_start_errors + word_count_errors == 0:
+        section_errors = check_segment_offsets(sections, transcript)
+        if contiguity_errors + bad_start_errors + word_count_errors + section_errors == 0:
             print("OK")
         else:
             print(f"{contiguity_errors} contiguity errors")
@@ -60,5 +61,26 @@ def check_word_counts(sections, transcript):
         errcount += 1
     return errcount
 
+def check_segment_offsets(sections, transcript):
+    """
+    Compare each section's content with the transcript lines it claims to cover.
+
+    A section covers lines "start" through "end" (inclusive) of the transcript.
+    An index outside the transcript or mismatched text prints an error.
+    Return the number of errors found.
+    """
+    utterances = transcript.split('\n')
+    errcount = 0
+    for section in sections:
+        start, end = section["start"], section["end"]
+        # Slicing never raises IndexError, so validate the indices explicitly.
+        if not (0 <= start < len(utterances) and 0 <= end < len(utterances)):
+            errcount += 1
+            print(f"Section start/end out of range. start {start}; end {end}; max {len(utterances)}. Section {section}")
+        elif section.get("content", "") != "\n".join(utterances[start : end + 1]):
+            print(f"Section content does not match corresponding utterances (start: {start}, end: {end}). Section: {section}")
+            errcount += 1
+    return errcount
+
 if __name__ == "__main__":
     main()