From 5b7ebabb106ea7ed87b16e83e943cbd9a17efa32 Mon Sep 17 00:00:00 2001
From: Bill Shaw
Date: Thu, 20 Feb 2025 14:45:35 -0500
Subject: [PATCH 1/2] add word count checker

---
 .vscode/settings.json |  5 +++++
 main.py               | 15 +++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..a8c2003
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda",
+    "python-envs.pythonProjects": []
+}
\ No newline at end of file
diff --git a/main.py b/main.py
index 04daeae..07fe4b0 100644
--- a/main.py
+++ b/main.py
@@ -23,10 +23,13 @@ def main():
     end_time = perf_counter()
     contiguity_errors = check_contiguity(sections)
     bad_start_errors = check_start_idx(sections)
-    if contiguity_errors + bad_start_errors == 0:
+    word_count_errors = check_word_counts(sections, transcript)
+    if contiguity_errors + bad_start_errors + word_count_errors == 0:
         print("OK")
     else:
-        print(f"{contiguity_errors} contiguity errors, {bad_start_errors} start_errors")
+        print(f"{contiguity_errors} contiguity errors")
+        print(f"{bad_start_errors} start_errors")
+        print(f"{word_count_errors} word count errors")
     print(f"Segmented in {end_time - start_time:.1f}")
 
 def check_contiguity(sections) -> int:
@@ -48,6 +51,14 @@ def check_start_idx(sections):
             errcount += 1
     return errcount
 
+def check_word_counts(sections, transcript):
+    errcount = 0
+    transcript_wc = len(transcript.split())
+    sections_wc = sum([section["word_count"] for section in sections])
+    if transcript_wc != sections_wc:
+        print(f"Word count mismatch. Expected {transcript_wc}, got {sections_wc}.")
+        errcount += 1
+    return errcount
 
 if __name__ == "__main__":
     main()

From b33ad2d3ddf45c9a3c967731bbc186b7c6268033 Mon Sep 17 00:00:00 2001
From: Bill Shaw
Date: Tue, 25 Feb 2025 12:03:00 -0500
Subject: [PATCH 2/2] Add check to validate transcript lines match section
 content. Also check for out of range index values.

---
Review notes (text in this area is ignored when the patch is applied):
- check_segment_offsets validates start/end explicitly. List slicing never
  raises IndexError, so a try/except IndexError around the comparison
  cannot detect out-of-range indices, and negative indices would silently
  slice from the end of the transcript.
- The f-strings read start/end from local variables instead of reusing
  double quotes inside the replacement fields, which is a SyntaxError
  before Python 3.12.
- "end" is inclusive, so the reported maximum is len(utterances) - 1.
- main() prints the section error count alongside the other counts.
- The "index" blob ids below are the originally recorded ones.

 main.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 07fe4b0..80fdeca 100644
--- a/main.py
+++ b/main.py
@@ -24,10 +24,12 @@ def main():
     contiguity_errors = check_contiguity(sections)
     bad_start_errors = check_start_idx(sections)
     word_count_errors = check_word_counts(sections, transcript)
-    if contiguity_errors + bad_start_errors + word_count_errors == 0:
+    section_errors = check_segment_offsets(sections, transcript)
+    if contiguity_errors + bad_start_errors + word_count_errors + section_errors == 0:
         print("OK")
     else:
         print(f"{contiguity_errors} contiguity errors")
         print(f"{bad_start_errors} start_errors")
         print(f"{word_count_errors} word count errors")
+        print(f"{section_errors} section content errors")
     print(f"Segmented in {end_time - start_time:.1f}")
@@ -60,5 +62,27 @@ def check_word_counts(sections, transcript):
         errcount += 1
     return errcount
 
+def check_segment_offsets(sections, transcript):
+    """
+    Compare each section with the corresponding lines from the transcript
+    using the start and end indices of the section.
+    If the text doesn't match or the indices are out of range, print an error.
+    Return the number of errors found.
+    """
+    utterances = transcript.split('\n')
+    errcount = 0
+    for section in sections:
+        start, end = section["start"], section["end"]
+        # Slicing never raises IndexError, so the range is checked explicitly.
+        if not 0 <= start <= end < len(utterances):
+            print(f"Section start/end out of range. start {start}; end {end}; max {len(utterances) - 1}. Section {section}")
+            errcount += 1
+            continue
+        chunk = "\n".join(utterances[start : end + 1])
+        if section.get("content", "") != chunk:
+            print(f"Section content does not match corresponding utterances (start: {start}, end: {end}). Section: {section}")
+            errcount += 1
+    return errcount
+
 if __name__ == "__main__":
     main()