diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..a8c2003
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda",
+    "python-envs.pythonProjects": []
+}
\ No newline at end of file
diff --git a/main.py b/main.py
index 04daeae..80fdeca 100644
--- a/main.py
+++ b/main.py
@@ -23,10 +23,15 @@ def main():
     end_time = perf_counter()
     contiguity_errors = check_contiguity(sections)
     bad_start_errors = check_start_idx(sections)
-    if contiguity_errors + bad_start_errors == 0:
+    word_count_errors = check_word_counts(sections, transcript)
+    section_errors = check_segment_offsets(sections, transcript)
+    if contiguity_errors + bad_start_errors + word_count_errors + section_errors == 0:
         print("OK")
     else:
-        print(f"{contiguity_errors} contiguity errors, {bad_start_errors} start_errors")
+        print(f"{contiguity_errors} contiguity errors")
+        print(f"{bad_start_errors} start_errors")
+        print(f"{word_count_errors} word count errors")
+        print(f"{section_errors} segment offset errors")
     print(f"Segmented in {end_time - start_time:.1f}")
 
 def check_contiguity(sections) -> int:
@@ -48,6 +53,38 @@ def check_start_idx(sections):
             errcount += 1
     return errcount
 
+def check_word_counts(sections, transcript):
+    """Return 1 if the sections' word counts do not sum to the transcript's, else 0."""
+    errcount = 0
+    transcript_wc = len(transcript.split())
+    sections_wc = sum(section["word_count"] for section in sections)
+    if transcript_wc != sections_wc:
+        print(f"Word count mismatch. Expected {transcript_wc}, got {sections_wc}.")
+        errcount += 1
+    return errcount
+
+def check_segment_offsets(sections, transcript):
+    """
+    Compare each section with the corresponding lines from the transcript
+    using the start and end indices of the section.
+    If the text doesn't match or the indices are out of range, print an error.
+    Return the number of errors found.
+    """
+    utterances = transcript.split('\n')
+    errcount = 0
+    for section in sections:
+        start, end = section["start"], section["end"]
+        # Slicing never raises IndexError: it clamps past-the-end indices and
+        # wraps negative ones, so bad (or inverted) ranges are checked explicitly.
+        if not 0 <= start <= end < len(utterances):
+            print(f"Section start/end out of range. start {start}; end {end}; max {len(utterances) - 1}. Section {section}")
+            errcount += 1
+            continue
+        chunk = "\n".join(utterances[start : end + 1])
+        if section.get("content", "") != chunk:
+            print(f"Section content does not match corresponding utterances (start: {start}, end: {end}). Section: {section}")
+            errcount += 1
+    return errcount
 
 if __name__ == "__main__":
     main()