Skip to content

Commit ae1bb4c

Browse files
Resolved conflicts post merge from main
2 parents bac2286 + 0395464 commit ae1bb4c

7 files changed

+228
-226
lines changed

tests/integration/client_test.py

Lines changed: 20 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,9 @@
11
import logging
22
import os
3-
import unittest
3+
from difflib import SequenceMatcher, unified_diff
44
from pathlib import Path
55

66
import pytest
7-
import requests
8-
9-
from unstract.llmwhisperer import LLMWhispererClient
107

118
logger = logging.getLogger(__name__)
129

@@ -40,79 +37,38 @@ def test_get_usage_info(client):
4037
)
4138
def test_whisper(client, data_dir, processing_mode, output_mode, input_file):
4239
file_path = os.path.join(data_dir, input_file)
43-
response = client.whisper(
40+
whisper_result = client.whisper(
4441
processing_mode=processing_mode,
4542
output_mode=output_mode,
4643
file_path=file_path,
4744
timeout=200,
4845
)
49-
logger.debug(response)
46+
logger.debug(whisper_result)
5047

5148
exp_basename = f"{Path(input_file).stem}.{processing_mode}.{output_mode}.txt"
5249
exp_file = os.path.join(data_dir, "expected", exp_basename)
53-
with open(exp_file, encoding="utf-8") as f:
54-
exp = f.read()
5550

56-
assert isinstance(response, dict)
57-
assert response["status_code"] == 200
58-
assert response["extracted_text"] == exp
51+
assert_extracted_text(exp_file, whisper_result, processing_mode, output_mode)
5952

6053

61-
# TODO: Review and port to pytest based tests
62-
class TestLLMWhispererClient(unittest.TestCase):
63-
@unittest.skip("Skipping test_whisper")
64-
def test_whisper(self):
65-
client = LLMWhispererClient()
66-
# response = client.whisper(
67-
# url="https://storage.googleapis.com/pandora-static/samples/bill.jpg.pdf"
68-
# )
69-
response = client.whisper(
70-
file_path="test_data/restaurant_invoice_photo.pdf",
71-
timeout=200,
72-
store_metadata_for_highlighting=True,
73-
)
74-
print(response)
75-
# self.assertIsInstance(response, dict)
54+
def assert_extracted_text(file_path, whisper_result, mode, output_mode):
55+
with open(file_path, encoding="utf-8") as f:
56+
exp = f.read()
7657

77-
# @unittest.skip("Skipping test_whisper")
78-
def test_whisper_stream(self):
79-
client = LLMWhispererClient()
80-
download_url = "https://storage.googleapis.com/pandora-static/samples/bill.jpg.pdf"
81-
# Create a stream of download_url and pass it to whisper
82-
response_download = requests.get(download_url, stream=True)
83-
response_download.raise_for_status()
84-
response = client.whisper(
85-
stream=response_download.iter_content(chunk_size=1024),
86-
timeout=200,
87-
store_metadata_for_highlighting=True,
88-
)
89-
print(response)
90-
# self.assertIsInstance(response, dict)
58+
assert isinstance(whisper_result, dict)
59+
assert whisper_result["status_code"] == 200
9160

92-
@unittest.skip("Skipping test_whisper_status")
93-
def test_whisper_status(self):
94-
client = LLMWhispererClient()
95-
response = client.whisper_status(whisper_hash="7cfa5cbb|5f1d285a7cf18d203de7af1a1abb0a3a")
96-
logger.info(response)
97-
self.assertIsInstance(response, dict)
61+
# For OCR based processing
62+
threshold = 0.97
9863

99-
@unittest.skip("Skipping test_whisper_retrieve")
100-
def test_whisper_retrieve(self):
101-
client = LLMWhispererClient()
102-
response = client.whisper_retrieve(whisper_hash="7cfa5cbb|5f1d285a7cf18d203de7af1a1abb0a3a")
103-
logger.info(response)
104-
self.assertIsInstance(response, dict)
64+
# For text based processing
65+
if mode == "native_text" and output_mode == "text":
66+
threshold = 0.99
67+
extracted_text = whisper_result["extracted_text"]
68+
similarity = SequenceMatcher(None, extracted_text, exp).ratio()
10569

106-
@unittest.skip("Skipping test_whisper_highlight_data")
107-
def test_whisper_highlight_data(self):
108-
client = LLMWhispererClient()
109-
response = client.highlight_data(
110-
whisper_hash="9924d865|5f1d285a7cf18d203de7af1a1abb0a3a",
111-
search_text="Indiranagar",
70+
if similarity < threshold:
71+
diff = "\n".join(
72+
unified_diff(exp.splitlines(), extracted_text.splitlines(), fromfile="Expected", tofile="Extracted")
11273
)
113-
logger.info(response)
114-
self.assertIsInstance(response, dict)
115-
116-
117-
if __name__ == "__main__":
118-
unittest.main()
74+
pytest.fail(f"Texts are not similar enough: {similarity * 100:.2f}% similarity. Diff:\n{diff}")

0 commit comments

Comments
 (0)