Merge pull request #15 from Garulf/add-flow-string-matcher

Garulf · web-flow · commit 903cf4ce5bf1 · 2022-11-11T07:45:55.000-05:00
Add flow string matcher
diff --git a/flox/string_matcher.py b/flox/string_matcher.py
@@ -0,0 +1,204 @@
+from typing import List
+
+# Separator character used to detect spaces and word starts in the searched text.
+SPACE_CHAR: str = ' '
+# Minimum acronym score for an acronym match to be accepted; string_matcher
+# also returns this value as the second element of every result tuple.
+USER_SEARCH_PRECISION = 50
+
+"""
+This is a Python port of Flow Launcher's string matcher.
+I take no credit for the algorithm; I only translated it to Python.
+"""
+
+
+def string_matcher(query: str, text: str, ignore_case: bool = True) -> tuple:
+    """Fuzzy-match ``query`` against ``text`` and score the result.
+
+    Two strategies run during a single pass over ``text``:
+
+    * acronym matching - each query character must equal, in order, a
+      character of ``text`` accepted by ``is_acronym``;
+    * substring matching - the characters of every space-separated query
+      word must be found in ``text`` in order.
+
+    Args:
+        query: Search string; surrounding whitespace is stripped.
+        text: String being searched.
+        ignore_case: Compare lower-cased copies when True (the default).
+
+    Returns:
+        ``(False, USER_SEARCH_PRECISION)`` when nothing matches (this
+        includes an empty or blank ``query`` and an empty ``text``);
+        otherwise ``(True, USER_SEARCH_PRECISION, matched_indices, score)``
+        where ``matched_indices`` are positions in ``text``.
+    """
+    # Always answer with a tuple so callers can index the result safely.
+    if not text or not query:
+        return (False, USER_SEARCH_PRECISION)
+
+    query = query.strip()
+    if not query:
+        return (False, USER_SEARCH_PRECISION)
+
+    current_acronym_query_index = 0
+    acronym_match_data: List[int] = []
+    acronyms_total_count: int = 0
+    acronyms_matched: int = 0
+
+    full_text_lower: str = text.lower() if ignore_case else text
+    query_lower: str = query.lower() if ignore_case else query
+    query_length: int = len(query_lower)
+
+    # Drop the empty pieces produced by consecutive spaces; an empty word
+    # can never be matched and indexing into it would raise IndexError.
+    query_substrings: List[str] = [
+        part for part in query_lower.split(SPACE_CHAR) if part]
+    current_query_substring_index: int = 0
+    current_query_substring = query_substrings[current_query_substring_index]
+    current_query_substring_char_index = 0
+
+    first_match_index = -1
+    first_match_index_in_word = -1
+    last_match_index = 0
+    all_query_substrings_matched: bool = False
+    match_found_in_previous_loop: bool = False
+    all_substrings_contained_in_text: bool = True
+
+    index_list: List[int] = []
+    space_indices: List[int] = []
+    for text_index, char in enumerate(full_text_lower):
+        if current_acronym_query_index >= query_length:
+            # Acronym matching is complete (the acronym cursor and the hit
+            # counter advance together). Keep scanning only to count the
+            # remaining acronym positions, which form the denominator of the
+            # acronym score. The original-case text is required here so that
+            # uppercase letters are still recognised.
+            if is_acronym_count(text, text_index):
+                acronyms_total_count += 1
+            continue
+
+        # Record every space seen while the first query word is still being
+        # matched, so the space nearest to the final match start is known.
+        if char == SPACE_CHAR and current_query_substring_index == 0:
+            space_indices.append(text_index)
+
+        if is_acronym(text, text_index) and char == query_lower[current_acronym_query_index]:
+            acronym_match_data.append(text_index)
+            acronyms_matched += 1
+            current_acronym_query_index += 1
+
+        if is_acronym_count(text, text_index):
+            acronyms_total_count += 1
+
+        if all_query_substrings_matched or char != current_query_substring[current_query_substring_char_index]:
+            match_found_in_previous_loop = False
+            continue
+
+        if first_match_index < 0:
+            first_match_index = text_index
+
+        if current_query_substring_char_index == 0:
+            # First character of the current query word.
+            match_found_in_previous_loop = True
+            first_match_index_in_word = text_index
+        elif not match_found_in_previous_loop:
+            # The previous character did not match, so the word may really
+            # start later: check whether the characters directly before this
+            # position spell the already-consumed part of the word.
+            start_index_to_verify = text_index - current_query_substring_char_index
+
+            if all_previous_chars_matched(start_index_to_verify, current_query_substring_char_index, full_text_lower, current_query_substring):
+                match_found_in_previous_loop = True
+                # When this is the first query word, the start of the whole
+                # match moves to the re-anchored position.
+                if current_query_substring_index == 0:
+                    first_match_index = start_index_to_verify
+
+                index_list = get_updated_index_list(
+                    start_index_to_verify, current_query_substring_char_index, first_match_index_in_word, index_list)
+
+        last_match_index = text_index + 1
+        index_list.append(text_index)
+
+        current_query_substring_char_index += 1
+
+        if current_query_substring_char_index == len(current_query_substring):
+            # A whole query word was consumed; it counts as "contained" only
+            # if its last characters were matched contiguously.
+            all_substrings_contained_in_text = match_found_in_previous_loop and all_substrings_contained_in_text
+
+            current_query_substring_index += 1
+
+            all_query_substrings_matched = all_query_substrings_matched_func(
+                current_query_substring_index, len(query_substrings))
+
+            if all_query_substrings_matched:
+                continue
+
+            current_query_substring = query_substrings[current_query_substring_index]
+            current_query_substring_char_index = 0
+
+    if acronyms_matched > 0 and acronyms_matched == len(query):
+        acronyms_score: float = acronyms_matched * 100 / acronyms_total_count
+
+        if acronyms_score >= USER_SEARCH_PRECISION:
+            return (True, USER_SEARCH_PRECISION, acronym_match_data, acronyms_score)
+
+    if all_query_substrings_matched:
+
+        nearest_space_index = calculate_closest_space_index(
+            space_indices, first_match_index)
+
+        score = calculate_search_score(query, text, first_match_index - nearest_space_index - 1,
+                                       space_indices, last_match_index - first_match_index, all_substrings_contained_in_text)
+
+        return (True, USER_SEARCH_PRECISION, index_list, score)
+
+    return (False, USER_SEARCH_PRECISION)
+
+
+def calculate_search_score(query: str, text: str, first_index: int, space_indices: List[int], match_length: int, all_substrings_contained_in_text: bool):
+    """Score a substring match; larger values mean a better match.
+
+    The base score shrinks as ``first_index`` and ``match_length`` grow.
+    Bonuses are then added when ``text`` is only slightly longer than
+    ``query`` and when every query word was contained in ``text``.
+    """
+    query_length = len(query)
+    spread = (1 + first_index) + (match_length + 1)
+    score = 100 * (query_length + 1) / spread
+
+    # Penalise by the number of recorded spaces when the match sits at the
+    # very start of its word and all query words were contained.
+    if all_substrings_contained_in_text and first_index == 0:
+        score -= len(space_indices)
+
+    # Texts close in length to the query earn a bonus.
+    length_gap = len(text) - query_length
+    if length_gap < 5:
+        score += 20
+    elif length_gap < 10:
+        score += 10
+
+    if not all_substrings_contained_in_text:
+        return score
+
+    # Reward each non-space query character: 10 points for the first four,
+    # 5 points for every one after that.
+    threshold: int = 4
+    count: int = len(query.replace(' ', ''))
+    if count > threshold:
+        bonus = threshold * 10 + (count - threshold) * 5
+    else:
+        bonus = count * 10
+    score += bonus
+
+    return score
+
+
+def get_updated_index_list(start_index_to_verify: int, current_query_substring_char_index: int, first_matched_index_in_word: int, index_list: List[int]):
+    """Rebuild the matched-index list after a word match is re-anchored.
+
+    Args:
+        start_index_to_verify: Text position where the current query word
+            is now known to start.
+        current_query_substring_char_index: Number of characters of the
+            current query word already matched.
+        first_matched_index_in_word: Text position where the word was
+            previously assumed to start; indices at or after it are stale.
+        index_list: Matched indices collected so far. It is not modified.
+
+    Returns:
+        A new list holding the indices before ``first_matched_index_in_word``
+        followed by the ``current_query_substring_char_index`` consecutive
+        positions starting at ``start_index_to_verify``.
+    """
+    # Filter into a new list: popping from a list while iterating over it
+    # skips the element after each removal and alters the caller's list.
+    updated_list: List[int] = [
+        item for item in index_list if item < first_matched_index_in_word]
+
+    updated_list.extend(range(
+        start_index_to_verify, start_index_to_verify + current_query_substring_char_index))
+
+    return updated_list
+
+
+def all_query_substrings_matched_func(current_query_substring_index: int, query_substrings_length: int) -> bool:
+    """Return True once the substring index has reached or passed the substring count."""
+    if current_query_substring_index < query_substrings_length:
+        return False
+    return True
+
+
+def all_previous_chars_matched(start_index_to_verify: int, current_query_substring_char_index: int, full_text_lower: str, current_query_substring: str) -> bool:
+    """Check that the text at ``start_index_to_verify`` spells the query word's prefix.
+
+    Compares the first ``current_query_substring_char_index`` characters of
+    ``current_query_substring`` with the text characters starting at
+    ``start_index_to_verify``. Every position is inspected; the result is
+    True only when none of them differ.
+    """
+    mismatched_offsets = [
+        offset
+        for offset in range(current_query_substring_char_index)
+        if full_text_lower[start_index_to_verify + offset] != current_query_substring[offset]
+    ]
+    return not mismatched_offsets
+
+
+def is_acronym(text: str, text_index: int) -> bool:
+    """Return True when the character at ``text_index`` can take part in an acronym match.
+
+    That is the case when it satisfies ``is_acronym_char`` or is a digit.
+    """
+    return is_acronym_char(text, text_index) or is_acronym_number(text, text_index)
+
+
+def is_acronym_count(text: str, text_index: int) -> bool:
+    """Return True when the character at ``text_index`` counts toward the acronym total.
+
+    Characters accepted by ``is_acronym_char`` always count; a digit counts
+    only when it is the first character of ``text`` or follows a space.
+    """
+    if is_acronym_char(text, text_index):
+        return True
+
+    if not is_acronym_number(text, text_index):
+        return False
+
+    starts_word = text_index == 0 or text[text_index - 1] == SPACE_CHAR
+    return starts_word
+
+
+def is_acronym_char(text: str, text_index: int) -> bool:
+    """Return True for an uppercase character, the first character, or one that follows a space."""
+    if text[text_index].isupper():
+        return True
+    if text_index == 0:
+        return True
+    return text[text_index - 1] == SPACE_CHAR
+
+
+def is_acronym_number(text: str, text_index: int) -> bool:
+    """Return True when the character at ``text_index`` is a digit."""
+    candidate = text[text_index]
+    return candidate.isdigit()
+
+
+def calculate_closest_space_index(space_indices: List[int], first_match_index: int) -> int:
+    """Return the last leading entry of ``space_indices`` below ``first_match_index``.
+
+    The scan stops at the first entry that is not below ``first_match_index``;
+    -1 is returned when no entry qualifies before that point.
+    """
+    nearest = -1
+
+    for position in space_indices:
+        if position >= first_match_index:
+            return nearest
+        nearest = position
+
+    return nearest
diff --git a/flox/version b/flox/version
@@ -1 +1 @@
-0.18.1
+0.19.0