|
| 1 | +from typing import List |
| 2 | + |
# Word separator: compared against characters of the searched text to record
# space positions and to detect characters that directly follow a space.
SPACE_CHAR: str = ' '
# Second element of the result tuples built by string_matcher; also the
# minimum score an acronym match must reach before it is returned.
USER_SEARCH_PRECISION = 50
| 5 | + |
| 6 | +""" |
| 7 | +This is a Python port of Flow Launcher's string matcher. |
| 8 | +I take no credit for the algorithm; I only translated it to Python. |
| 9 | +""" |
| 10 | + |
| 11 | + |
def string_matcher(query: str, text: str, ignore_case: bool = True) -> tuple:
    """Fuzzy-match ``query`` against ``text`` and score the result.

    Two strategies run in a single pass over ``text``:

    * acronym matching - every character of the query must match, in order,
      a character of ``text`` accepted by ``is_acronym``;
    * substring matching - every space-separated part of the query must be
      found, in order, in ``text``.

    Args:
        query: Search string. Leading and trailing whitespace is ignored.
        text: Candidate string to search in.
        ignore_case: When true (the default) both strings are lower-cased
            before their characters are compared.

    Returns:
        ``(False, USER_SEARCH_PRECISION)`` when nothing matched, otherwise
        ``(True, USER_SEARCH_PRECISION, matched_indices, score)`` where
        ``matched_indices`` holds the positions in ``text`` that matched.
    """
    no_match = (False, USER_SEARCH_PRECISION)

    # Same tuple shape as every other exit, so callers can always index it.
    if not text or not query:
        return no_match

    query = query.strip()

    full_text_lower: str = text.lower() if ignore_case else text
    query_lower: str = query.lower() if ignore_case else query
    query_length = len(query_lower)

    # Empty parts are dropped: consecutive spaces in the query would otherwise
    # yield '' substrings, and indexing into '' below raises IndexError.
    query_substrings: List[str] = [
        part for part in query_lower.split(SPACE_CHAR) if part
    ]
    if not query_substrings:
        return no_match

    # Acronym matching state.
    current_acronym_query_index = 0
    acronym_match_data: List[int] = []
    acronyms_total_count = 0
    acronyms_matched = 0

    # Substring matching state.
    current_query_substring_index = 0
    current_query_substring = query_substrings[current_query_substring_index]
    current_query_substring_char_index = 0

    first_match_index = -1
    first_match_index_in_word = -1
    last_match_index = 0
    all_query_substrings_matched = False
    match_found_in_previous_loop = False
    all_substrings_contained_in_text = True

    index_list: List[int] = []
    space_indices: List[int] = []

    for text_index, text_char in enumerate(full_text_lower):
        # Acronym match already complete: only keep counting the remaining
        # acronym positions, which are needed for the acronym score. The
        # original-case text is used so upper-case markers are still seen.
        if current_acronym_query_index >= query_length and acronyms_matched == query_length:
            if is_acronym_count(text, text_index):
                acronyms_total_count += 1
            continue

        # Guard: never index past the end of the query below.
        if current_acronym_query_index >= query_length:
            break

        # Spaces are only recorded while positioned at the start of a query
        # substring.
        if text_char == SPACE_CHAR and current_query_substring_char_index == 0:
            space_indices.append(text_index)

        # Acronym matching.
        if is_acronym(text, text_index) and text_char == query_lower[current_acronym_query_index]:
            acronym_match_data.append(text_index)
            acronyms_matched += 1
            current_acronym_query_index += 1

        if is_acronym_count(text, text_index):
            acronyms_total_count += 1

        # Substring matching: skip characters that do not continue the
        # current query substring.
        if (all_query_substrings_matched
                or text_char != current_query_substring[current_query_substring_char_index]):
            match_found_in_previous_loop = False
            continue

        if first_match_index < 0:
            first_match_index = text_index

        if current_query_substring_char_index == 0:
            # First character of the current query substring.
            match_found_in_previous_loop = True
            first_match_index_in_word = text_index
        elif not match_found_in_previous_loop:
            # The previous text character did not match, so the characters
            # matched so far are not contiguous. Check whether the text right
            # before this position spells the already-matched prefix.
            start_index_to_verify = text_index - current_query_substring_char_index

            if all_previous_chars_matched(
                    start_index_to_verify, current_query_substring_char_index,
                    full_text_lower, current_query_substring):
                match_found_in_previous_loop = True

                # Only the first query substring moves the overall match start.
                if current_query_substring_index == 0:
                    first_match_index = start_index_to_verify

                # Replace the indices of the abandoned partial match (those
                # at or after the first match in this word) with the
                # contiguous ones.
                index_list = get_updated_index_list(
                    start_index_to_verify, current_query_substring_char_index,
                    first_match_index_in_word, index_list)

        last_match_index = text_index + 1
        index_list.append(text_index)

        current_query_substring_char_index += 1

        # Finished the current query substring.
        if current_query_substring_char_index == len(current_query_substring):
            all_substrings_contained_in_text = (
                match_found_in_previous_loop and all_substrings_contained_in_text
            )

            current_query_substring_index += 1

            all_query_substrings_matched = all_query_substrings_matched_func(
                current_query_substring_index, len(query_substrings))

            if all_query_substrings_matched:
                continue

            # Otherwise move on to the next query substring.
            current_query_substring = query_substrings[current_query_substring_index]
            current_query_substring_char_index = 0

    # Acronym result takes precedence when every query character matched.
    if acronyms_matched > 0 and acronyms_matched == query_length:
        acronyms_score: float = acronyms_matched * 100 / acronyms_total_count

        if acronyms_score >= USER_SEARCH_PRECISION:
            return (True, USER_SEARCH_PRECISION, acronym_match_data, acronyms_score)

    if all_query_substrings_matched:
        nearest_space_index = calculate_closest_space_index(
            space_indices, first_match_index)

        # first_match_index - nearest_space_index - 1 is the offset of the
        # first matched character from the preceding recorded space.
        score = calculate_search_score(
            query, text, first_match_index - nearest_space_index - 1,
            space_indices, last_match_index - first_match_index,
            all_substrings_contained_in_text)

        return (True, USER_SEARCH_PRECISION, index_list, score)

    return no_match
| 119 | + |
| 120 | + |
def calculate_search_score(query: str, text: str, first_index: int, space_indices: List[int], match_length: int, all_substrings_contained_in_text: bool):
    """Compute the score of a substring match.

    The base score grows with the query length and shrinks as the match
    starts later (``first_index``) or spans more characters
    (``match_length``). Bonuses are then added for texts that are only
    slightly longer than the query and, when every query substring was found
    contiguously, for the number of non-space query characters.
    """
    denominator = (1 + first_index) + (match_length + 1)
    score = 100 * (len(query) + 1) / denominator

    # Penalise by the number of recorded spaces when the match starts at
    # offset 0 and all substrings were contained in the text.
    if all_substrings_contained_in_text and first_index == 0:
        score -= len(space_indices)

    # Texts close in length to the query get a bonus.
    length_gap = len(text) - len(query)
    if length_gap < 5:
        score += 20
    elif length_gap < 10:
        score += 10

    if not all_substrings_contained_in_text:
        return score

    # The first four non-space characters are worth 10 points each, every
    # further one 5 points.
    full_value_limit = 4
    non_space_chars = len(query) - query.count(' ')
    full_value_chars = min(non_space_chars, full_value_limit)
    extra_chars = max(non_space_chars - full_value_limit, 0)
    score += full_value_chars * 10 + extra_chars * 5

    return score
| 141 | + |
| 142 | + |
def get_updated_index_list(start_index_to_verify: int, current_query_substring_char_index: int, first_matched_index_in_word: int, index_list: List[int]) -> List[int]:
    """Replace the indices of an abandoned partial match with contiguous ones.

    Args:
        start_index_to_verify: Position where the contiguous match starts.
        current_query_substring_char_index: Number of contiguous positions to
            add, starting at ``start_index_to_verify``.
        first_matched_index_in_word: Every stored index greater than or equal
            to this value is discarded.
        index_list: Indices collected so far. It is pruned in place, as the
            original implementation also removed entries from it.

    Returns:
        A new list holding the surviving indices followed by the contiguous
        run of positions.
    """
    # Rebuild the list instead of popping while iterating: popping inside
    # enumerate() shifts later items left, so the element right after each
    # removal was skipped and stale indices survived.
    index_list[:] = [
        item for item in index_list if item < first_matched_index_in_word
    ]

    updated_list: List[int] = list(index_list)
    updated_list.extend(
        range(start_index_to_verify,
              start_index_to_verify + current_query_substring_char_index)
    )
    return updated_list
| 156 | + |
| 157 | + |
def all_query_substrings_matched_func(current_query_substring_index: int, query_substrings_length: int) -> bool:
    """Tell whether the substring cursor has reached or passed the number of query substrings."""
    substrings_left = query_substrings_length - current_query_substring_index
    return substrings_left <= 0
| 160 | + |
| 161 | + |
def all_previous_chars_matched(start_index_to_verify: int, current_query_substring_char_index: int, full_text_lower: str, current_query_substring: str) -> bool:
    """Check that the text starting at ``start_index_to_verify`` spells the
    first ``current_query_substring_char_index`` characters of the substring.

    Every position is compared (no early exit), and the result is True only
    when none of them differs.
    """
    mismatched_offsets = [
        offset
        for offset in range(current_query_substring_char_index)
        if full_text_lower[start_index_to_verify + offset] != current_query_substring[offset]
    ]
    return not mismatched_offsets
| 169 | + |
| 170 | + |
def is_acronym(text: str, text_index: int) -> bool:
    """Return True when the character at ``text_index`` may take part in an
    acronym match: either ``is_acronym_char`` or ``is_acronym_number``
    accepts it."""
    return is_acronym_char(text, text_index) or is_acronym_number(text, text_index)
| 175 | + |
| 176 | + |
def is_acronym_count(text: str, text_index: int) -> bool:
    """Return True when the character at ``text_index`` counts toward the
    total number of acronym positions.

    Characters accepted by ``is_acronym_char`` always count; a digit counts
    only at index 0 or directly after a space.
    """
    if is_acronym_char(text, text_index):
        return True

    if not is_acronym_number(text, text_index):
        return False

    at_text_start = text_index == 0
    return at_text_start or text[text_index - 1] == SPACE_CHAR
| 184 | + |
| 185 | + |
def is_acronym_char(text: str, text_index: int) -> bool:
    """Return True when the character at ``text_index`` is upper-case, is the
    first character of ``text``, or directly follows a space."""
    if text[text_index].isupper():
        return True
    if text_index == 0:
        return True
    return text[text_index - 1] == SPACE_CHAR
| 188 | + |
| 189 | + |
def is_acronym_number(text: str, text_index: int) -> bool:
    """Return True when the character of ``text`` at ``text_index`` is a digit."""
    candidate = text[text_index]
    return candidate.isdigit()
| 192 | + |
| 193 | + |
def calculate_closest_space_index(space_indices: List[int], first_match_index: int) -> int:
    """Return the last entry of ``space_indices`` seen before the first entry
    that is not smaller than ``first_match_index``, or -1 when there is none.

    Scanning stops at the first entry that is greater than or equal to
    ``first_match_index``.
    """
    nearest = -1

    for space_index in space_indices:
        if space_index >= first_match_index:
            return nearest
        nearest = space_index

    return nearest
0 commit comments