|
| 1 | +from vllm.beam.emoji_data import EMOJI_DATA |
| 2 | + |
| 3 | +_EMOJI_SEARCH_TREE = None |
| 4 | + |
def emoji_count(input: str) -> int:
    """Return how many emoji occurrences ``emoji_list`` finds in ``input``."""
    matches = emoji_list(input)
    return len(matches)
| 7 | + |
def emoji_list(input: str) -> list:
    """Locate the emoji contained in ``input``.

    :param input: Text to scan.
    :return: A list of dicts, in order of appearance, each holding
        ``'match_start'`` and ``'match_end'`` (the slice bounds of the
        match inside ``input``) and ``'emoji'`` (the matched characters).
    """
    found = []

    def record(matched_chars, info):
        # demojize() passes a copy of the emoji's data dict augmented with
        # the match offsets. Nothing is returned, so no replacement text
        # is produced for the match.
        found.append(dict(
            match_start=info['match_start'],
            match_end=info['match_end'],
            emoji=matched_chars,
        ))

    # version=-1 routes every emoji whose 'E' value exceeds -1 through the
    # callback; the string demojize() returns is deliberately discarded.
    demojize(input, language='en', version=-1, handle_version=record)
    return found
| 20 | + |
def demojize(
    string,
    delimiters=(":", ":"),
    language='en',
    version=None,
    handle_version=None
):
    """
    Replace unicode emoji in a string with emoji shortcodes. Useful for storage.

    Example (the shortcodes themselves come from ``EMOJI_DATA``)::

        demojize("Python is fun 👍")
        # -> 'Python is fun :thumbs_up:'
        demojize("Unicode is tricky 😯", delimiters=("__", "__"))
        # -> 'Unicode is tricky __hushed_face__'

    Matching is longest-match: at every position the longest emoji
    sequence present in the search tree is used. If a longer candidate
    turns out to be incomplete, the longest complete emoji seen on the way
    is used instead. Variation selectors (U+FE0E / U+FE0F) that are not
    part of a matched emoji are dropped from the output.

    :param string: String contains unicode characters. MUST BE UNICODE.
    :param delimiters: (optional) Pair of strings placed before and after
        the emoji name, replacing the default ``":"`` delimiters.
    :param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias'
        to use English aliases. An emoji with no entry for the requested
        language is kept unchanged.
    :param version: (optional) Max version. If set to an Emoji Version,
        all emoji above this version will be removed.
    :param handle_version: (optional) Replace the emoji above ``version``
        instead of removing it. handle_version can be either a string or a
        callable ``handle_version(emj: str, data: dict) -> str``; If it is
        a callable, it's passed the unicode emoji and a copy of the data
        dict from ``EMOJI_DATA`` extended with ``'match_start'`` and
        ``'match_end'`` (the slice bounds of the match in ``string``), and
        its return value is used as the replacement; a falsy return value
        (``None`` or ``''``) removes the emoji.
        The passed data is in the form of::

            handle_version(u'\\U0001F6EB', {
                'en' : ':airplane_departure:',
                'status' : fully_qualified,
                'E' : 1,
                'alias' : [u':flight_departure:'],
                'de': u':abflug:',
                'es': u':avión_despegando:',
                'match_start': 0,
                'match_end': 1,
                ...
            })

    :return: The string with every matched emoji replaced as described.
    """
    if language == 'alias':
        language = 'en'
        _use_aliases = True
    else:
        _use_aliases = False

    tree = _get_search_tree()
    result = []
    i = 0
    length = len(string)
    while i < length:
        consumed = False
        char = string[i]
        if char in tree:
            sub_tree = tree[char]
            j = i + 1
            # Remember the longest prefix that is a complete emoji. The
            # walk below may run past it into a partial sequence (e.g. an
            # emoji followed by a ZWJ that does not complete a longer
            # emoji); in that case we fall back to this match instead of
            # losing the emoji altogether.
            emj_data = None
            match_end = None
            if 'data' in sub_tree:
                emj_data = sub_tree['data']
                match_end = j
            while j < length and string[j] in sub_tree:
                sub_tree = sub_tree[string[j]]
                j += 1
                if 'data' in sub_tree:
                    emj_data = sub_tree['data']
                    match_end = j

            if match_end is not None:
                code_points = string[i:match_end]
                replace_str = None
                if version is not None and emj_data['E'] > version:
                    # Emoji is newer than the allowed version: delegate to
                    # handle_version, or drop it when none was supplied.
                    if callable(handle_version):
                        # Copy so the shared EMOJI_DATA entry is not mutated.
                        emj_data = emj_data.copy()
                        emj_data['match_start'] = i
                        emj_data['match_end'] = match_end
                        replace_str = handle_version(code_points, emj_data)
                    elif handle_version is not None:
                        replace_str = str(handle_version)
                elif language in emj_data:
                    if _use_aliases and 'alias' in emj_data:
                        name = emj_data['alias'][0]
                    else:
                        name = emj_data[language]
                    # Stored names carry their own one-character delimiters;
                    # strip them and apply the requested ones.
                    replace_str = delimiters[0] + name[1:-1] + delimiters[1]
                else:
                    # The emoji exists, but it is not translated, so we keep the emoji
                    replace_str = code_points

                # Resume right after the match (i is incremented below).
                i = match_end - 1
                consumed = True
                if replace_str:
                    result.append(replace_str)

        if not consumed and char != u'\ufe0e' and char != u'\ufe0f':
            result.append(char)
        i += 1

    return "".join(result)
| 114 | + |
def _get_search_tree():
    """
    Return the search tree used by demojize(), building it on first use.

    The tree is a trie over the keys of ``EMOJI_DATA``: every character of
    a key is one level of nested dicts, and the node reached after the
    last character stores the emoji's data dict under the key ``'data'``.
    The result is cached in the module-level ``_EMOJI_SEARCH_TREE``.

    Example of a search tree::

        EMOJI_DATA =
            {'a': {'en': ':Apple:'},
             'b': {'en': ':Bus:'},
             'ba': {'en': ':Bat:'},
             'band': {'en': ':Beatles:'},
             'bandit': {'en': ':Outlaw:'},
             'bank': {'en': ':BankOfEngland:'},
             'bb': {'en': ':BB-gun:'},
             'c': {'en': ':Car:'}}

        _SEARCH_TREE =
            {'a': {'data': {'en': ':Apple:'}},
             'b': {'a': {'data': {'en': ':Bat:'},
                         'n': {'d': {'data': {'en': ':Beatles:'},
                                     'i': {'t': {'data': {'en': ':Outlaw:'}}}},
                               'k': {'data': {'en': ':BankOfEngland:'}}}},
                   'b': {'data': {'en': ':BB-gun:'}},
                   'data': {'en': ':Bus:'}},
             'c': {'data': {'en': ':Car:'}}}

                       _SEARCH_TREE
                      /     |      ⧵
                    /       |        ⧵
                  a         b          c
                  |      /  |  ⧵       |
                  |     /   |    ⧵     |
              :Apple:  ba  :Bus:  bb  :Car:
                      /  ⧵         |
                     /    ⧵        |
                 :Bat:    ban   :BB-gun:
                         /   ⧵
                        /     ⧵
                     band     bank
                    /   ⧵       |
                   /     ⧵      |
               bandi  :Beatles: :BankOfEngland:
                 |
               bandit
                 |
              :Outlaw:

    :return: The (cached) root dict of the search tree.
    """
    global _EMOJI_SEARCH_TREE
    if _EMOJI_SEARCH_TREE is None:
        # Build into a local and publish only once complete, so a caller
        # never observes (and an exception during the build never leaves
        # behind) a half-populated cache.
        tree = {}
        for emj, emj_data in EMOJI_DATA.items():
            if not emj:
                # An empty key has no characters to index; skip it.
                continue
            node = tree
            for char in emj:
                node = node.setdefault(char, {})
            node['data'] = emj_data
        _EMOJI_SEARCH_TREE = tree
    return _EMOJI_SEARCH_TREE
0 commit comments