Skip to content

Commit b0f5869

Browse files
committed
Add metrics
1 parent 8532824 commit b0f5869

File tree

5 files changed

+43890
-1
lines changed

5 files changed

+43890
-1
lines changed

vllm/beam/emoji.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
from vllm.beam.emoji_data import EMOJI_DATA
2+
3+
# Module-level cache for the emoji search tree. It stays None until
# _get_search_tree() is first called, which builds the tree and stores it here.
_EMOJI_SEARCH_TREE = None
4+
5+
def emoji_count(input: str) -> int:
    """Return how many emoji occurrences ``emoji_list`` finds in ``input``."""
    matches = emoji_list(input)
    return len(matches)
7+
8+
def emoji_list(input: str) -> list:
    """Locate the emoji contained in ``input``.

    :param input: Text to scan.
    :return: A list with one dict per reported emoji, in order of appearance.
        Each dict has the keys ``'match_start'`` and ``'match_end'`` (the
        slice bounds of the match inside ``input``) and ``'emoji'`` (the
        matched text).
    """
    found = []

    def _record(symbol, info):
        # demojize() adds 'match_start'/'match_end' to the data dict it passes
        # to a callable handle_version; keep only the location and the emoji.
        found.append({
            'match_start': info['match_start'],
            'match_end': info['match_end'],
            'emoji': symbol,
        })

    # demojize() is used purely as a scanner; its return value is discarded.
    # With version=-1 the callback fires for every emoji whose 'E' value is
    # greater than -1 (presumably all of them -- confirm against EMOJI_DATA).
    # The callback returns None, so nothing is substituted for those emoji.
    demojize(input, language='en', version=-1, handle_version=_record)
    return found
20+
21+
def demojize(
        string,
        delimiters=(":", ":"),
        language='en',
        version=None,
        handle_version=None
):
    """
    Replace unicode emoji in a string with emoji shortcodes. Useful for storage.

        >>> print(demojize("Python is fun 👍"))
        Python is fun :thumbs_up:
        >>> print(demojize("Unicode is tricky 😯", delimiters=("__", "__")))
        Unicode is tricky __hushed_face__

    Matching is longest-match: at every position the longest emoji sequence
    known to the search tree is used. If the input starts a longer sequence
    but does not complete it (for example an emoji followed by a dangling
    zero-width joiner), the longest complete emoji seen so far is still
    replaced and scanning resumes right after it.

    :param string: String containing unicode characters. MUST BE UNICODE.
    :param delimiters: (optional) Pair of strings placed before and after the
        emoji name. Defaults to ``(":", ":")``.
    :param language: Choose language of emoji name: language code 'es', 'de',
        etc. or 'alias' to use English aliases. An emoji that has no entry
        for the requested language is kept unchanged.
    :param version: (optional) Max version. If set to an Emoji Version,
        all emoji above this version will be removed.
    :param handle_version: (optional) Replace the emoji above ``version``
        instead of removing it. handle_version can be either a string or a
        callable ``handle_version(emj: str, data: dict) -> str``; If it is
        a callable, it's passed the unicode emoji and a copy of the data dict
        from EMOJI_DATA, extended with the keys ``'match_start'`` and
        ``'match_end'`` (slice bounds of the match in ``string``), and must
        return a replacement string to be used. A falsy return value (such
        as ``None`` or ``''``) removes the emoji.
        The passed data is in the form of::

            handle_version('\\U0001F6EB', {
                'en' : ':airplane_departure:',
                'status' : fully_qualified,
                'E' : 1,
                'alias' : [':flight_departure:'],
                'de': ':abflug:',
                'es': ':avión_despegando:',
                ...
                'match_start': 0,
                'match_end': 1,
            })

    :return: The converted string.
    """
    use_aliases = language == 'alias'
    if use_aliases:
        # Aliases are English-only; fall back to 'en' when an emoji has none.
        language = 'en'

    tree = _get_search_tree()
    result = []
    i = 0
    length = len(string)
    while i < length:
        char = string[i]

        # Walk the search tree as far as the input allows, remembering the
        # last node that carried 'data' (i.e. the longest complete emoji).
        # Without this fallback, an emoji followed by the unfinished start of
        # a longer sequence would not be recognised at all.
        match_end = None
        match_data = None
        if char in tree:
            j = i + 1
            sub_tree = tree[char]
            if 'data' in sub_tree:
                match_end, match_data = j, sub_tree['data']
            while j < length and string[j] in sub_tree:
                sub_tree = sub_tree[string[j]]
                j += 1
                if 'data' in sub_tree:
                    match_end, match_data = j, sub_tree['data']

        if match_end is None:
            # Not an emoji: copy the character through, except for stray
            # variation selectors (U+FE0E / U+FE0F), which are dropped.
            if char != u'\ufe0e' and char != u'\ufe0f':
                result.append(char)
            i += 1
            continue

        code_points = string[i:match_end]
        replace_str = None
        if version is not None and match_data['E'] > version:
            # Emoji is newer than the allowed version: delegate to
            # handle_version, or remove it when no handler was given.
            if callable(handle_version):
                emj_data = match_data.copy()  # never mutate the shared data
                emj_data['match_start'] = i
                emj_data['match_end'] = match_end
                replace_str = handle_version(code_points, emj_data)
            elif handle_version is not None:
                replace_str = str(handle_version)
        elif language in match_data:
            if use_aliases and 'alias' in match_data:
                name = match_data['alias'][0]
            else:
                name = match_data[language]
            # Stored names carry their own one-character delimiters
            # (e.g. ':thumbs_up:'); strip them and apply the requested ones.
            replace_str = delimiters[0] + name[1:-1] + delimiters[1]
        else:
            # The emoji exists, but it is not translated, so we keep the emoji
            replace_str = code_points

        if replace_str:
            result.append(replace_str)
        i = match_end

    return "".join(result)
114+
115+
def _get_search_tree():
    """
    Generate a search tree for demojize().

    The tree is built once from ``EMOJI_DATA`` and cached in the module-level
    ``_EMOJI_SEARCH_TREE``; later calls return the cached object. The tree is
    assembled in a local variable and only stored in the global once it is
    complete, so a caller can never observe a half-populated tree (for
    instance from another thread, or after an exception during the build).

    Every node is a dict keyed by the next character of an emoji sequence.
    A node that terminates a complete emoji additionally has the key
    ``'data'`` holding that emoji's ``EMOJI_DATA`` entry.

    Example of a search tree::

        EMOJI_DATA =
            {'a': {'en': ':Apple:'},
             'b': {'en': ':Bus:'},
             'ba': {'en': ':Bat:'},
             'band': {'en': ':Beatles:'},
             'bandit': {'en': ':Outlaw:'},
             'bank': {'en': ':BankOfEngland:'},
             'bb': {'en': ':BB-gun:'},
             'c': {'en': ':Car:'}}

        _SEARCH_TREE =
            {'a': {'data': {'en': ':Apple:'}},
             'b': {'a': {'data': {'en': ':Bat:'},
                         'n': {'d': {'data': {'en': ':Beatles:'},
                                     'i': {'t': {'data': {'en': ':Outlaw:'}}}},
                               'k': {'data': {'en': ':BankOfEngland:'}}}},
                   'b': {'data': {'en': ':BB-gun:'}},
                   'data': {'en': ':Bus:'}},
             'c': {'data': {'en': ':Car:'}}}

                         _SEARCH_TREE
                        /     |        ⧵
                      /       |          ⧵
                   a          b             c
                   |        / |  ⧵          |
                   |       /  |    ⧵        |
               :Apple:   ba  :Bus:  bb     :Car:
                        /  ⧵         |
                       /    ⧵        |
                   :Bat:    ban    :BB-gun:
                           /     ⧵
                          /       ⧵
                       band       bank
                      /   ⧵         |
                     /     ⧵        |
                  bandi :Beatles:  :BankOfEngland:
                     |
                  bandit
                     |
                 :Outlaw:

    :return: The root dict of the (cached) search tree.
    """
    global _EMOJI_SEARCH_TREE
    if _EMOJI_SEARCH_TREE is not None:
        return _EMOJI_SEARCH_TREE

    root = {}
    for emj, emj_data in EMOJI_DATA.items():
        if not emj:
            # An empty key has no characters to index; skip it so the root
            # node never receives a 'data' entry.
            continue
        node = root
        for char in emj:
            # Descend, creating the child node on first use.
            node = node.setdefault(char, {})
        # `node` is now the node of the last character: mark a complete emoji.
        node['data'] = emj_data

    # Publish only the finished tree.
    _EMOJI_SEARCH_TREE = root
    return root

0 commit comments

Comments
 (0)