Skip to content

Commit 79843f0

Browse files
lyrixx authored and nicolas-grekas committed
[Intl] Add EmojiTransliterator to translate emoji to many locales
1 parent 9fc07ba commit 79843f0

File tree

9 files changed

+351
-1
lines changed

9 files changed

+351
-1
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
/phpunit.xml.dist export-ignore
33
/.gitattributes export-ignore
44
/.gitignore export-ignore
5+
/Resources/emoji export-ignore

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
CHANGELOG
22
=========
33

4+
6.2
5+
---
6+
7+
* Add `EmojiTransliterator` to translate emoji to many locales
8+
49
6.0
510
---
611

Resources/emoji/Makefile

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
.PHONY: help update build
2+
.DEFAULT_GOAL := help
3+
4+
update: ## Update sources
5+
@composer update
6+
7+
build: ## Build rules
8+
@./build.php
9+
10+
help:
11+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-7s\033[0m %s\n", $$1, $$2}'

Resources/emoji/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Emoji Transliterator Builder
2+
3+
This folder contains the tool to build all transliterator rules.
4+
5+
## Requirements
6+
7+
* composer
8+
* PHP
9+
10+
## Update the rules
11+
12+
To update the rules, you need to update the version of `unicode-org/cldr` in the
13+
`composer.json` file, then run `make update`.
14+
15+
Finally, run the following command:
16+
17+
```bash
18+
make build
19+
```

Resources/emoji/build.php

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#!/usr/bin/env php
2+
<?php
3+
4+
/*
5+
* This file is part of the Symfony package.
6+
*
7+
* (c) Fabien Potencier <fabien@symfony.com>
8+
*
9+
* For the full copyright and license information, please view the LICENSE
10+
* file that was distributed with this source code.
11+
*/
12+
13+
require __DIR__.'/vendor/autoload.php';
14+
15+
use Symfony\Component\Filesystem\Filesystem;
16+
use Symfony\Component\Finder\Finder;
17+
18+
// Build every rule set in memory BEFORE touching the target directory:
// buildRules() is a lazy generator, so parsing errors would otherwise only
// surface after cleanTarget() has already removed the existing rule files.
$rulesByLocale = iterator_to_array(Builder::buildRules(Builder::getEmojisCodePoints()));

Builder::cleanTarget();
Builder::saveRules($rulesByLocale);
21+
22+
/**
 * Generates the emoji transliterator rules of every locale from the CLDR
 * sources installed under vendor/ and writes one rule file per locale.
 */
final class Builder
{
    // Ends with a directory separator: file names are appended directly.
    private const TARGET_DIR = __DIR__.'/../data/transliterator/emoji/';
    private const EMOJI_TEST_FILE = __DIR__.'/vendor/unicode-org/cldr/tools/cldr-code/src/main/resources/org/unicode/cldr/util/data/emoji/emoji-test.txt';

    /**
     * Parses emoji-test.txt and returns the emoji it lists.
     *
     * Each emoji is registered twice: under its full code point sequence and
     * under the same sequence stripped of "Zero Width Joiner" (200d) entries.
     *
     * @return array<array-key, string> emoji indexed by their space-separated, lowercase hexadecimal code points
     *
     * @throws \RuntimeException when emoji-test.txt cannot be read
     * @throws \DomainException  when a data line does not have the expected format
     */
    public static function getEmojisCodePoints(): array
    {
        if (!is_file(self::EMOJI_TEST_FILE) || false === $lines = file(self::EMOJI_TEST_FILE)) {
            throw new \RuntimeException(sprintf('Could not read "%s". Did you run "make update"?', self::EMOJI_TEST_FILE));
        }

        $emojisCodePoints = [];
        foreach ($lines as $line) {
            $line = trim($line);
            // Skip blank lines and comments
            if ('' === $line || str_starts_with($line, '#')) {
                continue;
            }

            // 263A FE0F ; fully-qualified # ☺️ E0.6 smiling face
            if (!preg_match('{^(?<codePoints>[\w ]+) +; [\w-]+ +# (?<emoji>.+) E\d+\.\d+ ?(?<name>.+)$}Uu', $line, $matches)) {
                throw new \DomainException("Could not parse line: \"$line\".");
            }

            $codePoints = strtolower(trim($matches['codePoints']));
            $emojisCodePoints[$codePoints] = $matches['emoji'];
            // We also add a version without the "Zero Width Joiner"
            $codePoints = str_replace('200d ', '', $codePoints);
            $emojisCodePoints[$codePoints] = $matches['emoji'];
        }

        return $emojisCodePoints;
    }

    /**
     * Reads the "tts" annotations of every CLDR annotation file and yields the
     * transliterator rules of each locale.
     *
     * Annotations whose code points are not keys of $emojisCodePoints are skipped.
     *
     * @param array<array-key, string> $emojisCodePoints as returned by getEmojisCodePoints()
     *
     * @return \Generator<string, string> rules indexed by locale
     *
     * @throws \RuntimeException when an annotation file cannot be parsed or an emoji cannot be used in a rule
     */
    public static function buildRules(array $emojisCodePoints): Generator
    {
        $files = (new Finder())
            ->files()
            ->in([
                __DIR__.'/vendor/unicode-org/cldr/common/annotationsDerived',
                __DIR__.'/vendor/unicode-org/cldr/common/annotations',
            ])
            ->name('*.xml')
        ;

        $mapsByLocale = [];

        foreach ($files as $file) {
            $locale = $file->getBasename('.xml');

            $document = new DOMDocument();
            if (!$document->loadXML(file_get_contents($file->getPathname()))) {
                throw new \RuntimeException(sprintf('Could not parse annotation file "%s".', $file->getPathname()));
            }
            $xpath = new DOMXPath($document);

            foreach ($xpath->query('.//annotation[@type="tts"]') as $annotation) {
                $emoji = $annotation->getAttribute('cp');
                $name = $annotation->textContent;

                // Same key format as getEmojisCodePoints(): "1f408 200d 2b1b"
                $characters = preg_split('//u', $emoji, -1, \PREG_SPLIT_NO_EMPTY);
                $emojiCodePoints = implode(' ', array_map(
                    static fn (string $character): string => dechex(mb_ord($character, 'UTF-8')),
                    $characters,
                ));

                if (!array_key_exists($emojiCodePoints, $emojisCodePoints)) {
                    // Not listed in emoji-test.txt: no rule is generated
                    continue;
                }

                self::testEmoji($emoji, $locale);
                // Group by length so that createRules() can emit the longest sequences first
                $mapsByLocale[$locale][mb_strlen($emoji, 'UTF-8')][$emoji] = $name;
            }
        }

        foreach ($mapsByLocale as $locale => $maps) {
            yield $locale => self::createRules($maps);
        }
    }

    /**
     * Removes the target directory with everything in it, then recreates it empty.
     */
    public static function cleanTarget(): void
    {
        $fs = new Filesystem();
        $fs->remove(self::TARGET_DIR);
        $fs->mkdir(self::TARGET_DIR);
    }

    /**
     * Writes each rule set to "<locale>.txt" in the target directory.
     *
     * @param iterable<string, string> $rulesByLocale
     *
     * @throws \RuntimeException when a rule file cannot be written
     */
    public static function saveRules(iterable $rulesByLocale): void
    {
        foreach ($rulesByLocale as $locale => $rules) {
            $filename = self::TARGET_DIR."$locale.txt";
            if (false === file_put_contents($filename, $rules)) {
                throw new \RuntimeException(sprintf('Could not write rules to "%s".', $filename));
            }
        }
    }

    /**
     * Ensures a transliterator can be created from a rule starting with the given emoji.
     *
     * @throws \RuntimeException when the rule is rejected
     */
    private static function testEmoji(string $emoji, string $locale): void
    {
        if (!Transliterator::createFromRules("\\$emoji > test ;")) {
            throw new \RuntimeException(sprintf('Could not create transliterator for "%s" in "%s" locale. Error: "%s".', $emoji, $locale, intl_get_error_message()));
        }
    }

    /**
     * Serializes the maps of one locale into transliterator rules, one rule per line.
     *
     * @param array<int, array<string, string>> $maps emoji => name maps indexed by the number of code points of the emoji
     */
    private static function createRules(array $maps): string
    {
        // We must sort the maps by the number of code points, because the order really matters:
        // 🫶🏼 must be before 🫶
        krsort($maps);
        $maps = array_merge(...$maps);

        $rules = '';
        foreach ($maps as $emoji => $name) {
            // Backslash-escape every non-alphanumeric character of the name
            $name = preg_replace('{([^[:alnum:]])}u', '\\\\$1', $name);
            $rules .= "\\$emoji > $name ;\n";
        }

        return $rules;
    }
}

Resources/emoji/composer.json

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"repositories": [
3+
{
4+
"type": "package",
5+
"package": {
6+
"name": "unicode-org/cldr",
7+
"version": "2022.06.29",
8+
"source": {
9+
"type": "git",
10+
"url": "https://github.com/unicode-org/cldr",
11+
"reference": "production/2022-06-29-1740z"
12+
}
13+
}
14+
}
15+
],
16+
"require": {
17+
"php": ">=7.2",
18+
"symfony/filesystem": "^6",
19+
"symfony/finder": "^6",
20+
"unicode-org/cldr": "*"
21+
}
22+
}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\Component\Intl\Tests\Transliterator;
13+
14+
use PHPUnit\Framework\TestCase;
15+
use Symfony\Component\Finder\Finder;
16+
use Symfony\Component\Intl\Transliterator\EmojiTransliterator;
17+
18+
/**
19+
* @requires extension intl
20+
*/
21+
final class EmojiTransliteratorTest extends TestCase
{
    /**
     * NOTE(review): several expected strings appear to end with an invisible
     * variation selector (U+FE0F) carried over from the input emoji; make sure
     * your editor preserves it when touching these fixtures.
     *
     * @return iterable<string, array{string, string, string}> locale, input and expected output
     */
    public static function provideTransliterateTests(): iterable
    {
        yield 'fr sentence' => [
            'fr',
            'un 😺, 🐈‍⬛, et a 🦁 vont au 🏞️',
            'un chat qui sourit, chat noir, et a tête de lion vont au parc national️',
        ];
        yield 'en sentence' => [
            'en',
            'a 😺, 🐈‍⬛, and a 🦁 go to 🏞️... 😍 🎉 💛',
            'a grinning cat, black cat, and a lion go to national park️... smiling face with heart-eyes party popper yellow heart',
        ];

        $specialArrowInput = '↔ - ↔️'; // The first arrow is particularly problematic!
        yield 'en arrows' => [
            'en',
            $specialArrowInput,
            'left-right arrow - left-right arrow️',
        ];
        yield 'fr arrows' => [
            'fr',
            $specialArrowInput,
            'flèche gauche droite - flèche gauche droite️',
        ];
    }

    /** @dataProvider provideTransliterateTests */
    public function testTransliterate(string $locale, string $input, string $expected)
    {
        $tr = EmojiTransliterator::getInstance($locale);

        $this->assertSame($expected, $tr->transliterate($input));
    }

    public function testTransliteratorCache()
    {
        $tr1 = EmojiTransliterator::getInstance('en');
        $tr2 = EmojiTransliterator::getInstance('en');

        $this->assertSame($tr1, $tr2);
    }

    /**
     * Yields every locale that has a rule file in the data directory.
     *
     * @return iterable<string, array{string}>
     */
    public static function provideLocaleTest(): iterable
    {
        $files = (new Finder())
            ->in(__DIR__.'/../../Resources/data/transliterator/emoji')
            ->name('*.txt')
            ->files()
        ;

        foreach ($files as $file) {
            $locale = $file->getBasename('.txt');

            yield $locale => [$locale];
        }
    }

    /** @dataProvider provideLocaleTest */
    public function testAllTransliterator(string $locale)
    {
        $tr = EmojiTransliterator::getInstance($locale);

        $this->assertNotEmpty($tr->transliterate('😀'));
    }

    public function testTransliterateWithInvalidLocale()
    {
        $this->expectException(\InvalidArgumentException::class);
        $this->expectExceptionMessage('Invalid "../emoji/en" locale.');

        EmojiTransliterator::getInstance('../emoji/en');
    }

    public function testTransliterateWithMissingLocale()
    {
        $this->expectException(\RuntimeException::class);
        $this->expectExceptionMessage('The transliterator rules source does not exist for locale "invalid".');

        EmojiTransliterator::getInstance('invalid');
    }

    public function testTransliterateWithBrokenLocale()
    {
        // Temporarily drop a syntactically invalid rule file next to the real ones
        $brokenFilename = __DIR__.'/../../Resources/data/transliterator/emoji/broken.txt';
        file_put_contents($brokenFilename, '😀 > oups\' ;');

        $this->expectException(\RuntimeException::class);
        $this->expectExceptionMessage('Unable to create EmojiTransliterator instance: "transliterator_create_from_rules: unable to create ICU transliterator from rules (parse error at offset 4, after "😀 >", before or at " oups\' ;"): U_UNTERMINATED_QUOTE".');

        try {
            EmojiTransliterator::getInstance('broken');
        } finally {
            // Always remove the fixture so that it is not picked up by provideLocaleTest()
            unlink($brokenFilename);
        }
    }
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\Component\Intl\Transliterator;
13+
14+
/**
 * Gives access to transliterators built from the per-locale emoji rule files
 * stored in Resources/data/transliterator/emoji.
 */
final class EmojiTransliterator
{
    /** @var array<string, \Transliterator> instances already created, indexed by locale */
    private static array $transliteratorsByLocale = [];

    /**
     * Returns the transliterator of the given locale, creating it on first use.
     *
     * @throws \InvalidArgumentException when the locale contains unexpected characters
     * @throws \RuntimeException         when the rules of the locale are missing, unreadable or invalid
     */
    public static function getInstance(string $locale): \Transliterator
    {
        return self::$transliteratorsByLocale[$locale] ??= self::createTransliterator($locale);
    }

    private static function createTransliterator(string $locale): \Transliterator
    {
        // The locale ends up in a file name, so only a conservative set of characters is accepted.
        // The "D" modifier stops "$" from matching before a trailing newline ("en\n").
        if (!preg_match('/^[a-z0-9@_\\.\\-]*$/iD', $locale)) {
            throw new \InvalidArgumentException(sprintf('Invalid "%s" locale.', $locale));
        }

        $rulesFilename = __DIR__."/../Resources/data/transliterator/emoji/$locale.txt";
        if (!is_file($rulesFilename)) {
            throw new \RuntimeException(sprintf('The transliterator rules source does not exist for locale "%s".', $locale));
        }

        if (false === $rules = file_get_contents($rulesFilename)) {
            throw new \RuntimeException(sprintf('The transliterator rules source could not be read for locale "%s".', $locale));
        }

        if (!$transliterator = \Transliterator::createFromRules($rules)) {
            throw new \RuntimeException(sprintf('Unable to create EmojiTransliterator instance: "%s".', intl_get_error_message()));
        }

        return $transliterator;
    }
}

composer.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
"php": ">=8.1"
2828
},
2929
"require-dev": {
30-
"symfony/filesystem": "^5.4|^6.0"
30+
"symfony/filesystem": "^5.4|^6.0",
31+
"symfony/finder": "^5.4|^6.0"
3132
},
3233
"autoload": {
3334
"psr-4": { "Symfony\\Component\\Intl\\": "" },

0 commit comments

Comments
 (0)