Skip to content

Commit 2b3d047

Browse files
Christoph Hellwig authored and krisman-at-collabora committed
unicode: Add utf8-data module
utf8data.h contains a large database table which is an auto-generated decoding trie for the Unicode normalization functions. Allow building it into a separate module. Based on a patch from Shreeya Patel <shreeya.patel@collabora.com>. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
1 parent 6ca99ce commit 2b3d047

File tree

9 files changed

+126
-91
lines changed

9 files changed

+126
-91
lines changed

fs/unicode/Kconfig

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,16 @@ config UNICODE
88
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
99
support.
1010

11+
config UNICODE_UTF8_DATA
12+
tristate "UTF-8 normalization and casefolding tables"
13+
depends on UNICODE
14+
default UNICODE
15+
help
16+
This contains a large table of case foldings, which can be loaded as
17+
a separate module if you say M here. To be on the safe side stick
18+
to the default of Y. Saying N here makes no sense; if you do not want
19+
utf8 casefolding support, disable CONFIG_UNICODE instead.
20+
1121
config UNICODE_NORMALIZATION_SELFTEST
1222
tristate "Test UTF-8 normalization support"
13-
depends on UNICODE
14-
default n
23+
depends on UNICODE_UTF8_DATA

fs/unicode/Makefile

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@
22

33
obj-$(CONFIG_UNICODE) += unicode.o
44
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
5+
obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8data.o
56

67
unicode-y := utf8-norm.o utf8-core.o
78

8-
$(obj)/utf8-norm.o: $(obj)/utf8data.h
9+
$(obj)/utf8-data.o: $(obj)/utf8data.c
910

10-
# In the normal build, the checked-in utf8data.h is just shipped.
11+
# In the normal build, the checked-in utf8data.c is just shipped.
1112
#
12-
# To generate utf8data.h from UCD, put *.txt files in this directory
13+
# To generate utf8data.c from UCD, put *.txt files in this directory
1314
# and pass REGENERATE_UTF8DATA=1 from the command line.
1415
ifdef REGENERATE_UTF8DATA
1516

@@ -24,15 +25,15 @@ quiet_cmd_utf8data = GEN $@
2425
-t $(srctree)/$(src)/NormalizationTest.txt \
2526
-o $@
2627

27-
$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
28+
$(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
2829
$(call if_changed,utf8data)
2930

3031
else
3132

32-
$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE
33+
$(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE
3334
$(call if_changed,shipped)
3435

3536
endif
3637

37-
targets += utf8data.h
38+
targets += utf8data.c
3839
hostprogs += mkutf8data

fs/unicode/mkutf8data.c

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3287,12 +3287,10 @@ static void write_file(void)
32873287
open_fail(utf8_name, errno);
32883288

32893289
fprintf(file, "/* This file is generated code, do not edit. */\n");
3290-
fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3291-
fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3292-
fprintf(file, "#endif\n");
32933290
fprintf(file, "\n");
3294-
fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3295-
unicode_maxage);
3291+
fprintf(file, "#include <linux/module.h>\n");
3292+
fprintf(file, "#include <linux/kernel.h>\n");
3293+
fprintf(file, "#include \"utf8n.h\"\n");
32963294
fprintf(file, "\n");
32973295
fprintf(file, "static const unsigned int utf8agetab[] = {\n");
32983296
for (i = 0; i != ages_count; i++)
@@ -3339,6 +3337,22 @@ static void write_file(void)
33393337
fprintf(file, "\n");
33403338
}
33413339
fprintf(file, "};\n");
3340+
fprintf(file, "\n");
3341+
fprintf(file, "struct utf8data_table utf8_data_table = {\n");
3342+
fprintf(file, "\t.utf8agetab = utf8agetab,\n");
3343+
fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
3344+
fprintf(file, "\n");
3345+
fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
3346+
fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
3347+
fprintf(file, "\n");
3348+
fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
3349+
fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
3350+
fprintf(file, "\n");
3351+
fprintf(file, "\t.utf8data = utf8data,\n");
3352+
fprintf(file, "};\n");
3353+
fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
3354+
fprintf(file, "\n");
3355+
fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
33423356
fclose(file);
33433357
}
33443358

fs/unicode/utf8-core.c

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -160,25 +160,45 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
160160
}
161161
EXPORT_SYMBOL(utf8_normalize);
162162

163+
static const struct utf8data *find_table_version(const struct utf8data *table,
164+
size_t nr_entries, unsigned int version)
165+
{
166+
size_t i = nr_entries - 1;
167+
168+
while (version < table[i].maxage)
169+
i--;
170+
if (version > table[i].maxage)
171+
return NULL;
172+
return &table[i];
173+
}
174+
163175
struct unicode_map *utf8_load(unsigned int version)
164176
{
165177
struct unicode_map *um;
166178

167-
if (!utf8version_is_supported(version))
168-
return ERR_PTR(-EINVAL);
169-
170179
um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
171180
if (!um)
172181
return ERR_PTR(-ENOMEM);
173182
um->version = version;
174-
um->ntab[UTF8_NFDI] = utf8nfdi(version);
175-
if (!um->ntab[UTF8_NFDI])
183+
184+
um->tables = symbol_request(utf8_data_table);
185+
if (!um->tables)
176186
goto out_free_um;
177-
um->ntab[UTF8_NFDICF] = utf8nfdicf(version);
187+
188+
if (!utf8version_is_supported(um, version))
189+
goto out_symbol_put;
190+
um->ntab[UTF8_NFDI] = find_table_version(um->tables->utf8nfdidata,
191+
um->tables->utf8nfdidata_size, um->version);
192+
if (!um->ntab[UTF8_NFDI])
193+
goto out_symbol_put;
194+
um->ntab[UTF8_NFDICF] = find_table_version(um->tables->utf8nfdicfdata,
195+
um->tables->utf8nfdicfdata_size, um->version);
178196
if (!um->ntab[UTF8_NFDICF])
179-
goto out_free_um;
197+
goto out_symbol_put;
180198
return um;
181199

200+
out_symbol_put:
201+
symbol_put(um->tables);
182202
out_free_um:
183203
kfree(um);
184204
return ERR_PTR(-EINVAL);
@@ -187,7 +207,10 @@ EXPORT_SYMBOL(utf8_load);
187207

188208
void utf8_unload(struct unicode_map *um)
189209
{
190-
kfree(um);
210+
if (um) {
211+
symbol_put(utf8_data_table);
212+
kfree(um);
213+
}
191214
}
192215
EXPORT_SYMBOL(utf8_unload);
193216

fs/unicode/utf8-norm.c

Lines changed: 9 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,12 @@
66

77
#include "utf8n.h"
88

9-
struct utf8data {
10-
unsigned int maxage;
11-
unsigned int offset;
12-
};
13-
14-
#define __INCLUDED_FROM_UTF8NORM_C__
15-
#include "utf8data.h"
16-
#undef __INCLUDED_FROM_UTF8NORM_C__
17-
18-
int utf8version_is_supported(unsigned int version)
9+
int utf8version_is_supported(const struct unicode_map *um, unsigned int version)
1910
{
20-
int i = ARRAY_SIZE(utf8agetab) - 1;
11+
int i = um->tables->utf8agetab_size - 1;
2112

22-
while (i >= 0 && utf8agetab[i] != 0) {
23-
if (version == utf8agetab[i])
13+
while (i >= 0 && um->tables->utf8agetab[i] != 0) {
14+
if (version == um->tables->utf8agetab[i])
2415
return 1;
2516
i--;
2617
}
@@ -161,7 +152,7 @@ typedef const unsigned char utf8trie_t;
161152
* underlying datatype: unsigned char.
162153
*
163154
* leaf[0]: The unicode version, stored as a generation number that is
164-
* an index into utf8agetab[]. With this we can filter code
155+
* an index into ->utf8agetab[]. With this we can filter code
165156
* points based on the unicode version in which they were
166157
* defined. The CCC of a non-defined code point is 0.
167158
* leaf[1]: Canonical Combining Class. During normalization, we need
@@ -313,7 +304,7 @@ static utf8leaf_t *utf8nlookup(const struct unicode_map *um,
313304
enum utf8_normalization n, unsigned char *hangul, const char *s,
314305
size_t len)
315306
{
316-
utf8trie_t *trie = utf8data + um->ntab[n]->offset;
307+
utf8trie_t *trie = um->tables->utf8data + um->ntab[n]->offset;
317308
int offlen;
318309
int offset;
319310
int mask;
@@ -404,7 +395,8 @@ ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
404395
leaf = utf8nlookup(um, n, hangul, s, len);
405396
if (!leaf)
406397
return -1;
407-
if (utf8agetab[LEAF_GEN(leaf)] > um->ntab[n]->maxage)
398+
if (um->tables->utf8agetab[LEAF_GEN(leaf)] >
399+
um->ntab[n]->maxage)
408400
ret += utf8clen(s);
409401
else if (LEAF_CCC(leaf) == DECOMPOSE)
410402
ret += strlen(LEAF_STR(leaf));
@@ -520,7 +512,7 @@ int utf8byte(struct utf8cursor *u8c)
520512

521513
ccc = LEAF_CCC(leaf);
522514
/* Characters that are too new have CCC 0. */
523-
if (utf8agetab[LEAF_GEN(leaf)] >
515+
if (u8c->um->tables->utf8agetab[LEAF_GEN(leaf)] >
524516
u8c->um->ntab[u8c->n]->maxage) {
525517
ccc = STOPPER;
526518
} else if (ccc == DECOMPOSE) {
@@ -597,25 +589,3 @@ int utf8byte(struct utf8cursor *u8c)
597589
}
598590
}
599591
EXPORT_SYMBOL(utf8byte);
600-
601-
const struct utf8data *utf8nfdi(unsigned int maxage)
602-
{
603-
int i = ARRAY_SIZE(utf8nfdidata) - 1;
604-
605-
while (maxage < utf8nfdidata[i].maxage)
606-
i--;
607-
if (maxage > utf8nfdidata[i].maxage)
608-
return NULL;
609-
return &utf8nfdidata[i];
610-
}
611-
612-
const struct utf8data *utf8nfdicf(unsigned int maxage)
613-
{
614-
int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
615-
616-
while (maxage < utf8nfdicfdata[i].maxage)
617-
i--;
618-
if (maxage > utf8nfdicfdata[i].maxage)
619-
return NULL;
620-
return &utf8nfdicfdata[i];
621-
}

fs/unicode/utf8-selftest.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -255,21 +255,21 @@ static void check_utf8_comparisons(struct unicode_map *table)
255255
}
256256
}
257257

258-
static void check_supported_versions(void)
258+
static void check_supported_versions(struct unicode_map *um)
259259
{
260260
/* Unicode 7.0.0 should be supported. */
261-
test(utf8version_is_supported(UNICODE_AGE(7, 0, 0)));
261+
test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
262262

263263
/* Unicode 9.0.0 should be supported. */
264-
test(utf8version_is_supported(UNICODE_AGE(9, 0, 0)));
264+
test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
265265

266266
/* Unicode 1x.0.0 (the latest version) should be supported. */
267-
test(utf8version_is_supported(UTF8_LATEST));
267+
test(utf8version_is_supported(um, UTF8_LATEST));
268268

269269
/* Next versions don't exist. */
270-
test(!utf8version_is_supported(UNICODE_AGE(13, 0, 0)));
271-
test(!utf8version_is_supported(UNICODE_AGE(0, 0, 0)));
272-
test(!utf8version_is_supported(UNICODE_AGE(-1, -1, -1)));
270+
test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
271+
test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
272+
test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
273273
}
274274

275275
static int __init init_test_ucd(void)
@@ -285,7 +285,7 @@ static int __init init_test_ucd(void)
285285
return PTR_ERR(um);
286286
}
287287

288-
check_supported_versions();
288+
check_supported_versions(um);
289289
check_utf8_nfdi(um);
290290
check_utf8_nfdicf(um);
291291
check_utf8_comparisons(um);

fs/unicode/utf8data.h_shipped renamed to fs/unicode/utf8data.c_shipped

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
/* This file is generated code, do not edit. */
2-
#ifndef __INCLUDED_FROM_UTF8NORM_C__
3-
#error Only nls_utf8-norm.c should include this file.
4-
#endif
52

6-
static const unsigned int utf8vers = 0xc0100;
3+
#include <linux/module.h>
4+
#include <linux/kernel.h>
5+
#include "utf8n.h"
76

87
static const unsigned int utf8agetab[] = {
98
0,
@@ -4107,3 +4106,18 @@ static const unsigned char utf8data[64256] = {
41074106
0x52,0x04,0x00,0x00,0x11,0x04,0x00,0x00,0x02,0x00,0xcf,0x86,0xcf,0x06,0x02,0x00,
41084107
0x81,0x80,0xcf,0x86,0x85,0x84,0xcf,0x86,0xcf,0x06,0x02,0x00,0x00,0x00,0x00,0x00
41094108
};
4109+
4110+
struct utf8data_table utf8_data_table = {
4111+
.utf8agetab = utf8agetab,
4112+
.utf8agetab_size = ARRAY_SIZE(utf8agetab),
4113+
4114+
.utf8nfdicfdata = utf8nfdicfdata,
4115+
.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),
4116+
4117+
.utf8nfdidata = utf8nfdidata,
4118+
.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),
4119+
4120+
.utf8data = utf8data,
4121+
};
4122+
EXPORT_SYMBOL_GPL(utf8_data_table);
4123+
MODULE_LICENSE("GPL v2");

fs/unicode/utf8n.h

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,7 @@
1313
#include <linux/module.h>
1414
#include <linux/unicode.h>
1515

16-
int utf8version_is_supported(unsigned int version);
17-
18-
/*
19-
* Look for the correct const struct utf8data for a unicode version.
20-
* Returns NULL if the version requested is too new.
21-
*
22-
* Two normalization forms are supported: nfdi and nfdicf.
23-
*
24-
* nfdi:
25-
* - Apply unicode normalization form NFD.
26-
* - Remove any Default_Ignorable_Code_Point.
27-
*
28-
* nfdicf:
29-
* - Apply unicode normalization form NFD.
30-
* - Remove any Default_Ignorable_Code_Point.
31-
* - Apply a full casefold (C + F).
32-
*/
33-
extern const struct utf8data *utf8nfdi(unsigned int maxage);
34-
extern const struct utf8data *utf8nfdicf(unsigned int maxage);
16+
int utf8version_is_supported(const struct unicode_map *um, unsigned int version);
3517

3618
/*
3719
* Determine the length of the normalized form of the string,
@@ -78,4 +60,24 @@ int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
7860
*/
7961
extern int utf8byte(struct utf8cursor *u8c);
8062

63+
struct utf8data {
64+
unsigned int maxage;
65+
unsigned int offset;
66+
};
67+
68+
struct utf8data_table {
69+
const unsigned int *utf8agetab;
70+
int utf8agetab_size;
71+
72+
const struct utf8data *utf8nfdicfdata;
73+
int utf8nfdicfdata_size;
74+
75+
const struct utf8data *utf8nfdidata;
76+
int utf8nfdidata_size;
77+
78+
const unsigned char *utf8data;
79+
};
80+
81+
extern struct utf8data_table utf8_data_table;
82+
8183
#endif /* UTF8NORM_H */

include/linux/unicode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <linux/dcache.h>
77

88
struct utf8data;
9+
struct utf8data_table;
910

1011
#define UNICODE_MAJ_SHIFT 16
1112
#define UNICODE_MIN_SHIFT 8
@@ -49,6 +50,7 @@ enum utf8_normalization {
4950
struct unicode_map {
5051
unsigned int version;
5152
const struct utf8data *ntab[UTF8_NMAX];
53+
const struct utf8data_table *tables;
5254
};
5355

5456
int utf8_validate(const struct unicode_map *um, const struct qstr *str);

0 commit comments

Comments
 (0)