Skip to content

Commit c946931

Browse files
committed
Add binary:split/3 and string:find/2,3
Also update `string:split/2,3` to work with unicode binaries and fix specs. Signed-off-by: Paul Guyot <pguyot@kallisys.net>
1 parent 4b37211 commit c946931

File tree

10 files changed

+266
-38
lines changed

10 files changed

+266
-38
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ also non string parameters (e.g. `Enum.join([1, 2], ",")`
2727
- Support for `code:ensure_loaded/1`
2828
- Support for `io_lib:latin1_char_list/1`
2929
- Add support to Elixir for `Keyword.split/2`
30+
- Support for `binary:split/3` and `string:find/2,3`
3031

3132
### Changed
3233

libs/estdlib/src/binary.erl

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
%%-----------------------------------------------------------------------------
2525
-module(binary).
2626

27-
-export([at/2, part/3, split/2]).
27+
-export([at/2, part/3, split/2, split/3]).
2828

2929
%%-----------------------------------------------------------------------------
3030
%% @param Binary binary to get a byte from
@@ -51,6 +51,7 @@ part(_Binary, _Pos, _Len) ->
5151
erlang:nif_error(undefined).
5252

5353
%%-----------------------------------------------------------------------------
54+
%% @equiv split(Binary, Pattern, [])
5455
%% @param Binary binary to split
5556
%% @param Pattern pattern to perform the split
5657
%% @return a list composed of one or two binaries
@@ -62,3 +63,18 @@ part(_Binary, _Pos, _Len) ->
6263
-spec split(Binary :: binary(), Pattern :: binary()) -> [binary()].
6364
split(_Binary, _Pattern) ->
6465
erlang:nif_error(undefined).
66+
67+
%%-----------------------------------------------------------------------------
68+
%% @param Binary binary to split
69+
%% @param Pattern pattern to perform the split
70+
%% @param Options options for the split
71+
%% @return a list of binaries: one or two without `global', possibly more with it
72+
%% @doc Split a binary according to pattern.
73+
%% If pattern is not found, returns a singleton list with the passed binary.
74+
%% Unlike Erlang/OTP, pattern must be a binary.
75+
%% Only implemented option is `global'
76+
%% @end
77+
%%-----------------------------------------------------------------------------
78+
-spec split(Binary :: binary(), Pattern :: binary(), Option :: [global]) -> [binary()].
79+
split(_Binary, _Pattern, _Option) ->
80+
erlang:nif_error(undefined).

libs/estdlib/src/string.erl

Lines changed: 93 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
%%-----------------------------------------------------------------------------
2828
-module(string).
2929

30-
-export([to_upper/1, to_lower/1, split/2, split/3, trim/1, trim/2]).
30+
-export([to_upper/1, to_lower/1, split/2, split/3, trim/1, trim/2, find/2, find/3]).
3131

3232
%%-----------------------------------------------------------------------------
3333
%% @param Input a string or character to convert
@@ -76,7 +76,7 @@ lower_char(C) when is_integer(C) ->
7676
%% @returns chardata
7777
%% @end
7878
%%-----------------------------------------------------------------------------
79-
-spec split(String :: string(), Pattern :: string()) -> string() | char().
79+
-spec split(String :: unicode:chardata(), Pattern :: unicode:chardata()) -> [unicode:chardata()].
8080
split(String, Pattern) ->
8181
split(String, Pattern, leading).
8282

@@ -98,25 +98,50 @@ split(String, Pattern) ->
9898
%% [<<"ab">>,<<"bc">>,<<>>,<<"cd">>]'''
9999
%% @end
100100
%%-----------------------------------------------------------------------------
101-
-spec split(String :: string(), Pattern :: string() | char(), Where :: atom()) -> string() | char().
102-
split(String, Pattern, Where) ->
103-
split(String, Pattern, Where, [], []).
101+
%% Dispatches on the representation of String (list or binary). The pattern is
%% first converted to that same representation, then the matching private
%% splitter does the work. Any other combination of argument types fails with
%% a function_clause error.
-spec split(
    String :: unicode:chardata(), Pattern :: unicode:chardata(), Where :: leading | trailing | all
) -> [unicode:chardata()].
split(String, Pattern, Where) when is_list(String), is_list(Pattern) ->
    split_list(String, Pattern, Where);
split(String, Pattern, Where) when is_list(String), is_binary(Pattern) ->
    PatternChars = unicode:characters_to_list(Pattern),
    split_list(String, PatternChars, Where);
split(String, Pattern, Where) when is_binary(String), is_binary(Pattern) ->
    split_binary(String, Pattern, Where);
split(String, Pattern, Where) when is_binary(String), is_list(Pattern) ->
    PatternBin = unicode:characters_to_binary(Pattern),
    split_binary(String, PatternBin, Where).
104112

105113
%% @private
106-
split([], _Pattern, _Where, Token, Accum) ->
114+
%% Split a binary string around a binary pattern.
%%
%% `leading' splits at the first occurrence and `all' at every occurrence;
%% both delegate to binary:split/2,3. `trailing' splits at the last
%% occurrence, located by scanning backwards from the end of String so that
%% this function does not depend on any other search helper.
%% An empty pattern with `trailing' yields [String], as in Erlang/OTP.
%% When the pattern does not occur, the result is [String].
split_binary(String, Pattern, leading) ->
    binary:split(String, Pattern);
split_binary(String, Pattern, all) ->
    binary:split(String, Pattern, [global]);
split_binary(String, <<>>, trailing) ->
    [String];
split_binary(String, Pattern, trailing) ->
    %% Start at the last byte offset where Pattern could still fit.
    split_binary_trailing(String, Pattern, byte_size(String) - byte_size(Pattern)).

%% @private
%% Walk Pos down towards 0; the first offset at which Pattern matches is the
%% last occurrence in String. A negative Pos means no occurrence exists.
split_binary_trailing(String, _Pattern, Pos) when Pos < 0 ->
    [String];
split_binary_trailing(String, Pattern, Pos) ->
    PatternSize = byte_size(Pattern),
    case binary:part(String, Pos, PatternSize) of
        Pattern ->
            RestPos = Pos + PatternSize,
            [
                binary:part(String, 0, Pos),
                binary:part(String, RestPos, byte_size(String) - RestPos)
            ];
        _NoMatchHere ->
            split_binary_trailing(String, Pattern, Pos - 1)
    end.
125+
126+
%% @private
127+
%% Split a character list around occurrences of Pattern (also a list).
%%
%% `leading' splits at the first occurrence, `all' at every occurrence and
%% `trailing' at the last occurrence. When Pattern does not occur, the result
%% is a singleton list holding the whole string.
%% `trailing' has its own single-pass helper because the last occurrence is
%% only known once the entire string has been scanned. An empty pattern with
%% `trailing' yields [String], as in Erlang/OTP.
split_list(String, [], trailing) ->
    [String];
split_list(String, Pattern, trailing) ->
    split_list_trailing(String, Pattern, [], nomatch);
split_list(String, Pattern, Where) ->
    split_list(String, Pattern, Where, [], []).

%% @private
%% Worker for `leading' and `all'. Token accumulates (in reverse) the
%% characters of the segment being built; Accum accumulates (in reverse) the
%% segments already completed.
split_list([], _Pattern, _Where, Token, Accum) ->
    lists:reverse([lists:reverse(Token) | Accum]);
split_list(String, Pattern, Where, Token, Accum) ->
    case prefix_match(String, Pattern) of
        {ok, Rest} ->
            case Where of
                leading ->
                    [lists:reverse(Token), Rest];
                all ->
                    split_list(Rest, Pattern, Where, [], [lists:reverse(Token) | Accum])
            end;
        no ->
            [Char | Rest] = String,
            split_list(Rest, Pattern, Where, [Char | Token], Accum)
    end.

%% @private
%% Worker for `trailing'. Seen holds (in reverse) every character consumed so
%% far. Last is `nomatch' or {ReversedPrefix, Rest} for the most recent
%% position at which Pattern matched; a later match overwrites an earlier one,
%% so the value left at the end of the string describes the last occurrence.
split_list_trailing([], _Pattern, Seen, nomatch) ->
    [lists:reverse(Seen)];
split_list_trailing([], _Pattern, _Seen, {ReversedPrefix, Rest}) ->
    [lists:reverse(ReversedPrefix), Rest];
split_list_trailing([Char | Tail] = String, Pattern, Seen, Last) ->
    NewLast =
        case prefix_match(String, Pattern) of
            {ok, Rest} -> {Seen, Rest};
            no -> Last
        end,
    split_list_trailing(Tail, Pattern, [Char | Seen], NewLast).
121146

122147
%% @private
@@ -167,3 +192,62 @@ triml([$\s | R]) ->
167192
triml(R);
168193
triml(R) ->
169194
R.
195+
196+
%%-----------------------------------------------------------------------------
%% @equiv   find(String, SearchPattern, leading)
%% @param   String string to search in
%% @param   SearchPattern pattern to search
%% @returns remainder of String starting from first occurrence of SearchPattern
%%          or `nomatch' if SearchPattern cannot be found in String
%% @end
%%-----------------------------------------------------------------------------
-spec find(String :: unicode:chardata(), SearchPattern :: unicode:chardata()) ->
    unicode:chardata() | nomatch.
find(String, SearchPattern) ->
    find(String, SearchPattern, leading).

%%-----------------------------------------------------------------------------
%% @param   String string to search in
%% @param   SearchPattern pattern to search
%% @param   Direction direction to search, `leading' or `trailing'
%% @returns remainder of String starting from first or last occurrence of
%%          SearchPattern or `nomatch' if SearchPattern cannot be found in String
%% @doc     Search String for SearchPattern.
%%          An empty SearchPattern matches immediately: String is returned
%%          unchanged whatever the direction. String and SearchPattern may
%%          each be a binary or a list; the pattern is converted to the
%%          representation of String before searching, and the result has the
%%          same representation as String.
%% @end
%%-----------------------------------------------------------------------------
-spec find(
    String :: unicode:chardata(),
    SearchPattern :: unicode:chardata(),
    Direction :: leading | trailing
) -> unicode:chardata() | nomatch.
find(String, "", _Direction) ->
    String;
find(String, <<>>, _Direction) ->
    String;
find(String, SearchPattern, Direction) when is_binary(String) andalso is_list(SearchPattern) ->
    find_in_binary(String, unicode:characters_to_binary(SearchPattern), Direction);
find(String, SearchPattern, Direction) when is_binary(String) andalso is_binary(SearchPattern) ->
    find_in_binary(String, SearchPattern, Direction);
find(String, SearchPattern, Direction) when is_list(String) andalso is_binary(SearchPattern) ->
    find_in_list(String, unicode:characters_to_list(SearchPattern), Direction);
find(String, SearchPattern, Direction) when is_list(String) andalso is_list(SearchPattern) ->
    find_in_list(String, SearchPattern, Direction).

%% @private
%% Route a binary search by direction. `trailing' is handled here by a
%% backward scan; every other direction is passed to find_binary/3 unchanged.
find_in_binary(String, SearchPattern, trailing) ->
    find_binary_trailing(
        String, SearchPattern, byte_size(String) - byte_size(SearchPattern)
    );
find_in_binary(String, SearchPattern, Direction) ->
    find_binary(String, SearchPattern, Direction).

%% @private
%% Walk Pos down from the last offset where SearchPattern can fit; the first
%% hit is the last occurrence. A negative Pos means there is no occurrence.
find_binary_trailing(_String, _SearchPattern, Pos) when Pos < 0 ->
    nomatch;
find_binary_trailing(String, SearchPattern, Pos) ->
    case binary:part(String, Pos, byte_size(SearchPattern)) of
        SearchPattern -> binary:part(String, Pos, byte_size(String) - Pos);
        _NoMatchHere -> find_binary_trailing(String, SearchPattern, Pos - 1)
    end.

%% @private
%% Route a list search by direction. `trailing' is handled here by a forward
%% scan that remembers the latest match; every other direction is passed to
%% find_list/3 unchanged.
find_in_list(String, SearchPattern, trailing) ->
    find_list_trailing(String, SearchPattern, nomatch);
find_in_list(String, SearchPattern, Direction) ->
    find_list(String, SearchPattern, Direction).

%% @private
%% Last is `nomatch' or the suffix of the string starting at the most recent
%% position where SearchPattern matched; it is returned once the input is
%% exhausted.
find_list_trailing([], _SearchPattern, Last) ->
    Last;
find_list_trailing([_C | Rest] = String, SearchPattern, Last) ->
    case prefix_match(String, SearchPattern) of
        {ok, _Rest} -> find_list_trailing(Rest, SearchPattern, String);
        no -> find_list_trailing(Rest, SearchPattern, Last)
    end.
234+
235+
%% @private
236+
%% Forward search in a binary: returns the suffix of Haystack that begins at
%% the first occurrence of Needle, or `nomatch'. Searching stops as soon as
%% the remaining bytes are fewer than the size of Needle.
find_binary(<<_First, Tail/binary>> = Haystack, Needle, leading) when
    byte_size(Haystack) >= byte_size(Needle)
->
    case binary:part(Haystack, 0, byte_size(Needle)) of
        Needle -> Haystack;
        _Other -> find_binary(Tail, Needle, leading)
    end;
find_binary(_Haystack, _Needle, leading) ->
    nomatch.
245+
246+
%% @private
247+
%% Forward search in a character list: returns the suffix of Haystack that
%% begins at the first position where prefix_match/2 reports a match for
%% Needle, or `nomatch' once the list is exhausted.
find_list([], _Needle, leading) ->
    nomatch;
find_list([_Head | Tail] = Haystack, Needle, leading) ->
    case prefix_match(Haystack, Needle) of
        no -> find_list(Tail, Needle, leading);
        {ok, _Remainder} -> Haystack
    end.

src/libAtomVM/defaultatoms.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ static const char *const cast_atom = "\x5" "$cast";
160160

161161
static const char *const unicode_atom = "\x7" "unicode";
162162

163+
static const char *const global_atom = "\x6" "global";
164+
163165
void defaultatoms_init(GlobalContext *glb)
164166
{
165167
int ok = 1;
@@ -304,6 +306,8 @@ void defaultatoms_init(GlobalContext *glb)
304306

305307
ok &= globalcontext_insert_atom(glb, unicode_atom) == UNICODE_ATOM_INDEX;
306308

309+
ok &= globalcontext_insert_atom(glb, global_atom) == GLOBAL_ATOM_INDEX;
310+
307311
if (!ok) {
308312
AVM_ABORT();
309313
}

src/libAtomVM/defaultatoms.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,9 @@ extern "C" {
169169

170170
#define UNICODE_ATOM_INDEX 110
171171

172-
#define PLATFORM_ATOMS_BASE_INDEX 111
172+
#define GLOBAL_ATOM_INDEX 111
173+
174+
#define PLATFORM_ATOMS_BASE_INDEX 112
173175

174176
#define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX)
175177
#define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX)
@@ -313,6 +315,8 @@ extern "C" {
313315

314316
#define UNICODE_ATOM TERM_FROM_ATOM_INDEX(UNICODE_ATOM_INDEX)
315317

318+
#define GLOBAL_ATOM TERM_FROM_ATOM_INDEX(GLOBAL_ATOM_INDEX)
319+
316320
void defaultatoms_init(GlobalContext *glb);
317321

318322
void platform_defaultatoms_init(GlobalContext *glb);

src/libAtomVM/nifs.c

Lines changed: 77 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ static term nif_binary_at_2(Context *ctx, int argc, term argv[]);
9191
static term nif_binary_first_1(Context *ctx, int argc, term argv[]);
9292
static term nif_binary_last_1(Context *ctx, int argc, term argv[]);
9393
static term nif_binary_part_3(Context *ctx, int argc, term argv[]);
94-
static term nif_binary_split_2(Context *ctx, int argc, term argv[]);
94+
static term nif_binary_split(Context *ctx, int argc, term argv[]);
9595
static term nif_calendar_system_time_to_universal_time_2(Context *ctx, int argc, term argv[]);
9696
static term nif_erlang_delete_element_2(Context *ctx, int argc, term argv[]);
9797
static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[]);
@@ -232,7 +232,7 @@ static const struct Nif binary_part_nif =
232232
static const struct Nif binary_split_nif =
233233
{
234234
.base.type = NIFFunctionType,
235-
.nif_ptr = nif_binary_split_2
235+
.nif_ptr = nif_binary_split
236236
};
237237

238238
static const struct Nif make_ref_nif =
@@ -3007,16 +3007,33 @@ static term nif_binary_part_3(Context *ctx, int argc, term argv[])
30073007
return term_maybe_create_sub_binary(bin_term, pos, len, &ctx->heap, ctx->global);
30083008
}
30093009

3010-
static term nif_binary_split_2(Context *ctx, int argc, term argv[])
3010+
static term nif_binary_split(Context *ctx, int argc, term argv[])
30113011
{
3012-
UNUSED(argc);
3013-
30143012
term bin_term = argv[0];
30153013
term pattern_term = argv[1];
30163014

30173015
VALIDATE_VALUE(bin_term, term_is_binary);
30183016
VALIDATE_VALUE(pattern_term, term_is_binary);
30193017

3018+
bool global = false;
3019+
if (argc == 3) {
3020+
term options = argv[2];
3021+
if (UNLIKELY(!term_is_list(options))) {
3022+
RAISE_ERROR(BADARG_ATOM);
3023+
}
3024+
if (term_is_nonempty_list(options)) {
3025+
term head = term_get_list_head(options);
3026+
term tail = term_get_list_tail(options);
3027+
if (UNLIKELY(head != GLOBAL_ATOM)) {
3028+
RAISE_ERROR(BADARG_ATOM);
3029+
}
3030+
if (UNLIKELY(!term_is_nil(tail))) {
3031+
RAISE_ERROR(BADARG_ATOM);
3032+
}
3033+
global = true;
3034+
}
3035+
}
3036+
30203037
int bin_size = term_binary_size(bin_term);
30213038
int pattern_size = term_binary_size(pattern_term);
30223039

@@ -3027,38 +3044,71 @@ static term nif_binary_split_2(Context *ctx, int argc, term argv[])
30273044
const char *bin_data = term_binary_data(bin_term);
30283045
const char *pattern_data = term_binary_data(pattern_term);
30293046

3030-
const char *found = (const char *) memmem(bin_data, bin_size, pattern_data, pattern_size);
3047+
// Count segments first to allocate memory once.
3048+
size_t num_segments = 1;
3049+
const char *temp_bin_data = bin_data;
3050+
int temp_bin_size = bin_size;
3051+
do {
3052+
const char *found = (const char *) memmem(temp_bin_data, temp_bin_size, pattern_data, pattern_size);
3053+
if (!found) break;
3054+
num_segments++;
3055+
int next_search_offset = found - temp_bin_data + pattern_size;
3056+
temp_bin_data += next_search_offset;
3057+
temp_bin_size -= next_search_offset;
3058+
} while (global && temp_bin_size >= pattern_size);
3059+
3060+
term result_list = term_nil();
3061+
3062+
if (num_segments == 1) {
3063+
// not found
3064+
if (UNLIKELY(memory_ensure_free_with_roots(ctx, 2, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
3065+
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
3066+
}
30313067

3032-
int offset = found - bin_data;
3068+
return term_list_prepend(argv[0], result_list, &ctx->heap);
3069+
}
30333070

3034-
if (found) {
3035-
int tok_size = offset;
3036-
size_t tok_size_in_terms = term_sub_binary_heap_size(bin_term, tok_size);
3071+
// binary:split/2,3 always return sub binaries, except when copied binaries are as small as sub-binaries.
3072+
if (UNLIKELY(memory_ensure_free_with_roots(ctx, LIST_SIZE(num_segments, TERM_BOXED_SUB_BINARY_SIZE), 2, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
3073+
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
3074+
}
30373075

3038-
int rest_size = bin_size - offset - pattern_size;
3039-
size_t rest_size_in_terms = term_sub_binary_heap_size(bin_term, rest_size);
3076+
// Allocate list first
3077+
for (size_t index_segments = 0; index_segments < num_segments; index_segments++) {
3078+
result_list = term_list_prepend(term_nil(), result_list, &ctx->heap);
3079+
}
30403080

3041-
// + 4 which is the result cons
3042-
if (UNLIKELY(memory_ensure_free_with_roots(ctx, tok_size_in_terms + rest_size_in_terms + 4, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
3043-
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
3044-
}
3081+
// Reset pointers after allocation
3082+
bin_data = term_binary_data(argv[0]);
3083+
pattern_data = term_binary_data(argv[1]);
3084+
3085+
term list_cursor = result_list;
3086+
temp_bin_data = bin_data;
3087+
temp_bin_size = bin_size;
3088+
term *list_ptr = term_get_list_ptr(list_cursor);
3089+
do {
3090+
const char *found = (const char *) memmem(temp_bin_data, temp_bin_size, pattern_data, pattern_size);
30453091

3046-
bin_term = argv[0];
3047-
term tok = term_maybe_create_sub_binary(bin_term, 0, tok_size, &ctx->heap, ctx->global);
3048-
term rest = term_maybe_create_sub_binary(bin_term, offset + pattern_size, rest_size, &ctx->heap, ctx->global);
3092+
if (found) {
3093+
term tok = term_maybe_create_sub_binary(argv[0], temp_bin_data - bin_data, found - temp_bin_data, &ctx->heap, ctx->global);
3094+
list_ptr[LIST_HEAD_INDEX] = tok;
30493095

3050-
term result_list = term_list_prepend(rest, term_nil(), &ctx->heap);
3051-
result_list = term_list_prepend(tok, result_list, &ctx->heap);
3096+
list_cursor = list_ptr[LIST_TAIL_INDEX];
3097+
list_ptr = term_get_list_ptr(list_cursor);
30523098

3053-
return result_list;
3099+
int next_search_offset = found - temp_bin_data + pattern_size;
3100+
temp_bin_data += next_search_offset;
3101+
temp_bin_size -= next_search_offset;
3102+
}
30543103

3055-
} else {
3056-
if (UNLIKELY(memory_ensure_free_with_roots(ctx, 2, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
3057-
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
3104+
if (!found || !global) {
3105+
term rest = term_maybe_create_sub_binary(argv[0], temp_bin_data - bin_data, temp_bin_size, &ctx->heap, ctx->global);
3106+
list_ptr[LIST_HEAD_INDEX] = rest;
3107+
break;
30583108
}
3109+
} while (!term_is_nil(list_cursor));
30593110

3060-
return term_list_prepend(argv[0], term_nil(), &ctx->heap);
3061-
}
3111+
return result_list;
30623112
}
30633113

30643114
static term nif_erlang_throw(Context *ctx, int argc, term argv[])

src/libAtomVM/nifs.gperf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ binary:first/1, &binary_first_nif
3636
binary:last/1, &binary_last_nif
3737
binary:part/3, &binary_part_nif
3838
binary:split/2, &binary_split_nif
39+
binary:split/3, &binary_split_nif
3940
calendar:system_time_to_universal_time/2, &system_time_to_universal_time_nif
4041
erlang:atom_to_binary/1, &atom_to_binary_nif
4142
erlang:atom_to_binary/2, &atom_to_binary_nif

tests/libs/estdlib/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ include(BuildErlang)
2424

2525
set(ERLANG_MODULES
2626
test_apply
27+
test_binary
2728
test_calendar
2829
test_gen_event
2930
test_gen_server

0 commit comments

Comments
 (0)