Skip to content

Commit 5c938ea

Browse files
committed
Merge pull request #1293 from pguyot/w39/add-string-find-binary-split-3
Add `binary:split/3` and `string:find/2,3`. Also update `string:split/2,3` to work with unicode binaries and fix specs. These changes are made under both the "Apache 2.0" and the "GNU Lesser General Public License 2.1 or later" license terms (dual license). SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
2 parents 4b37211 + c946931 commit 5c938ea

File tree

10 files changed

+266
-38
lines changed

10 files changed

+266
-38
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ also non string parameters (e.g. `Enum.join([1, 2], ",")`
2727
- Support for `code:ensure_loaded/1`
2828
- Support for `io_lib:latin1_char_list/1`
2929
- Add support to Elixir for `Keyword.split/2`
30+
- Support for `binary:split/3` and `string:find/2,3`
3031

3132
### Changed
3233

libs/estdlib/src/binary.erl

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
%%-----------------------------------------------------------------------------
2525
-module(binary).
2626

27-
-export([at/2, part/3, split/2]).
27+
-export([at/2, part/3, split/2, split/3]).
2828

2929
%%-----------------------------------------------------------------------------
3030
%% @param Binary binary to get a byte from
@@ -51,6 +51,7 @@ part(_Binary, _Pos, _Len) ->
5151
erlang:nif_error(undefined).
5252

5353
%%-----------------------------------------------------------------------------
54+
%% @equiv split(Binary, Pattern, [])
5455
%% @param Binary binary to split
5556
%% @param Pattern pattern to perform the split
5657
%% @return a list composed of one or two binaries
@@ -62,3 +63,18 @@ part(_Binary, _Pos, _Len) ->
6263
-spec split(Binary :: binary(), Pattern :: binary()) -> [binary()].
6364
split(_Binary, _Pattern) ->
6465
erlang:nif_error(undefined).
66+
67+
%%-----------------------------------------------------------------------------
68+
%% @param Binary binary to split
69+
%% @param Pattern pattern to perform the split
70+
%% @param Options options for the split
71+
%% @return a list of binaries (one or two without `global', possibly more with `global')
72+
%% @doc Split a binary according to pattern.
73+
%% If pattern is not found, returns a singleton list with the passed binary.
74+
%% Unlike Erlang/OTP, pattern must be a binary.
75+
%% The only implemented option is `global'.
76+
%% @end
77+
%%-----------------------------------------------------------------------------
78+
-spec split(Binary :: binary(), Pattern :: binary(), Option :: [global]) -> [binary()].
79+
split(_Binary, _Pattern, _Option) ->
80+
erlang:nif_error(undefined).

libs/estdlib/src/string.erl

Lines changed: 93 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
%%-----------------------------------------------------------------------------
2828
-module(string).
2929

30-
-export([to_upper/1, to_lower/1, split/2, split/3, trim/1, trim/2]).
30+
-export([to_upper/1, to_lower/1, split/2, split/3, trim/1, trim/2, find/2, find/3]).
3131

3232
%%-----------------------------------------------------------------------------
3333
%% @param Input a string or character to convert
@@ -76,7 +76,7 @@ lower_char(C) when is_integer(C) ->
7676
%% @returns chardata
7777
%% @end
7878
%%-----------------------------------------------------------------------------
79-
-spec split(String :: string(), Pattern :: string()) -> string() | char().
79+
-spec split(String :: unicode:chardata(), Pattern :: unicode:chardata()) -> [unicode:chardata()].
8080
split(String, Pattern) ->
8181
split(String, Pattern, leading).
8282

@@ -98,25 +98,50 @@ split(String, Pattern) ->
9898
%% [<<"ab">>,<<"bc">>,<<>>,<<"cd">>]'''
9999
%% @end
100100
%%-----------------------------------------------------------------------------
101-
-spec split(String :: string(), Pattern :: string() | char(), Where :: atom()) -> string() | char().
102-
split(String, Pattern, Where) ->
103-
split(String, Pattern, Where, [], []).
101+
-spec split(
102+
String :: unicode:chardata(), Pattern :: unicode:chardata(), Where :: leading | trailing | all
103+
) -> [unicode:chardata()].
104+
split(String, Pattern, Where) when is_binary(String) andalso is_list(Pattern) ->
105+
split_binary(String, unicode:characters_to_binary(Pattern), Where);
106+
split(String, Pattern, Where) when is_binary(String) andalso is_binary(Pattern) ->
107+
split_binary(String, Pattern, Where);
108+
split(String, Pattern, Where) when is_list(String) andalso is_binary(Pattern) ->
109+
split_list(String, unicode:characters_to_list(Pattern), Where);
110+
split(String, Pattern, Where) when is_list(String) andalso is_list(Pattern) ->
111+
split_list(String, Pattern, Where).
104112

105113
%% @private
106-
split([], _Pattern, _Where, Token, Accum) ->
114+
split_binary(String, Pattern, leading) ->
115+
binary:split(String, Pattern);
116+
split_binary(String, Pattern, all) ->
117+
binary:split(String, Pattern, [global]);
118+
split_binary(String, Pattern, trailing) ->
119+
case find_binary(String, Pattern, trailing) of
120+
nomatch ->
121+
[String];
122+
Rest ->
123+
[binary:part(String, 0, byte_size(String) - byte_size(Rest) - byte_size(Pattern)), Rest]
124+
end.
125+
126+
%% @private
127+
split_list(String, Pattern, Where) ->
128+
split_list(String, Pattern, Where, [], []).
129+
130+
%% @private
131+
split_list([], _Pattern, _Where, Token, Accum) ->
107132
lists:reverse([lists:reverse(Token) | Accum]);
108-
split(String, Pattern, Where, Token, Accum) ->
133+
split_list(String, Pattern, Where, Token, Accum) ->
109134
case prefix_match(String, Pattern) of
110135
{ok, Rest} ->
111136
case Where of
112137
leading ->
113138
[lists:reverse(Token), Rest];
114139
all ->
115-
split(Rest, Pattern, Where, [], [lists:reverse(Token) | Accum])
140+
split_list(Rest, Pattern, Where, [], [lists:reverse(Token) | Accum])
116141
end;
117142
no ->
118143
[Char | Rest] = String,
119-
split(Rest, Pattern, Where, [Char | Token], Accum)
144+
split_list(Rest, Pattern, Where, [Char | Token], Accum)
120145
end.
121146

122147
%% @private
@@ -167,3 +192,62 @@ triml([$\s | R]) ->
167192
triml(R);
168193
triml(R) ->
169194
R.
195+
196+
%%-----------------------------------------------------------------------------
197+
%% @equiv find(String, SearchPattern, leading)
198+
%% @param String string to search in
199+
%% @param SearchPattern pattern to search
200+
%% @returns remainder of String starting from first occurrence of SearchPattern
201+
%% or `nomatch' if SearchPattern cannot be found in String
202+
%% @end
203+
%%-----------------------------------------------------------------------------
204+
-spec find(String :: unicode:chardata(), SearchPattern :: unicode:chardata()) ->
205+
unicode:chardata() | nomatch.
206+
find(String, SearchPattern) ->
207+
find(String, SearchPattern, leading).
208+
209+
%%-----------------------------------------------------------------------------
210+
%% @param String string to search in
211+
%% @param SearchPattern pattern to search
212+
%% @param Direction direction to search, `leading' or `trailing'
213+
%% @returns remainder of String starting from first or last occurrence of
214+
%% SearchPattern or `nomatch' if SearchPattern cannot be found in String
215+
%% @end
216+
%%-----------------------------------------------------------------------------
217+
-spec find(
218+
String :: unicode:chardata(),
219+
SearchPattern :: unicode:chardata(),
220+
Direction :: leading | trailing
221+
) -> unicode:chardata() | nomatch.
222+
find(String, "", _Direction) ->
223+
String;
224+
find(String, <<>>, _Direction) ->
225+
String;
226+
find(String, SearchPattern, Direction) when is_binary(String) andalso is_list(SearchPattern) ->
227+
find_binary(String, unicode:characters_to_binary(SearchPattern), Direction);
228+
find(String, SearchPattern, Direction) when is_binary(String) andalso is_binary(SearchPattern) ->
229+
find_binary(String, SearchPattern, Direction);
230+
find(String, SearchPattern, Direction) when is_list(String) andalso is_binary(SearchPattern) ->
231+
find_list(String, unicode:characters_to_list(SearchPattern), Direction);
232+
find(String, SearchPattern, Direction) when is_list(String) andalso is_list(SearchPattern) ->
233+
find_list(String, SearchPattern, Direction).
234+
235+
%% @private
236+
find_binary(<<_C, Rest/binary>> = String, SearchPattern, leading) when
237+
byte_size(String) >= byte_size(SearchPattern)
238+
->
239+
case binary:part(String, 0, byte_size(SearchPattern)) =:= SearchPattern of
240+
true -> String;
241+
false -> find_binary(Rest, SearchPattern, leading)
242+
end;
243+
find_binary(_Sring, _SearchPattern, leading) ->
244+
nomatch.
245+
246+
%% @private
247+
find_list([_C | Rest] = String, SearchPattern, leading) ->
248+
case prefix_match(String, SearchPattern) of
249+
{ok, _Rest} -> String;
250+
no -> find_list(Rest, SearchPattern, leading)
251+
end;
252+
find_list([], _SearchPattern, leading) ->
253+
nomatch.

src/libAtomVM/defaultatoms.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ static const char *const cast_atom = "\x5" "$cast";
160160

161161
static const char *const unicode_atom = "\x7" "unicode";
162162

163+
static const char *const global_atom = "\x6" "global";
164+
163165
void defaultatoms_init(GlobalContext *glb)
164166
{
165167
int ok = 1;
@@ -304,6 +306,8 @@ void defaultatoms_init(GlobalContext *glb)
304306

305307
ok &= globalcontext_insert_atom(glb, unicode_atom) == UNICODE_ATOM_INDEX;
306308

309+
ok &= globalcontext_insert_atom(glb, global_atom) == GLOBAL_ATOM_INDEX;
310+
307311
if (!ok) {
308312
AVM_ABORT();
309313
}

src/libAtomVM/defaultatoms.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,9 @@ extern "C" {
169169

170170
#define UNICODE_ATOM_INDEX 110
171171

172-
#define PLATFORM_ATOMS_BASE_INDEX 111
172+
#define GLOBAL_ATOM_INDEX 111
173+
174+
#define PLATFORM_ATOMS_BASE_INDEX 112
173175

174176
#define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX)
175177
#define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX)
@@ -313,6 +315,8 @@ extern "C" {
313315

314316
#define UNICODE_ATOM TERM_FROM_ATOM_INDEX(UNICODE_ATOM_INDEX)
315317

318+
#define GLOBAL_ATOM TERM_FROM_ATOM_INDEX(GLOBAL_ATOM_INDEX)
319+
316320
void defaultatoms_init(GlobalContext *glb);
317321

318322
void platform_defaultatoms_init(GlobalContext *glb);

src/libAtomVM/nifs.c

Lines changed: 77 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ static term nif_binary_at_2(Context *ctx, int argc, term argv[]);
9191
static term nif_binary_first_1(Context *ctx, int argc, term argv[]);
9292
static term nif_binary_last_1(Context *ctx, int argc, term argv[]);
9393
static term nif_binary_part_3(Context *ctx, int argc, term argv[]);
94-
static term nif_binary_split_2(Context *ctx, int argc, term argv[]);
94+
static term nif_binary_split(Context *ctx, int argc, term argv[]);
9595
static term nif_calendar_system_time_to_universal_time_2(Context *ctx, int argc, term argv[]);
9696
static term nif_erlang_delete_element_2(Context *ctx, int argc, term argv[]);
9797
static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[]);
@@ -232,7 +232,7 @@ static const struct Nif binary_part_nif =
232232
static const struct Nif binary_split_nif =
233233
{
234234
.base.type = NIFFunctionType,
235-
.nif_ptr = nif_binary_split_2
235+
.nif_ptr = nif_binary_split
236236
};
237237

238238
static const struct Nif make_ref_nif =
@@ -3007,16 +3007,33 @@ static term nif_binary_part_3(Context *ctx, int argc, term argv[])
30073007
return term_maybe_create_sub_binary(bin_term, pos, len, &ctx->heap, ctx->global);
30083008
}
30093009

3010-
static term nif_binary_split_2(Context *ctx, int argc, term argv[])
3010+
static term nif_binary_split(Context *ctx, int argc, term argv[])
30113011
{
3012-
UNUSED(argc);
3013-
30143012
term bin_term = argv[0];
30153013
term pattern_term = argv[1];
30163014

30173015
VALIDATE_VALUE(bin_term, term_is_binary);
30183016
VALIDATE_VALUE(pattern_term, term_is_binary);
30193017

3018+
bool global = false;
3019+
if (argc == 3) {
3020+
term options = argv[2];
3021+
if (UNLIKELY(!term_is_list(options))) {
3022+
RAISE_ERROR(BADARG_ATOM);
3023+
}
3024+
if (term_is_nonempty_list(options)) {
3025+
term head = term_get_list_head(options);
3026+
term tail = term_get_list_tail(options);
3027+
if (UNLIKELY(head != GLOBAL_ATOM)) {
3028+
RAISE_ERROR(BADARG_ATOM);
3029+
}
3030+
if (UNLIKELY(!term_is_nil(tail))) {
3031+
RAISE_ERROR(BADARG_ATOM);
3032+
}
3033+
global = true;
3034+
}
3035+
}
3036+
30203037
int bin_size = term_binary_size(bin_term);
30213038
int pattern_size = term_binary_size(pattern_term);
30223039

@@ -3027,38 +3044,71 @@ static term nif_binary_split_2(Context *ctx, int argc, term argv[])
30273044
const char *bin_data = term_binary_data(bin_term);
30283045
const char *pattern_data = term_binary_data(pattern_term);
30293046

3030-
const char *found = (const char *) memmem(bin_data, bin_size, pattern_data, pattern_size);
3047+
// Count segments first to allocate memory once.
3048+
size_t num_segments = 1;
3049+
const char *temp_bin_data = bin_data;
3050+
int temp_bin_size = bin_size;
3051+
do {
3052+
const char *found = (const char *) memmem(temp_bin_data, temp_bin_size, pattern_data, pattern_size);
3053+
if (!found) break;
3054+
num_segments++;
3055+
int next_search_offset = found - temp_bin_data + pattern_size;
3056+
temp_bin_data += next_search_offset;
3057+
temp_bin_size -= next_search_offset;
3058+
} while (global && temp_bin_size >= pattern_size);
3059+
3060+
term result_list = term_nil();
3061+
3062+
if (num_segments == 1) {
3063+
// not found
3064+
if (UNLIKELY(memory_ensure_free_with_roots(ctx, 2, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
3065+
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
3066+
}
30313067

3032-
int offset = found - bin_data;
3068+
return term_list_prepend(argv[0], result_list, &ctx->heap);
3069+
}
30333070

3034-
if (found) {
3035-
int tok_size = offset;
3036-
size_t tok_size_in_terms = term_sub_binary_heap_size(bin_term, tok_size);
3071+
// binary:split/2,3 always returns sub-binaries, except when copied binaries are as small as sub-binaries.
3072+
if (UNLIKELY(memory_ensure_free_with_roots(ctx, LIST_SIZE(num_segments, TERM_BOXED_SUB_BINARY_SIZE), 2, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
3073+
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
3074+
}
30373075

3038-
int rest_size = bin_size - offset - pattern_size;
3039-
size_t rest_size_in_terms = term_sub_binary_heap_size(bin_term, rest_size);
3076+
// Allocate list first
3077+
for (size_t index_segments = 0; index_segments < num_segments; index_segments++) {
3078+
result_list = term_list_prepend(term_nil(), result_list, &ctx->heap);
3079+
}
30403080

3041-
// + 4 which is the result cons
3042-
if (UNLIKELY(memory_ensure_free_with_roots(ctx, tok_size_in_terms + rest_size_in_terms + 4, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
3043-
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
3044-
}
3081+
// Reset pointers after allocation
3082+
bin_data = term_binary_data(argv[0]);
3083+
pattern_data = term_binary_data(argv[1]);
3084+
3085+
term list_cursor = result_list;
3086+
temp_bin_data = bin_data;
3087+
temp_bin_size = bin_size;
3088+
term *list_ptr = term_get_list_ptr(list_cursor);
3089+
do {
3090+
const char *found = (const char *) memmem(temp_bin_data, temp_bin_size, pattern_data, pattern_size);
30453091

3046-
bin_term = argv[0];
3047-
term tok = term_maybe_create_sub_binary(bin_term, 0, tok_size, &ctx->heap, ctx->global);
3048-
term rest = term_maybe_create_sub_binary(bin_term, offset + pattern_size, rest_size, &ctx->heap, ctx->global);
3092+
if (found) {
3093+
term tok = term_maybe_create_sub_binary(argv[0], temp_bin_data - bin_data, found - temp_bin_data, &ctx->heap, ctx->global);
3094+
list_ptr[LIST_HEAD_INDEX] = tok;
30493095

3050-
term result_list = term_list_prepend(rest, term_nil(), &ctx->heap);
3051-
result_list = term_list_prepend(tok, result_list, &ctx->heap);
3096+
list_cursor = list_ptr[LIST_TAIL_INDEX];
3097+
list_ptr = term_get_list_ptr(list_cursor);
30523098

3053-
return result_list;
3099+
int next_search_offset = found - temp_bin_data + pattern_size;
3100+
temp_bin_data += next_search_offset;
3101+
temp_bin_size -= next_search_offset;
3102+
}
30543103

3055-
} else {
3056-
if (UNLIKELY(memory_ensure_free_with_roots(ctx, 2, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
3057-
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
3104+
if (!found || !global) {
3105+
term rest = term_maybe_create_sub_binary(argv[0], temp_bin_data - bin_data, temp_bin_size, &ctx->heap, ctx->global);
3106+
list_ptr[LIST_HEAD_INDEX] = rest;
3107+
break;
30583108
}
3109+
} while (!term_is_nil(list_cursor));
30593110

3060-
return term_list_prepend(argv[0], term_nil(), &ctx->heap);
3061-
}
3111+
return result_list;
30623112
}
30633113

30643114
static term nif_erlang_throw(Context *ctx, int argc, term argv[])

src/libAtomVM/nifs.gperf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ binary:first/1, &binary_first_nif
3636
binary:last/1, &binary_last_nif
3737
binary:part/3, &binary_part_nif
3838
binary:split/2, &binary_split_nif
39+
binary:split/3, &binary_split_nif
3940
calendar:system_time_to_universal_time/2, &system_time_to_universal_time_nif
4041
erlang:atom_to_binary/1, &atom_to_binary_nif
4142
erlang:atom_to_binary/2, &atom_to_binary_nif

tests/libs/estdlib/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ include(BuildErlang)
2424

2525
set(ERLANG_MODULES
2626
test_apply
27+
test_binary
2728
test_calendar
2829
test_gen_event
2930
test_gen_server

0 commit comments

Comments
 (0)