Skip to content

Commit

Permalink
Add generators for UTF-8 strings
Browse files Browse the repository at this point in the history
As reported in #318, `proper_types:string()` may generate invalid Unicode
strings; the Erlang type language overapproximates the set of lists that
represent valid Unicode strings.

The `proper_unicode` module is therefore extended with functions that
generate valid UTF-8 character lists.
  • Loading branch information
kostis committed Jan 21, 2025
1 parent 546818b commit fd976ee
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 14 deletions.
7 changes: 4 additions & 3 deletions include/proper.hrl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
%%% -*- coding: utf-8; erlang-indent-level: 2 -*-
%%% -------------------------------------------------------------------
%%% Copyright 2010-2022 Manolis Papadakis <manopapad@gmail.com>,
%%% Copyright 2010-2025 Manolis Papadakis <manopapad@gmail.com>,
%%% Eirini Arvaniti <eirinibob@gmail.com>,
%%% and Kostis Sagonas <kostis@cs.ntua.gr>
%%%
Expand All @@ -19,7 +19,7 @@
%%% You should have received a copy of the GNU General Public License
%%% along with PropEr. If not, see <http://www.gnu.org/licenses/>.

%%% @copyright 2010-2022 Manolis Papadakis, Eirini Arvaniti, and Kostis Sagonas
%%% @copyright 2010-2025 Manolis Papadakis, Eirini Arvaniti, and Kostis Sagonas
%%% @version {@version}
%%% @author Manolis Papadakis
%%% @doc User header file: This file should be included in each file containing
Expand Down Expand Up @@ -70,7 +70,8 @@
%% Unicode
%%------------------------------------------------------------------------------

-import(proper_unicode, [utf8/0, utf8/1, utf8/2]).
-import(proper_unicode, [utf8/0, utf8/1, utf8/2,
utf8_string/0, utf8_string/1, utf8_string/2]).


%%------------------------------------------------------------------------------
Expand Down
36 changes: 26 additions & 10 deletions src/proper_unicode.erl
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
%%% -*- coding: utf-8 -*-
%%% -*- erlang-indent-level: 2 -*-
%%% -*- coding: utf-8; erlang-indent-level: 2 -*-
%%% -------------------------------------------------------------------
%%% Copyright 2014 Motiejus Jakstys <desired.mta@gmail.com>
%%%
Expand All @@ -24,10 +23,10 @@

%%% @doc Unicode generators for PropEr
%%%
%%% This module exposes utf8 binary generator.
%%% This module exposes utf8 binary and string generators.
%%%
%%% Makes it easy to create custom-encoded unicode binaries. For example,
%%% utf16 binary generator:
%%% Makes it easy to create custom-encoded unicode binaries and strings.
%%% For example, utf16 binary generator:
%%%
%%% ```
%%% utf16() ->
Expand All @@ -40,18 +39,20 @@
%%% ?FORALL(S, utf16(),
%%% size(S) >= 2*length(unicode:characters_to_list(S, utf16))).
%%% '''
%%% Only utf8 generation is supported: {@link utf8/0}, {@link utf8/1}, {@link
%%% utf8/2}. Unicode codepoints and other encodings are trivial to get with
%%% utf8 generators and {@link unicode} module in OTP.

%%% Only utf8 generation is supported: {@link utf8/0}, {@link utf8/1} and
%%% {@link utf8/2} generate binaries, while {@link utf8_string/0},
%%% {@link utf8_string/1} and {@link utf8_string/2} generate strings.
%%% Unicode codepoints and other encodings are trivial to get with the
%%% utf8 generators and the {@link unicode} module in OTP.
-module(proper_unicode).

-export([utf8/0, utf8/1, utf8/2]).
-export([utf8/0, utf8/1, utf8/2, utf8_string/0, utf8_string/1, utf8_string/2]).

-include("proper_common.hrl").

%% @private_type
%% @alias
-type nonnegextint() :: non_neg_integer() | 'inf'.
-type nonnegextint() :: non_neg_integer() | 'inf'.


%% @doc utf8-encoded unbounded size binary.
Expand All @@ -78,6 +79,21 @@ utf8(N, MaxCodePointSize) ->
unicode:characters_to_binary(Str)).


%% @doc Generator for strings with no upper bound on their size.
%% Shorthand for `utf8_string(inf)', i.e. it uses the default maximum
%% codepoint size of 4.
-spec utf8_string() -> proper_types:type().
utf8_string() ->
  utf8_string(inf).

%% @doc Generator for strings whose size is bounded from above by `N'
%% (pass `inf' for no bound). Shorthand for `utf8_string(N, 4)'.
-spec utf8_string(nonnegextint()) -> proper_types:type().
utf8_string(N) ->
  %% 4 is the largest value admitted by the spec of utf8_string/2.
  DefaultMaxCodePointSize = 4,
  utf8_string(N, DefaultMaxCodePointSize).

%% @doc Generator for strings whose size is bounded from above by `N'
%% (pass `inf' for no bound) and whose elements satisfy
%% `codepoint length =< MaxCodePointSize'.
-spec utf8_string(nonnegextint(), 1..4) -> proper_types:type().
utf8_string(N, MaxCodePointSize) ->
  %% Generator for a single element of the resulting string.
  CodePoint = unicode_codepoint_upto(MaxCodePointSize),
  vector_upto(N, CodePoint).

%% =============================================================================
%% Internal functions
%% =============================================================================
Expand Down
5 changes: 4 additions & 1 deletion test/proper_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,10 @@ native_type_props_test_() ->
?_passes(?FORALL(B, utf8(2, 1), byte_size(B) =< 2)),
?_passes(?FORALL(B, utf8(4), byte_size(B) =< 16)),
?_passes(?FORALL(B, utf8(),
length(unicode:characters_to_list(B)) =< byte_size(B)))
length(unicode:characters_to_list(B)) =< byte_size(B))),
?_passes(?FORALL(S, utf8_string(), unicode:characters_to_list(S) =:= S)),
?_passes(?FORALL(S, utf8_string(4),
byte_size(unicode:characters_to_binary(S)) =< 16))
].

-type bin4() :: <<_:32>>.
Expand Down

0 comments on commit fd976ee

Please sign in to comment.