Browse Source

Add `allow_invalid_utf8` option for decoding

Takeru Ohta 6 years ago
parent
commit
d76d72994b
3 changed files with 22 additions and 3 deletions
  1. 5 0
      src/jsone.erl
  2. 10 3
      src/jsone_decode.erl
  3. 7 0
      test/jsone_decode_tests.erl

+ 5 - 0
src/jsone.erl

@@ -228,6 +228,7 @@
 
 -type decode_option() :: {object_format, tuple | proplist | map}
                        | {allow_ctrl_chars, boolean()}
+                       | {allow_invalid_utf8, boolean()}
                        | {'keys', 'binary' | 'atom' | 'existing_atom' | 'attempt_atom'}
                        | common_option().
 %% `object_format': <br />
@@ -241,6 +242,10 @@
 %% - If the value is `true', strings which contain unescaped control characters will be regarded as a legal JSON string <br />
 %% - default: `false'<br />
 %%
+%% `allow_invalid_utf8': <br />
+%% - If the value is `true', strings which contain invalid UTF-8 byte sequences will be regarded as legal JSON strings <br />
+%% - default: `true'<br />
+%%
 %% `keys': <br />
 %% Defines how object keys are decoded. The default value is `binary'.
 %% The option is compatible with `labels' option in `jsx'. <br />

+ 10 - 3
src/jsone_decode.erl

@@ -67,6 +67,7 @@
         {
           object_format=?DEFAULT_OBJECT_FORMAT :: tuple | proplist | map,
           allow_ctrl_chars=false :: boolean(),
+          allow_invalid_utf8=true :: boolean(),
           keys=binary :: 'binary' | 'atom' | 'existing_atom' | 'attempt_atom',
           undefined_as_null=false :: boolean()
         }).
@@ -188,11 +189,15 @@ string(<<$\\, B/binary>>, Base, Start, Nexts, Buf, Opt) ->
         <<$u, Bin/binary>> -> unicode_string(Bin, Start, Nexts, <<Buf/binary, Prefix/binary>>, Opt);
         _                  -> ?ERROR(string, [<<$\\, B/binary>>, Base, Start, Nexts, Buf, Opt])
     end;
-string(<<_, Bin/binary>>, Base, Start, Nexts, Buf, Opt) when Opt?OPT.allow_ctrl_chars ->
+string(<<_, Bin/binary>>, Base, Start, Nexts, Buf, Opt) when Opt?OPT.allow_ctrl_chars, Opt?OPT.allow_invalid_utf8 ->
     string(Bin, Base, Start, Nexts, Buf, Opt);
-string(<<C, Bin/binary>>, Base, Start, Nexts, Buf, Opt) when 16#20 =< C ->
+string(<<C, Bin/binary>>, Base, Start, Nexts, Buf, Opt) when 16#20 =< C, Opt?OPT.allow_invalid_utf8 ->
     string(Bin, Base, Start, Nexts, Buf, Opt);
- string(Bin, Base, Start, Nexts, Buf, Opt) ->
+string(<<_/utf8, Bin/binary>>, Base, Start, Nexts, Buf, Opt) when Opt?OPT.allow_ctrl_chars ->
+    string(Bin, Base, Start, Nexts, Buf, Opt);
+string(<<C/utf8, Bin/binary>>, Base, Start, Nexts, Buf, Opt) when 16#20 =< C ->
+    string(Bin, Base, Start, Nexts, Buf, Opt);
+string(Bin, Base, Start, Nexts, Buf, Opt) ->
      ?ERROR(string, [Bin, Base, Start, Nexts, Buf, Opt]).
 
 -spec unicode_string(binary(), non_neg_integer(), [next()], binary(), opt()) -> decode_result().
@@ -301,6 +306,8 @@ parse_option([{object_format,F}|T], Opt) when F =:= tuple; F =:= proplist; F =:=
     parse_option(T, Opt?OPT{object_format=F});
 parse_option([{allow_ctrl_chars,B}|T], Opt) when is_boolean(B) ->
     parse_option(T, Opt?OPT{allow_ctrl_chars=B});
+parse_option([{allow_invalid_utf8,B}|T], Opt) when is_boolean(B) ->
+    parse_option(T, Opt?OPT{allow_invalid_utf8=B});
 parse_option([{keys, K}|T], Opt)
   when K =:= binary; K =:= atom; K =:= existing_atom; K =:= attempt_atom ->
     parse_option(T, Opt?OPT{keys = K});

+ 7 - 0
test/jsone_decode_tests.erl

@@ -291,5 +291,12 @@ decode_test_() ->
       fun() ->
               ?assertEqual({ok, undefined, <<>>},  jsone_decode:decode(<<"null">>,[undefined_as_null])), % OK
               ?assertEqual({ok, null, <<>>},       jsone_decode:decode(<<"null">>,[])) % OK
+      end},
+     {"Invalid UTF-8 characters",
+      fun () ->
+              Input = <<123,34,105,100,34,58,34,190,72,94,90,253,121,94,71,73,68,91,122,211,253,32,94,86,67,163,253,230,34,125>>,
+              ?assertMatch({ok, _, _}, jsone:try_decode(Input)),
+              ?assertMatch({ok, _, _}, jsone:try_decode(Input, [{allow_invalid_utf8, true}])),
+              ?assertMatch({error, {badarg, _}}, jsone:try_decode(Input, [{allow_invalid_utf8, false}]))
       end}
     ].