Browse Source

Use binary pattern matching to handle unicode

Takeru Ohta 9 years ago
parent
commit
ee974ba263
2 changed files with 14 additions and 56 deletions
  1. 8 27
      src/jsone_decode.erl
  2. 6 29
      src/jsone_encode.erl

+ 8 - 27
src/jsone_decode.erl

@@ -2,7 +2,7 @@
 %%% @private
 %%% @end
 %%%
-%%% Copyright (c) 2013-2015, Takeru Ohta <phjgt308@gmail.com>
+%%% Copyright (c) 2013-2016, Takeru Ohta <phjgt308@gmail.com>
 %%%
 %%% The MIT License
 %%%
@@ -170,12 +170,12 @@ string(<<$\\, B/binary>>, Base, Start, Nexts, Buf, Opt) ->
         <<$u, Bin/binary>> -> unicode_string(Bin, Start, Nexts, <<Buf/binary, Prefix/binary>>, Opt);
         _                  -> ?ERROR(string, [<<$\\, B/binary>>, Base, Start, Nexts, Buf, Opt])
     end;
-string(<<C, Bin/binary>>, Base, Start, Nexts, Buf, Opt) when 16#20 =< C ->
-    string(Bin, Base, Start, Nexts, Buf, Opt);
 string(<<_, Bin/binary>>, Base, Start, Nexts, Buf, Opt) when Opt?OPT.allow_ctrl_chars ->
     string(Bin, Base, Start, Nexts, Buf, Opt);
-string(Bin, Base, Start, Nexts, Buf, Opt) ->
-    ?ERROR(string, [Bin, Base, Start, Nexts, Buf, Opt]).
+string(<<C, Bin/binary>>, Base, Start, Nexts, Buf, Opt) when 16#20 =< C ->
+    string(Bin, Base, Start, Nexts, Buf, Opt);
+string(Bin, Base, Start, Nexts, Buf, Opt) ->
+    ?ERROR(string, [Bin, Base, Start, Nexts, Buf, Opt]).
 
 -spec unicode_string(binary(), non_neg_integer(), [next()], binary(), opt()) -> decode_result().
 unicode_string(<<N:4/binary, Bin/binary>>, Start, Nexts, Buf, Opt) ->
@@ -186,8 +186,8 @@ unicode_string(<<N:4/binary, Bin/binary>>, Start, Nexts, Buf, Opt) ->
                 <<$\\, $u, N2:4/binary, Bin2/binary>> ->
                     case binary_to_integer(N2, 16) of
                         Low when 16#DC00 =< Low, Low =< 16#DFFF ->
-                            Unicode = 16#10000 + (High - 16#D800) * 16#400 + (Low - 16#DC00),
-                            string(Bin2, Start, Nexts, unicode_to_utf8(Unicode, Buf), Opt);
+                            <<Unicode/utf16>> = <<High:16, Low:16>>,
+                            string(Bin2, Start, Nexts, <<Buf/binary, Unicode/utf8>>, Opt);
                         _ -> ?ERROR(unicode_string, [<<N/binary, Bin/binary>>, Start, Nexts, Buf, Opt])
                     end;
                 _ -> ?ERROR(unicode_string, [<<N/binary, Bin/binary>>, Start, Nexts, Buf, Opt])
@@ -195,30 +195,11 @@ unicode_string(<<N:4/binary, Bin/binary>>, Start, Nexts, Buf, Opt) ->
         Unicode when 16#DC00 =< Unicode, Unicode =< 16#DFFF ->  % second part of surrogate pair (without first part)
             ?ERROR(unicode_string, [<<N/binary, Bin/binary>>, Start, Nexts, Buf, Opt]);
         Unicode ->
-            string(Bin, Start, Nexts, unicode_to_utf8(Unicode, Buf), Opt)
+            string(Bin, Start, Nexts, <<Buf/binary, Unicode/utf8>>, Opt)
     end;
 unicode_string(Bin, Start, Nexts, Buf, Opt) ->
     ?ERROR(unicode_string, [Bin, Start, Nexts, Buf, Opt]).
 
--spec unicode_to_utf8(0..1114111, binary()) -> binary().
-unicode_to_utf8(Code, Buf) when Code < 16#80 ->
-    <<Buf/binary, Code>>;
-unicode_to_utf8(Code, Buf) when Code < 16#800 ->
-    A = 2#11000000 bor (Code bsr 6),
-    B = 2#10000000 bor (Code band 2#111111),
-    <<Buf/binary, A, B>>;
-unicode_to_utf8(Code, Buf) when Code < 16#10000 ->
-    A = 2#11100000 bor (Code bsr 12),
-    B = 2#10000000 bor ((Code bsr 6) band 2#111111),
-    C = 2#10000000 bor (Code band 2#111111),
-    <<Buf/binary, A, B, C>>;
-unicode_to_utf8(Code, Buf) ->
-    A = 2#11110000 bor (Code bsr 18),
-    B = 2#10000000 bor ((Code bsr 12) band 2#111111),
-    C = 2#10000000 bor ((Code bsr  6) band 2#111111),
-    D = 2#10000000 bor (Code band 2#111111),
-    <<Buf/binary, A, B, C, D>>.
-
 -spec number(binary(), [next()], binary(), opt()) -> decode_result().
 number(<<$-, Bin/binary>>, Nexts, Buf, Opt) -> number_integer_part(Bin, -1, Nexts, Buf, Opt);
 number(<<Bin/binary>>,     Nexts, Buf, Opt) -> number_integer_part(Bin,  1, Nexts, Buf, Opt).

+ 6 - 29
src/jsone_encode.erl

@@ -2,7 +2,7 @@
 %%% @private
 %%% @end
 %%%
-%%% Copyright (c) 2013-2015, Takeru Ohta <phjgt308@gmail.com>
+%%% Copyright (c) 2013-2016, Takeru Ohta <phjgt308@gmail.com>
 %%%
 %%% The MIT License
 %%%
@@ -36,7 +36,6 @@
 %% Macros & Records & Types
 %%--------------------------------------------------------------------------------
 -define(ERROR(Function, Args), {error, {badarg, [{?MODULE, Function, Args, [{line, ?LINE}]}]}}).
--define(IS_REDUNDANT_UTF8(B1, B2, FirstBitN), (B1 =:= 0 andalso B2 < (1 bsl (FirstBitN + 1)))).
 -define(HEX(N, I), (binary:at(<<"0123456789abcdef">>, (N bsr (I * 4)) band 2#1111))).
 -define(UNICODE_TO_HEX(Code), ?HEX(Code, 3), ?HEX(Code, 2), ?HEX(Code, 1), ?HEX(Code, 0)).
 -define(IS_STR(X), (is_binary(X) orelse is_atom(X))).
@@ -168,43 +167,21 @@ escape_string(<<$\n, Str/binary>>,      Nexts, Buf, Opt) -> escape_string(Str, N
 escape_string(<<$\r, Str/binary>>,      Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, $\\, $r>>, Opt);
 escape_string(<<$\t, Str/binary>>,      Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, $\\, $t>>, Opt);
 escape_string(<<0:1, C:7, Str/binary>>, Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, C>>, Opt);
-escape_string(<<2#110:3, B1:5, 2#10:2, B2:6, Str/binary>>, Nexts, Buf, Opt) when not ?IS_REDUNDANT_UTF8(B1, B2, 5) ->
+escape_string(<<Unicode/utf8, Str/binary>>, Nexts, Buf, Opt) ->
     case Opt?OPT.native_utf8 of
-        false ->
-            Unicode = (B1 bsl 6) + B2,
-            escape_unicode_char(Str, Unicode, Nexts, Buf, Opt);
-        true ->
-            unicode_char(Str, <<2#110:3, B1:5, 2#10:2, B2:6>>, Nexts, Buf, Opt)
-    end;
-escape_string(<<2#1110:4, B1:4, 2#10:2, B2:6, 2#10:2, B3:6, Str/binary>>, Nexts, Buf, Opt) when not ?IS_REDUNDANT_UTF8(B1, B2, 4) ->
-    case Opt?OPT.native_utf8 of
-        false ->
-            Unicode = (B1 bsl 12) + (B2 bsl 6) + B3,
-            escape_unicode_char(Str, Unicode, Nexts, Buf, Opt);
-        true ->
-            unicode_char(Str, <<2#1110:4, B1:4, 2#10:2, B2:6, 2#10:2, B3:6>>, Nexts, Buf, Opt)
-    end;
-escape_string(<<2#11110:5, B1:3, 2#10:2, B2:6, 2#10:2, B3:6, 2#10:2, B4:6, Str/binary>>, Nexts, Buf, Opt) when not ?IS_REDUNDANT_UTF8(B1, B2, 3) ->
-    case Opt?OPT.native_utf8 of
-        false ->
-            Unicode = (B1 bsl 18) + (B2 bsl 12) + (B3 bsl 6) + B4,
-            escape_unicode_char(Str, Unicode, Nexts, Buf, Opt);
-        true ->
-            unicode_char(Str, <<2#11000:5, B1:3, 2#10:2, B2:6, 2#10:2, B3:6, 2#10:2, B4:6>>, Nexts, Buf, Opt)
+        false -> escape_unicode_char(Str, Unicode, Nexts, Buf, Opt);
+        true  -> escape_string(Str, Nexts, <<Buf/binary, Unicode/utf8>>, Opt)
     end;
 escape_string(Str, Nexts, Buf, Opt) ->
     ?ERROR(escape_string, [Str, Nexts, Buf, Opt]).
 
-unicode_char(Str, Char, Nexts, Buf, Opt) ->
-    escape_string(Str, Nexts, <<Buf/binary, Char/binary>>, Opt).
-
 -spec escape_unicode_char(binary(), char(), [next()], binary(), opt()) -> encode_result().
 escape_unicode_char(<<Str/binary>>, Unicode, Nexts, Buf, Opt) when Unicode =< 16#FFFF ->
     escape_string(Str, Nexts, <<Buf/binary, $\\, $u, ?UNICODE_TO_HEX(Unicode)>>, Opt);
 escape_unicode_char(<<Str/binary>>, Unicode, Nexts, Buf, Opt) ->
     %% Surrogate Pair
-    <<High:10, Low:10>> = <<(Unicode - 16#10000):20>>, % XXX: inefficient
-    escape_string(Str, Nexts, <<Buf/binary, $\\, $u, ?UNICODE_TO_HEX(High + 16#D800), $\\, $u, ?UNICODE_TO_HEX(Low + 16#DC00)>>, Opt).
+    <<High:16, Low:16>> = <<Unicode/utf16>>,
+    escape_string(Str, Nexts, <<Buf/binary, $\\, $u, ?UNICODE_TO_HEX(High), $\\, $u, ?UNICODE_TO_HEX(Low)>>, Opt).
 
 -spec array(jsone:json_array(), [next()], binary(), opt()) -> encode_result().
 array(List, Nexts, Buf, Opt) ->