Browse Source

Merge pull request #8 from altworx/master

Improve string encoding performance
Takeru Ohta 8 years ago
parent
commit
638a33f97c
3 changed files with 215 additions and 26 deletions
  1. +1 -0
      .gitignore
  2. +212 -25
      src/jsone_encode.erl
  3. +2 -1
      test/jsone_encode_tests.erl

+ 1 - 0
.gitignore

@@ -11,3 +11,4 @@ doc/*
 !doc/*.md
 .rebar
 _build
+.*.sw?

+ 212 - 25
src/jsone_encode.erl

@@ -40,8 +40,6 @@
 %% Macros & Records & Types
 %%--------------------------------------------------------------------------------
 -define(ERROR(Function, Args), {error, {badarg, [{?MODULE, Function, Args, [{line, ?LINE}]}]}}).
--define(HEX(N, I), (binary:at(<<"0123456789abcdef">>, (N bsr (I * 4)) band 2#1111))).
--define(UNICODE_TO_HEX(Code), ?HEX(Code, 3), ?HEX(Code, 2), ?HEX(Code, 1), ?HEX(Code, 0)).
 -define(IS_STR(X), (is_binary(X) orelse is_atom(X))).
 -define(IS_UINT(X), (is_integer(X) andalso X >= 0)).
 -define(IS_DATETIME(Y,M,D,H,Mi,S), (?IS_UINT(Y) andalso ?IS_UINT(M) andalso ?IS_UINT(D) andalso
@@ -161,31 +159,220 @@ object_key(Key, Nexts, Buf, Opt) ->
     ?ERROR(object_key, [Key, Nexts, Buf, Opt]).
 
 -spec escape_string(binary(), [next()], binary(), opt()) -> encode_result().
-escape_string(<<"">>,                   Nexts, Buf, Opt) -> next(Nexts, <<Buf/binary, $">>, Opt);
-escape_string(<<$", Str/binary>>,       Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, $\\, $">>, Opt);
-escape_string(<<$\/, Str/binary>>,      Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, $\\, $\/>>, Opt);
-escape_string(<<$\\, Str/binary>>,      Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, $\\, $\\>>, Opt);
-escape_string(<<$\b, Str/binary>>,      Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, $\\, $b>>, Opt);
-escape_string(<<$\f, Str/binary>>,      Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, $\\, $f>>, Opt);
-escape_string(<<$\n, Str/binary>>,      Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, $\\, $n>>, Opt);
-escape_string(<<$\r, Str/binary>>,      Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, $\\, $r>>, Opt);
-escape_string(<<$\t, Str/binary>>,      Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, $\\, $t>>, Opt);
-escape_string(<<0:1, C:7, Str/binary>>, Nexts, Buf, Opt) -> escape_string(Str, Nexts, <<Buf/binary, C>>, Opt);
-escape_string(<<Unicode/utf8, Str/binary>>, Nexts, Buf, Opt) ->
-    case Opt?OPT.native_utf8 of
-        false -> escape_unicode_char(Str, Unicode, Nexts, Buf, Opt);
-        true  -> escape_string(Str, Nexts, <<Buf/binary, Unicode/utf8>>, Opt)
-    end;
-escape_string(Str, Nexts, Buf, Opt) ->
+escape_string(Str, Nexts, Buf, ?OPT{native_utf8 = true} = Opt) ->
+    escape_string_native_utf8(Str, Nexts, Buf, Opt);
+escape_string(Str, Nexts, Buf, ?OPT{native_utf8 = false} = Opt) ->
+    escape_string_escaped_utf8(Str, Nexts, Buf, Opt).
+
+-define(H8(X), (hex(X)):16).
+-define(H16(X), ?H8(X bsr 8), ?H8(X band 16#FF)).
+
+-ifdef(ENABLE_HIPE).
+-define(COPY_UTF8,
+escape_string_native_utf8(<<2#110:3, C1:5, C2, Str/binary>>, Nexts, Buf, Opt) ->
+    escape_string_native_utf8(Str, Nexts, <<Buf/binary, (2#11000000+C1), C2>>, Opt);
+escape_string_native_utf8(<<2#1110:4, C1:4, C2:16, Str/binary>>, Nexts, Buf, Opt) ->
+    escape_string_native_utf8(Str, Nexts, <<Buf/binary, (2#11100000+C1), C2:16>>, Opt);
+escape_string_native_utf8(<<2#11110:5, C1:3, C2:24, Str/binary>>, Nexts, Buf, Opt) ->
+    escape_string_native_utf8(Str, Nexts, <<Buf/binary, (2#11110000+C1), C2:24>>, Opt)
+    ).
+-else.
+-define(COPY_UTF8,
+escape_string_native_utf8(<<Ch/utf8, Str/binary>>, Nexts, Buf, Opt) ->
+    escape_string_native_utf8(Str, Nexts, <<Buf/binary, Ch/utf8>>, Opt)
+    ).
+-endif.
+
+escape_string_native_utf8(<<>>, Nexts, Buf, Opt) -> next(Nexts, <<Buf/binary, $">>, Opt);
+escape_string_native_utf8(<<0:1, Ch:7, Str/binary>>, Nexts, Buf, Opt) ->
+    escape_string_native_utf8(Str, Nexts, ascii(Ch, Buf), Opt);
+?COPY_UTF8;
+escape_string_native_utf8(Str, Nexts, Buf, Opt) ->
+    ?ERROR(escape_string, [Str, Nexts, Buf, Opt]).
+
+escape_string_escaped_utf8(<<>>, Nexts, Buf, Opt) -> next(Nexts, <<Buf/binary, $">>, Opt);
+escape_string_escaped_utf8(<<0:1, Ch:7, Str/binary>>, Nexts, Buf, Opt) ->
+    escape_string_escaped_utf8(Str, Nexts, ascii(Ch, Buf), Opt);
+escape_string_escaped_utf8(<<Ch/utf8, Str/binary>>, Nexts, Buf, Opt) ->
+    NewBuf = if
+                 Ch =< 16#FFFF -> <<Buf/binary, $\\, $u, ?H16(Ch)>>;
+                 true ->
+                     <<H1, H2, L1, L2>> = <<Ch/utf16>>,
+                     <<Buf/binary, $\\, $u, ?H8(H1), ?H8(H2), $\\, $u, ?H8(L1), ?H8(L2)>>
+             end,
+    escape_string_escaped_utf8(Str, Nexts, NewBuf, Opt);
+escape_string_escaped_utf8(Str, Nexts, Buf, Opt) ->
     ?ERROR(escape_string, [Str, Nexts, Buf, Opt]).
 
--spec escape_unicode_char(binary(), char(), [next()], binary(), opt()) -> encode_result().
-escape_unicode_char(<<Str/binary>>, Unicode, Nexts, Buf, Opt) when Unicode =< 16#FFFF ->
-    escape_string(Str, Nexts, <<Buf/binary, $\\, $u, ?UNICODE_TO_HEX(Unicode)>>, Opt);
-escape_unicode_char(<<Str/binary>>, Unicode, Nexts, Buf, Opt) ->
-    %% Surrogate Pair
-    <<High:16, Low:16>> = <<Unicode/utf16>>,
-    escape_string(Str, Nexts, <<Buf/binary, $\\, $u, ?UNICODE_TO_HEX(High), $\\, $u, ?UNICODE_TO_HEX(Low)>>, Opt).
+-compile({inline, [hex/1]}).
+
+hex(X) ->
+  element(
+    X+1,
+    {16#3030, 16#3031, 16#3032, 16#3033, 16#3034, 16#3035, 16#3036, 16#3037,
+     16#3038, 16#3039, 16#3061, 16#3062, 16#3063, 16#3064, 16#3065, 16#3066,
+     16#3130, 16#3131, 16#3132, 16#3133, 16#3134, 16#3135, 16#3136, 16#3137,
+     16#3138, 16#3139, 16#3161, 16#3162, 16#3163, 16#3164, 16#3165, 16#3166,
+     16#3230, 16#3231, 16#3232, 16#3233, 16#3234, 16#3235, 16#3236, 16#3237,
+     16#3238, 16#3239, 16#3261, 16#3262, 16#3263, 16#3264, 16#3265, 16#3266,
+     16#3330, 16#3331, 16#3332, 16#3333, 16#3334, 16#3335, 16#3336, 16#3337,
+     16#3338, 16#3339, 16#3361, 16#3362, 16#3363, 16#3364, 16#3365, 16#3366,
+     16#3430, 16#3431, 16#3432, 16#3433, 16#3434, 16#3435, 16#3436, 16#3437,
+     16#3438, 16#3439, 16#3461, 16#3462, 16#3463, 16#3464, 16#3465, 16#3466,
+     16#3530, 16#3531, 16#3532, 16#3533, 16#3534, 16#3535, 16#3536, 16#3537,
+     16#3538, 16#3539, 16#3561, 16#3562, 16#3563, 16#3564, 16#3565, 16#3566,
+     16#3630, 16#3631, 16#3632, 16#3633, 16#3634, 16#3635, 16#3636, 16#3637,
+     16#3638, 16#3639, 16#3661, 16#3662, 16#3663, 16#3664, 16#3665, 16#3666,
+     16#3730, 16#3731, 16#3732, 16#3733, 16#3734, 16#3735, 16#3736, 16#3737,
+     16#3738, 16#3739, 16#3761, 16#3762, 16#3763, 16#3764, 16#3765, 16#3766,
+     16#3830, 16#3831, 16#3832, 16#3833, 16#3834, 16#3835, 16#3836, 16#3837,
+     16#3838, 16#3839, 16#3861, 16#3862, 16#3863, 16#3864, 16#3865, 16#3866,
+     16#3930, 16#3931, 16#3932, 16#3933, 16#3934, 16#3935, 16#3936, 16#3937,
+     16#3938, 16#3939, 16#3961, 16#3962, 16#3963, 16#3964, 16#3965, 16#3966,
+     16#6130, 16#6131, 16#6132, 16#6133, 16#6134, 16#6135, 16#6136, 16#6137,
+     16#6138, 16#6139, 16#6161, 16#6162, 16#6163, 16#6164, 16#6165, 16#6166,
+     16#6230, 16#6231, 16#6232, 16#6233, 16#6234, 16#6235, 16#6236, 16#6237,
+     16#6238, 16#6239, 16#6261, 16#6262, 16#6263, 16#6264, 16#6265, 16#6266,
+     16#6330, 16#6331, 16#6332, 16#6333, 16#6334, 16#6335, 16#6336, 16#6337,
+     16#6338, 16#6339, 16#6361, 16#6362, 16#6363, 16#6364, 16#6365, 16#6366,
+     16#6430, 16#6431, 16#6432, 16#6433, 16#6434, 16#6435, 16#6436, 16#6437,
+     16#6438, 16#6439, 16#6461, 16#6462, 16#6463, 16#6464, 16#6465, 16#6466,
+     16#6530, 16#6531, 16#6532, 16#6533, 16#6534, 16#6535, 16#6536, 16#6537,
+     16#6538, 16#6539, 16#6561, 16#6562, 16#6563, 16#6564, 16#6565, 16#6566,
+     16#6630, 16#6631, 16#6632, 16#6633, 16#6634, 16#6635, 16#6636, 16#6637,
+     16#6638, 16#6639, 16#6661, 16#6662, 16#6663, 16#6664, 16#6665, 16#6666}
+          ).
+
+-compile({inline, [ascii/2]}).
+
+ascii(0, Buf) -> <<Buf/binary, "\\u0000">>;
+ascii(1, Buf) -> <<Buf/binary, "\\u0001">>;
+ascii(2, Buf) -> <<Buf/binary, "\\u0002">>;
+ascii(3, Buf) -> <<Buf/binary, "\\u0003">>;
+ascii(4, Buf) -> <<Buf/binary, "\\u0004">>;
+ascii(5, Buf) -> <<Buf/binary, "\\u0005">>;
+ascii(6, Buf) -> <<Buf/binary, "\\u0006">>;
+ascii(7, Buf) -> <<Buf/binary, "\\u0007">>;
+ascii(8, Buf) -> <<Buf/binary, "\\b">>;
+ascii(9, Buf) -> <<Buf/binary, "\\t">>;
+ascii(10, Buf) -> <<Buf/binary, "\\n">>;
+ascii(11, Buf) -> <<Buf/binary, "\\u000b">>;
+ascii(12, Buf) -> <<Buf/binary, "\\f">>;
+ascii(13, Buf) -> <<Buf/binary, "\\r">>;
+ascii(14, Buf) -> <<Buf/binary, "\\u000e">>;
+ascii(15, Buf) -> <<Buf/binary, "\\u000f">>;
+ascii(16, Buf) -> <<Buf/binary, "\\u0010">>;
+ascii(17, Buf) -> <<Buf/binary, "\\u0011">>;
+ascii(18, Buf) -> <<Buf/binary, "\\u0012">>;
+ascii(19, Buf) -> <<Buf/binary, "\\u0013">>;
+ascii(20, Buf) -> <<Buf/binary, "\\u0014">>;
+ascii(21, Buf) -> <<Buf/binary, "\\u0015">>;
+ascii(22, Buf) -> <<Buf/binary, "\\u0016">>;
+ascii(23, Buf) -> <<Buf/binary, "\\u0017">>;
+ascii(24, Buf) -> <<Buf/binary, "\\u0018">>;
+ascii(25, Buf) -> <<Buf/binary, "\\u0019">>;
+ascii(26, Buf) -> <<Buf/binary, "\\u001a">>;
+ascii(27, Buf) -> <<Buf/binary, "\\u001b">>;
+ascii(28, Buf) -> <<Buf/binary, "\\u001c">>;
+ascii(29, Buf) -> <<Buf/binary, "\\u001d">>;
+ascii(30, Buf) -> <<Buf/binary, "\\u001e">>;
+ascii(31, Buf) -> <<Buf/binary, "\\u001f">>;
+ascii(32, Buf) -> <<Buf/binary, 32>>;
+ascii(33, Buf) -> <<Buf/binary, 33>>;
+ascii(34, Buf) -> <<Buf/binary, $\\, 34>>;
+ascii(35, Buf) -> <<Buf/binary, 35>>;
+ascii(36, Buf) -> <<Buf/binary, 36>>;
+ascii(37, Buf) -> <<Buf/binary, 37>>;
+ascii(38, Buf) -> <<Buf/binary, 38>>;
+ascii(39, Buf) -> <<Buf/binary, 39>>;
+ascii(40, Buf) -> <<Buf/binary, 40>>;
+ascii(41, Buf) -> <<Buf/binary, 41>>;
+ascii(42, Buf) -> <<Buf/binary, 42>>;
+ascii(43, Buf) -> <<Buf/binary, 43>>;
+ascii(44, Buf) -> <<Buf/binary, 44>>;
+ascii(45, Buf) -> <<Buf/binary, 45>>;
+ascii(46, Buf) -> <<Buf/binary, 46>>;
+ascii(47, Buf) -> <<Buf/binary, $\\, 47>>;
+ascii(48, Buf) -> <<Buf/binary, 48>>;
+ascii(49, Buf) -> <<Buf/binary, 49>>;
+ascii(50, Buf) -> <<Buf/binary, 50>>;
+ascii(51, Buf) -> <<Buf/binary, 51>>;
+ascii(52, Buf) -> <<Buf/binary, 52>>;
+ascii(53, Buf) -> <<Buf/binary, 53>>;
+ascii(54, Buf) -> <<Buf/binary, 54>>;
+ascii(55, Buf) -> <<Buf/binary, 55>>;
+ascii(56, Buf) -> <<Buf/binary, 56>>;
+ascii(57, Buf) -> <<Buf/binary, 57>>;
+ascii(58, Buf) -> <<Buf/binary, 58>>;
+ascii(59, Buf) -> <<Buf/binary, 59>>;
+ascii(60, Buf) -> <<Buf/binary, 60>>;
+ascii(61, Buf) -> <<Buf/binary, 61>>;
+ascii(62, Buf) -> <<Buf/binary, 62>>;
+ascii(63, Buf) -> <<Buf/binary, 63>>;
+ascii(64, Buf) -> <<Buf/binary, 64>>;
+ascii(65, Buf) -> <<Buf/binary, 65>>;
+ascii(66, Buf) -> <<Buf/binary, 66>>;
+ascii(67, Buf) -> <<Buf/binary, 67>>;
+ascii(68, Buf) -> <<Buf/binary, 68>>;
+ascii(69, Buf) -> <<Buf/binary, 69>>;
+ascii(70, Buf) -> <<Buf/binary, 70>>;
+ascii(71, Buf) -> <<Buf/binary, 71>>;
+ascii(72, Buf) -> <<Buf/binary, 72>>;
+ascii(73, Buf) -> <<Buf/binary, 73>>;
+ascii(74, Buf) -> <<Buf/binary, 74>>;
+ascii(75, Buf) -> <<Buf/binary, 75>>;
+ascii(76, Buf) -> <<Buf/binary, 76>>;
+ascii(77, Buf) -> <<Buf/binary, 77>>;
+ascii(78, Buf) -> <<Buf/binary, 78>>;
+ascii(79, Buf) -> <<Buf/binary, 79>>;
+ascii(80, Buf) -> <<Buf/binary, 80>>;
+ascii(81, Buf) -> <<Buf/binary, 81>>;
+ascii(82, Buf) -> <<Buf/binary, 82>>;
+ascii(83, Buf) -> <<Buf/binary, 83>>;
+ascii(84, Buf) -> <<Buf/binary, 84>>;
+ascii(85, Buf) -> <<Buf/binary, 85>>;
+ascii(86, Buf) -> <<Buf/binary, 86>>;
+ascii(87, Buf) -> <<Buf/binary, 87>>;
+ascii(88, Buf) -> <<Buf/binary, 88>>;
+ascii(89, Buf) -> <<Buf/binary, 89>>;
+ascii(90, Buf) -> <<Buf/binary, 90>>;
+ascii(91, Buf) -> <<Buf/binary, 91>>;
+ascii(92, Buf) -> <<Buf/binary, $\\, 92>>;
+ascii(93, Buf) -> <<Buf/binary, 93>>;
+ascii(94, Buf) -> <<Buf/binary, 94>>;
+ascii(95, Buf) -> <<Buf/binary, 95>>;
+ascii(96, Buf) -> <<Buf/binary, 96>>;
+ascii(97, Buf) -> <<Buf/binary, 97>>;
+ascii(98, Buf) -> <<Buf/binary, 98>>;
+ascii(99, Buf) -> <<Buf/binary, 99>>;
+ascii(100, Buf) -> <<Buf/binary, 100>>;
+ascii(101, Buf) -> <<Buf/binary, 101>>;
+ascii(102, Buf) -> <<Buf/binary, 102>>;
+ascii(103, Buf) -> <<Buf/binary, 103>>;
+ascii(104, Buf) -> <<Buf/binary, 104>>;
+ascii(105, Buf) -> <<Buf/binary, 105>>;
+ascii(106, Buf) -> <<Buf/binary, 106>>;
+ascii(107, Buf) -> <<Buf/binary, 107>>;
+ascii(108, Buf) -> <<Buf/binary, 108>>;
+ascii(109, Buf) -> <<Buf/binary, 109>>;
+ascii(110, Buf) -> <<Buf/binary, 110>>;
+ascii(111, Buf) -> <<Buf/binary, 111>>;
+ascii(112, Buf) -> <<Buf/binary, 112>>;
+ascii(113, Buf) -> <<Buf/binary, 113>>;
+ascii(114, Buf) -> <<Buf/binary, 114>>;
+ascii(115, Buf) -> <<Buf/binary, 115>>;
+ascii(116, Buf) -> <<Buf/binary, 116>>;
+ascii(117, Buf) -> <<Buf/binary, 117>>;
+ascii(118, Buf) -> <<Buf/binary, 118>>;
+ascii(119, Buf) -> <<Buf/binary, 119>>;
+ascii(120, Buf) -> <<Buf/binary, 120>>;
+ascii(121, Buf) -> <<Buf/binary, 121>>;
+ascii(122, Buf) -> <<Buf/binary, 122>>;
+ascii(123, Buf) -> <<Buf/binary, 123>>;
+ascii(124, Buf) -> <<Buf/binary, 124>>;
+ascii(125, Buf) -> <<Buf/binary, 125>>;
+ascii(126, Buf) -> <<Buf/binary, 126>>;
+ascii(127, Buf) -> <<Buf/binary, 127>>.
 
 -spec array(jsone:json_array(), [next()], binary(), opt()) -> encode_result().
 array(List, Nexts, Buf, Opt) ->

+ 2 - 1
test/jsone_encode_tests.erl

@@ -79,7 +79,8 @@ encode_test_() ->
       fun () ->
               Input    = <<"\"\/\\\b\f\n\r\t">>,
               Expected = list_to_binary([$", [[$\\, C] || C <- [$", $/, $\\, $b, $f, $n, $r, $t]], $"]),
-              ?assertEqual({ok, Expected}, jsone_encode:encode(Input))
+              ?assertEqual({ok, Expected}, jsone_encode:encode(Input)),
+              ?assertEqual({ok, Expected}, jsone_encode:encode(Input, [native_utf8]))
       end},
      {"string: contains multi-byte (UTF-8 encoded) characters",
       fun () ->