cowboy_multipart.erl 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
  1. %% Copyright (c) 2011, Anthony Ramine <nox@dev-extend.eu>
  2. %%
  3. %% Permission to use, copy, modify, and/or distribute this software for any
  4. %% purpose with or without fee is hereby granted, provided that the above
  5. %% copyright notice and this permission notice appear in all copies.
  6. %%
  7. %% THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  8. %% WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  9. %% MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  10. %% ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  11. %% WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  12. %% ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  13. %% OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  14. %% @doc Multipart parser.
  15. -module(cowboy_multipart).
  16. -export([parser/1]).
  17. -export([content_disposition/1]).
  18. -type part_parser() :: parser(more(part_result())).
  19. -type parser(T) :: fun((binary()) -> T).
  20. -type more(T) :: T | {more, parser(T)}.
  21. -type part_result() :: headers() | eof.
  22. -type headers() :: {headers, http_headers(), body_cont()}.
  23. -type http_headers() :: [{binary(), binary()}].
  24. -type body_cont() :: cont(more(body_result())).
  25. -type cont(T) :: fun(() -> T).
  26. -type body_result() :: {body, binary(), body_cont()} | end_of_part().
  27. -type end_of_part() :: {end_of_part, cont(more(part_result()))}.
  28. -type disposition() :: {binary(), [{binary(), binary()}]}.
  29. -ifdef(TEST).
  30. -include_lib("eunit/include/eunit.hrl").
  31. -endif.
  32. %% API.
  %% @doc Build a streaming multipart parser bound to `Boundary'.
  %%
  %% The returned fun takes the first chunk of the multipart body and hands it
  %% to the internal entry point together with the boundary. The boundary is
  %% given without its leading "--"; the parser adds that prefix itself when
  %% looking for delimiters. Both the boundary and the chunk must be binaries,
  %% otherwise the call fails with `function_clause'.
  -spec parser(binary()) -> part_parser().
  parser(Boundary) when is_binary(Boundary) ->
      Start = fun (Chunk) when is_binary(Chunk) ->
          parse(Chunk, Boundary)
      end,
      Start.
  %% @doc Parse a content disposition.
  %%
  %% The value is tokenised with `cowboy_http:token_ci/2' and the remainder is
  %% handed to `cowboy_http:params/2'. On success the result is
  %% `{Disposition, Params}'. `{error, badarg}' is returned when the
  %% disposition token is empty or when data is left over after the
  %% parameters.
  %% @todo Parse the MIME header instead of the HTTP one.
  -spec content_disposition(binary()) -> disposition() | {error, badarg}.
  content_disposition(Data) ->
      cowboy_http:token_ci(Data,
          % An empty token means there is no disposition type at all.
          fun (_Rest, <<>>) -> {error, badarg};
              (Rest, Disposition) ->
                  cowboy_http:params(Rest,
                      % Only accept the value when it was consumed entirely.
                      fun (<<>>, Params) -> {Disposition, Params};
                          (_Rest2, _) -> {error, badarg}
                      end)
          end).
  49. %% Internal.
  %% @doc Entry point of the multipart parser.
  %%
  %% Waits until at least `byte_size(Boundary) + 2' bytes are buffered, which
  %% is enough to tell whether the input opens directly with "--Boundary".
  %% If it does, parsing continues right after that delimiter; otherwise the
  %% input is treated as preamble and skipped up to the first delimiter.
  -spec parse(binary(), binary()) -> more(part_result()).
  parse(Bin, Boundary) when byte_size(Bin) >= byte_size(Boundary) + 2 ->
      Compiled = pattern(Boundary),
      Opening = <<"--", Boundary/binary>>,
      OpeningSize = byte_size(Opening),
      case Bin of
          <<Opening:OpeningSize/binary, AfterOpening/binary>> ->
              % No preamble: the very first bytes are the initial boundary.
              parse_boundary_tail(AfterOpening, Compiled);
          _ ->
              % Something precedes the first boundary: skip the preamble.
              skip(Bin, Compiled)
      end;
  parse(Bin, Boundary) ->
      % Too short to decide whether the data begins with a boundary.
      more(Bin, fun (Extended) -> parse(Extended, Boundary) end).
  66. -type pattern() :: {binary:cp(), non_neg_integer()}.
  67. -type patterns() :: {pattern(), pattern()}.
  %% @doc Compile the two search patterns used by the parser.
  %%
  %% The delimiter is the boundary prepended with "\r\n--". The first element
  %% of the result is the compiled delimiter; the second is a compiled pattern
  %% matching every prefix of the delimiter (the delimiter itself included).
  %% Both are paired with the byte size of the full delimiter.
  -spec pattern(binary()) -> patterns().
  pattern(Boundary) ->
      Delimiter = <<"\r\n--", Boundary/binary>>,
      DelimiterSize = byte_size(Delimiter),
      Full = binary:compile_pattern(Delimiter),
      Partial = binary:compile_pattern(prefixes(Delimiter)),
      {{Full, DelimiterSize}, {Partial, DelimiterSize}}.
  %% @doc Return every non-empty prefix of a binary string, shortest first.
  %% The full string is the last element. The input must hold at least one
  %% byte; an empty binary matches no clause.
  -spec prefixes(binary()) -> [binary()].
  prefixes(<<First, Tail/binary>>) ->
      prefixes(Tail, <<First>>).

  %% @doc Return `Acc' followed by `Acc' extended one byte at a time with the
  %% bytes of the first argument, ending with the two concatenated.
  -spec prefixes(binary(), binary()) -> [binary()].
  prefixes(Rest, Acc) ->
      Whole = <<Acc/binary, Rest/binary>>,
      Shortest = byte_size(Acc),
      Longest = byte_size(Whole),
      [binary:part(Whole, 0, Size) || Size <- lists:seq(Shortest, Longest)].
  %% @doc Test whether the input ends with a possible start of the boundary.
  %%
  %% The patterns are expected to have been returned from `pattern/1'; only
  %% the prefix pattern and its size are used. Returns `{Pos, Len}' for a
  %% delimiter prefix that runs up to the very end of `Bin', or `nomatch'.
  -spec suffix_match(binary(), patterns()) -> nomatch | {integer(), integer()}.
  suffix_match(Bin, {_Boundary, {Pat, Len}}) ->
      Size = byte_size(Bin),
      % Search at most the last `Len' bytes, expressed as a negative scope
      % length counted back from the end of the binary.
      Window = -min(Size, Len),
      suffix_match(Bin, Pat, Size, Window).

  -spec suffix_match(binary(), tuple(), non_neg_integer(), 0|neg_integer()) ->
      nomatch | {integer(), integer()}.
  suffix_match(_Bin, _Pat, _Size, 0) ->
      % Nothing left to search.
      nomatch;
  suffix_match(Bin, Pat, Size, Window) when Window < 0 ->
      Found = binary:match(Bin, Pat, [{scope, {Size, Window}}]),
      suffix_match_result(Found, Bin, Pat, Size, Window).

  %% Interpret one `binary:match/3' result for `suffix_match/4'.
  suffix_match_result(nomatch, _Bin, _Pat, _Size, _Window) ->
      nomatch;
  suffix_match_result({Pos, Len} = Part, _Bin, _Pat, Size, _Window)
          when Pos + Len =:= Size ->
      % The match ends exactly at the end of the input: it is a suffix.
      Part;
  suffix_match_result({_Pos, Len}, Bin, Pat, Size, Window) ->
      % The match stops short of the end; shrink the window by the matched
      % length and search again.
      suffix_match(Bin, Pat, Size, Window + Len).
  %% @doc Parse what follows a boundary on its line.
  %%
  %% Two bytes are needed to decide: if they are "--" the closing delimiter
  %% was reached and <em>eof</em> is returned, ending the parse. Anything else
  %% is handed over to the end-of-line search.
  -spec parse_boundary_tail(binary(), patterns()) -> more(part_result()).
  parse_boundary_tail(<<"--", _Epilogue/binary>>, _Pattern) ->
      % Boundary is followed by "--": this was the last delimiter.
      eof;
  parse_boundary_tail(Bin, Pattern) when byte_size(Bin) >= 2 ->
      % No dashes after the boundary; drop the remainder of the line.
      parse_boundary_eol(Bin, Pattern);
  parse_boundary_tail(Bin, Pattern) ->
      % Fewer than two bytes: cannot yet tell whether "--" follows.
      more(Bin, fun (Extended) -> parse_boundary_tail(Extended, Pattern) end).
  %% @doc Discard the rest of the boundary line up to its CRLF.
  %%
  %% When a CRLF is present, everything before it is dropped and the data
  %% starting at the CRLF is passed through `cowboy_http:whitespace/2' before
  %% reaching `parse_boundary_crlf/2'. When no CRLF is present, only the last
  %% byte is kept (it could be the "\r" of a CRLF split across chunks) and
  %% more input is requested.
  -spec parse_boundary_eol(binary(), patterns()) -> more(part_result()).
  parse_boundary_eol(Bin, Pattern) ->
      case binary:match(Bin, <<"\r\n">>) of
          nomatch ->
              % CRLF not found in the given binary.
              KeepFrom = max(byte_size(Bin) - 1, 0),
              {_Dropped, LastByte} = split_binary(Bin, KeepFrom),
              more(LastByte,
                  fun (Extended) -> parse_boundary_eol(Extended, Pattern) end);
          {EolStart, _Length} ->
              % End of line found.
              {_Ignored, FromEol} = split_binary(Bin, EolStart),
              Continue = fun (Stripped) ->
                  parse_boundary_crlf(Stripped, Pattern)
              end,
              cowboy_http:whitespace(FromEol, Continue)
      end.
  %% @doc Consume the CRLF ending the boundary line and start on the headers.
  %%
  %% Invoked as the continuation that `parse_boundary_eol/2' gives to
  %% `cowboy_http:whitespace/2'.
  -spec parse_boundary_crlf(binary(), patterns()) -> more(part_result()).
  parse_boundary_crlf(Bin, Pattern) ->
      case Bin of
          <<"\r\n", AfterEol/binary>> ->
              % The boundary line is complete; the part headers come next.
              parse_headers(AfterEol, Pattern);
          _ ->
              % Unspecified behaviour here: RFC 2046 doesn't say what to do
              % when LWSP is not followed directly by a new line. In this
              % implementation it is considered part of the boundary line, so
              % the end of line is searched for again.
              parse_boundary_eol(Bin, Pattern)
      end.
  %% @doc Parse the headers of a part, starting with none collected.
  -spec parse_headers(binary(), patterns()) -> more(part_result()).
  parse_headers(Bin, Pattern) ->
      parse_headers(Bin, Pattern, []).

  %% @doc Decode header lines one at a time with `erlang:decode_packet/3'.
  %%
  %% Header names are lowercased; headers are accumulated in reverse and put
  %% back in order once the end of the header block is reached. The result
  %% then carries a continuation that parses the body.
  -spec parse_headers(binary(), patterns(), http_headers()) -> more(part_result()).
  parse_headers(Bin, Pattern, Acc) ->
      case erlang:decode_packet(httph_bin, Bin, []) of
          {more, _} ->
              % Incomplete header line: wait for more input, keeping the
              % headers decoded so far.
              more(Bin,
                  fun (Extended) -> parse_headers(Extended, Pattern, Acc) end);
          {ok, http_eoh, AfterHeaders} ->
              Headers = lists:reverse(Acc),
              {headers, Headers, fun () -> parse_body(AfterHeaders, Pattern) end};
          {ok, {http_error, _}, _} ->
              % Skip malformed parts.
              skip(Bin, Pattern);
          {ok, {http_header, _, Field, _, Value}, Remaining} ->
              Header = {lower_header_name(Field), Value},
              parse_headers(Remaining, Pattern, [Header | Acc])
      end.

  %% Lowercase a decoded header name, which is either an atom or a binary.
  lower_header_name(Field) when is_atom(Field) ->
      lower_header_name(atom_to_binary(Field, latin1));
  lower_header_name(Field) ->
      cowboy_bstr:to_lower(Field).
  %% @doc Extract body data of the current part from `Bin'.
  %%
  %% `Pattern' is the pair returned by `pattern/1'. At least one full
  %% delimiter's worth of bytes must be buffered before any decision is made;
  %% with less, more input is requested. The outcome is one of:
  %% <ul>
  %%  <li>`{end_of_part, Cont}' when the delimiter is at the very start;</li>
  %%  <li>`{body, Data, Cont}' with the data preceding a delimiter found
  %%    further in, `Cont' then yielding the end of part;</li>
  %%  <li>`{body, Data, Cont}' with the data that is certain not to belong
  %%    to a delimiter when none was found, `Cont' resuming body parsing
  %%    with whatever had to be held back.</li>
  %% </ul>
  -spec parse_body(binary(), patterns()) -> more(body_result()).
  parse_body(Bin, Pattern = {{P, PSize}, _}) when byte_size(Bin) >= PSize ->
      case binary:match(Bin, P) of
          {0, _Length} ->
              % Delimiter right at the start: no body data left in this part.
              <<_:PSize/binary, Rest/binary>> = Bin,
              end_of_part(Rest, Pattern);
          {BoundaryStart, _Length} ->
              % Boundary found, this is the latest partial body that will be
              % returned for this part.
              <<PBody:BoundaryStart/binary, _:PSize/binary, Rest/binary>> = Bin,
              FResult = end_of_part(Rest, Pattern),
              {body, PBody, fun () -> FResult end};
          nomatch ->
              case suffix_match(Bin, Pattern) of
                  nomatch ->
                      %% Prefix of boundary not found at end of input. It's
                      %% safe to return the whole binary. Saves copying of
                      %% next input onto tail of current input binary.
                      {body, Bin, fun () -> parse_body(<<>>, Pattern) end};
                  {BoundaryStart, Len} ->
                      %% The input ends with what may be the beginning of a
                      %% delimiter. Everything before it is body data; the
                      %% possible delimiter prefix is held back so it can be
                      %% re-examined once more input is appended to it.
                      PBody = binary:part(Bin, 0, BoundaryStart),
                      Rest = binary:part(Bin, BoundaryStart, Len),
                      {body, PBody, fun () -> parse_body(Rest, Pattern) end}
              end
      end;
  parse_body(Bin, Pattern) ->
      % Not enough data to rule a delimiter in or out.
      more(Bin, fun (NewBin) -> parse_body(NewBin, Pattern) end).
  %% @doc Build the end-of-part result. `Bin' is the data following the
  %% delimiter; the continuation resumes parsing there, on the remainder of
  %% the boundary line.
  -spec end_of_part(binary(), patterns()) -> end_of_part().
  end_of_part(Bin, Pattern) ->
      Next = fun () -> parse_boundary_tail(Bin, Pattern) end,
      {end_of_part, Next}.
  %% @doc Discard input up to and including the next delimiter.
  %%
  %% Used for the preamble and for parts with malformed headers. When no
  %% delimiter is present, only the last `PSize - 1' bytes are kept, since a
  %% delimiter split across chunks could begin there, and more input is
  %% requested.
  -spec skip(binary(), patterns()) -> more(part_result()).
  skip(Bin, Pattern = {{Delimiter, DelimiterSize}, _}) ->
      case binary:match(Bin, Delimiter) of
          nomatch ->
              % Boundary not found, need more data.
              KeepFrom = max(byte_size(Bin) - DelimiterSize + 1, 0),
              {_Dropped, Tail} = split_binary(Bin, KeepFrom),
              more(Tail, fun (Extended) -> skip(Extended, Pattern) end);
          {DelimiterStart, _Length} ->
              % Boundary found, proceed with parsing of the next part.
              {_Skipped, AfterDelimiter} =
                  split_binary(Bin, DelimiterStart + DelimiterSize),
              parse_boundary_tail(AfterDelimiter, Pattern)
      end.
  %% @doc Ask the caller for more input.
  %%
  %% With nothing buffered, the given parser is returned as is. Otherwise it
  %% is wrapped so that the buffered bytes are prepended to the next chunk
  %% before the parser runs; the wrapper only accepts binaries.
  -spec more(binary(), parser(T)) -> {more, parser(T)}.
  more(Buffered, Parser) ->
      case Buffered of
          <<>> ->
              {more, Parser};
          _ ->
              Resume = fun (Incoming) when is_binary(Incoming) ->
                  Parser(<<Buffered/binary, Incoming/binary>>)
              end,
              {more, Resume}
      end.
  218. %% Tests.
  219. -ifdef(TEST).
  %% Each case feeds a complete multipart body to the parser in one chunk
  %% and compares the collected {Headers, Body} pairs with the expectation.
  multipart_test_() ->
      %% {Body, Result}
      Cases = [
          {<<"--boundary--">>, []},
          {<<"preamble\r\n--boundary--">>, []},
          {<<"--boundary--\r\nepilogue">>, []},
          {<<"\r\n--boundary\r\nA:b\r\nC:d\r\n\r\n\r\n--boundary--">>,
              [{[{<<"a">>, <<"b">>}, {<<"c">>, <<"d">>}], <<>>}]},
          {
              <<
                  "--boundary\r\nX-Name:answer\r\n\r\n42"
                  "\r\n--boundary\r\nServer:Cowboy\r\n\r\nIt rocks!\r\n"
                  "\r\n--boundary--"
              >>,
              [
                  {[{<<"x-name">>, <<"answer">>}], <<"42">>},
                  {[{<<"server">>, <<"Cowboy">>}], <<"It rocks!\r\n">>}
              ]
          }
      ],
      lists:map(
          fun ({Body, Expected}) ->
              % The match inside the test fun is the assertion.
              {title(Body), fun () -> Expected = acc_multipart(Body) end}
          end,
          Cases).
  %% Parse a whole body with the boundary "boundary" and return the list of
  %% {Headers, Body} pairs, in order of appearance.
  acc_multipart(V) ->
      Parser = parser(<<"boundary">>),
      acc_multipart(Parser(V), []).

  %% Drive the parser continuations until eof. The part being read is kept
  %% at the head of the accumulator, its body chunks in reverse order.
  acc_multipart(eof, Parts) ->
      lists:reverse(Parts);
  acc_multipart({end_of_part, Next}, [{Headers, Chunks}|Parts]) ->
      Body = iolist_to_binary(lists:reverse(Chunks)),
      acc_multipart(Next(), [{Headers, Body}|Parts]);
  acc_multipart({body, Chunk, Next}, [{Headers, Chunks}|Parts]) ->
      acc_multipart(Next(), [{Headers, [Chunk|Chunks]}|Parts]);
  acc_multipart({headers, Headers, Next}, Parts) ->
      acc_multipart(Next(), [{Headers, []}|Parts]).
  %% Each case pairs a header value with the disposition it must parse to.
  content_disposition_test_() ->
      %% {Disposition, Result}
      Cases = [
          {<<"form-data; name=id">>, {<<"form-data">>, [{<<"name">>, <<"id">>}]}},
          {<<"inline">>, {<<"inline">>, []}},
          {<<"attachment; \tfilename=brackets-slides.pdf">>,
              {<<"attachment">>, [{<<"filename">>, <<"brackets-slides.pdf">>}]}}
      ],
      lists:map(
          fun ({Value, Expected}) ->
              % The match inside the test fun is the assertion.
              {title(Value), fun () -> Expected = content_disposition(Value) end}
          end,
          Cases).
  %% Make a printable test title out of a binary by replacing tab, carriage
  %% return and line feed characters with their backslash-escaped spelling.
  title(Bin) ->
      NoTabs = re:replace(Bin, "\t", "\\\\t", [global]),
      NoReturns = re:replace(NoTabs, "\r", "\\\\r", [global]),
      NoNewlines = re:replace(NoReturns, "\n", "\\\\n", [global]),
      iolist_to_binary(NoNewlines).
  %% Check `suffix_match/2' against packets ending with various portions of
  %% the delimiter built from the given boundary.
  suffix_test_() ->
      %% {Expected, Packet, Boundary}
      Cases = [
          {nomatch, <<>>, <<"ABC">>},
          {{0, 1}, <<"\r">>, <<"ABC">>},
          {{0, 2}, <<"\r\n">>, <<"ABC">>},
          {{0, 4}, <<"\r\n--">>, <<"ABC">>},
          {{0, 5}, <<"\r\n--A">>, <<"ABC">>},
          {{0, 6}, <<"\r\n--AB">>, <<"ABC">>},
          {{0, 7}, <<"\r\n--ABC">>, <<"ABC">>},
          {nomatch, <<"\r\n--AB1">>, <<"ABC">>},
          {{1, 1}, <<"1\r">>, <<"ABC">>},
          {{2, 2}, <<"12\r\n">>, <<"ABC">>},
          {{3, 4}, <<"123\r\n--">>, <<"ABC">>}
      ],
      [?_assertEqual(Expected, suffix_match(Packet, pattern(Boundary))) ||
          {Expected, Packet, Boundary} <- Cases].
  283. -endif.