cowboy_multipart.erl 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. %% Copyright (c) 2011, Anthony Ramine <nox@dev-extend.eu>
  2. %%
  3. %% Permission to use, copy, modify, and/or distribute this software for any
  4. %% purpose with or without fee is hereby granted, provided that the above
  5. %% copyright notice and this permission notice appear in all copies.
  6. %%
  7. %% THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  8. %% WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  9. %% MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  10. %% ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  11. %% WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  12. %% ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  13. %% OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  14. %% @doc Multipart parser.
  15. -module(cowboy_multipart).
  16. -export([parser/1]).
  17. -export([content_disposition/1]).
  %% A part parser consumes a chunk of input and yields either the next
  %% part's headers, `eof', or a request for more data.
  -type part_parser() :: parser(more(part_result())).
  %% A fun that consumes a chunk of input.
  -type parser(Result) :: fun((binary()) -> Result).
  %% Either a final result or a request for more input.
  -type more(Result) :: Result | {more, parser(Result)}.
  -type part_result() :: headers() | eof.
  -type headers() :: {headers, http_headers(), body_cont()}.
  -type http_headers() :: [{binary(), binary()}].
  -type body_cont() :: cont(more(body_result())).
  %% A zero-arity continuation.
  -type cont(Result) :: fun(() -> Result).
  -type body_result() :: {body, binary(), body_cont()} | end_of_part().
  -type end_of_part() :: {end_of_part, cont(more(part_result()))}.
  -type disposition() :: {binary(), [{binary(), binary()}]}.
  29. %% API.
  %% @doc Build a multipart parser bound to the given boundary.
  %% The returned fun takes the first chunk of input as a binary and starts
  %% parsing it.
  -spec parser(binary()) -> part_parser().
  parser(Boundary) when is_binary(Boundary) ->
      Start = fun (Data) when is_binary(Data) ->
          parse(Data, Boundary)
      end,
      Start.
  %% @doc Parse a content disposition.
  %%
  %% On success the result is `{Disposition, Params}', where `Disposition' is
  %% the token handed back by `cowboy_http:token_ci/2' and `Params' is the
  %% parameter list handed back by `cowboy_http:params/2'.
  %%
  %% `{error, badarg}' is returned when the disposition token is empty or when
  %% unparsed data remains after the parameters. The spec lists this error
  %% tuple so that callers and Dialyzer see the real return contract.
  %% @todo Parse the MIME header instead of the HTTP one.
  -spec content_disposition(binary()) -> disposition() | {error, badarg}.
  content_disposition(Data) ->
      cowboy_http:token_ci(Data,
          fun (_Rest, <<>>) ->
                  % No disposition token at all.
                  {error, badarg};
              (Rest, Disposition) ->
                  cowboy_http:params(Rest,
                      fun (<<>>, Params) ->
                              {Disposition, Params};
                          (_Rest2, _) ->
                              % Trailing data after the parameters.
                              {error, badarg}
                      end)
          end).
  46. %% Internal.
  %% @doc Entry point of the multipart parser, skips over the preamble if any.
  %% Asks for more data until the input is long enough to tell whether it
  %% begins with the opening delimiter ("--" followed by the boundary).
  -spec parse(binary(), binary()) -> more(part_result()).
  parse(Bin, Boundary) when byte_size(Bin) >= byte_size(Boundary) + 2 ->
      Delimiter = <<"--", Boundary/binary>>,
      DelimiterSize = byte_size(Delimiter),
      Patterns = pattern(Boundary),
      case Bin of
          <<Delimiter:DelimiterSize/binary, AfterDelimiter/binary>> ->
              % Input opens with the initial boundary: there is no preamble.
              parse_boundary_tail(AfterDelimiter, Patterns);
          _ ->
              % Anything before the first boundary is preamble, skip it.
              skip(Bin, Patterns)
      end;
  parse(Bin, Boundary) ->
      % Too short to decide whether the data begins with a boundary.
      Retry = fun (NewBin) -> parse(NewBin, Boundary) end,
      more(Bin, Retry).
  -type pattern() :: {binary:cp(), non_neg_integer()}.
  -type patterns() :: {pattern(), pattern()}.

  %% @doc Return two compiled binary patterns with their sizes in bytes.
  %% The first matches the full delimiter, i.e. the boundary prepended with
  %% "\r\n--". The second matches every prefix of that delimiter and is used
  %% to detect a delimiter cut off at the end of the input. Both carry the
  %% byte size of the full delimiter.
  -spec pattern(binary()) -> patterns().
  pattern(Boundary) ->
      Delimiter = <<"\r\n--", Boundary/binary>>,
      DelimiterSize = byte_size(Delimiter),
      Full = binary:compile_pattern(Delimiter),
      Partial = binary:compile_pattern(prefixes(Delimiter)),
      {{Full, DelimiterSize}, {Partial, DelimiterSize}}.
  %% @doc Return all non-empty prefixes of a binary string, shortest first.
  %% The full string is the last element of the list.
  -spec prefixes(binary()) -> [binary()].
  prefixes(<<First, Tail/binary>>) ->
      prefixes(Tail, <<First>>).

  %% Extend `Seed' with every leading slice of `Tail', starting with the empty
  %% slice (which yields `Seed' itself) and ending with the whole of `Tail'.
  -spec prefixes(binary(), binary()) -> [binary()].
  prefixes(Tail, Seed) when is_binary(Tail) ->
      [<<Seed/binary, (binary:part(Tail, 0, N))/binary>>
          || N <- lists:seq(0, byte_size(Tail))].
  %% @doc Test if the input ends with a possible start of the boundary.
  %% The patterns are expected to have been returned from `pattern/1'.
  %% Returns the `{Position, Length}' of a boundary prefix that runs up to the
  %% very end of the input, or `nomatch' when there is none.
  -spec suffix_match(binary(), patterns()) -> nomatch | {integer(), integer()}.
  suffix_match(Bin, {_Boundary, {PrefixPat, MaxLen}}) ->
      Size = byte_size(Bin),
      % Only the last MaxLen bytes (or fewer, for short input) can matter.
      Window = min(Size, MaxLen),
      suffix_match(Bin, PrefixPat, Size, -Window).

  -spec suffix_match(binary(), binary:cp(), non_neg_integer(), 0|neg_integer()) ->
      nomatch | {integer(), integer()}.
  suffix_match(_Bin, _Pat, _Size, 0) ->
      % The search window has shrunk to nothing.
      nomatch;
  suffix_match(Bin, Pat, Size, Window) when Window < 0 ->
      % A negative scope length selects the bytes that end at offset Size.
      case binary:match(Bin, Pat, [{scope, {Size, Window}}]) of
          nomatch ->
              nomatch;
          {Start, Length} = Hit when Start + Length =:= Size ->
              % The match runs up to the end of the input.
              Hit;
          {_Start, Length} ->
              % The match ends too early; shrink the window and search again.
              suffix_match(Bin, Pat, Size, Window + Length)
      end.
  %% @doc Parse what follows a boundary on the same line.
  %% If the boundary is directly followed by "--", <em>eof</em> is returned and
  %% parsing is finished. Otherwise the rest of the line is consumed by
  %% `parse_boundary_eol/2'. At least two bytes are needed to decide.
  -spec parse_boundary_tail(binary(), patterns()) -> more(part_result()).
  parse_boundary_tail(<<"--", _/binary>>, _Pattern) ->
      % Closing delimiter: the multipart body ends here.
      eof;
  parse_boundary_tail(Bin, Pattern) when byte_size(Bin) >= 2 ->
      % No dashes after the boundary, go on with the rest of the line.
      parse_boundary_eol(Bin, Pattern);
  parse_boundary_tail(Bin, Pattern) ->
      % The boundary may still be followed by "--", need more data.
      more(Bin, fun (NewBin) -> parse_boundary_tail(NewBin, Pattern) end).
  %% @doc Skip the remainder of the boundary line up to its CRLF.
  %% Everything before the first CRLF is dropped. The input from that CRLF on
  %% is passed through `cowboy_http:whitespace/2' and then given to
  %% `parse_boundary_crlf/2'.
  -spec parse_boundary_eol(binary(), patterns()) -> more(part_result()).
  parse_boundary_eol(Bin, Pattern) ->
      case binary:match(Bin, <<"\r\n">>) of
          nomatch ->
              % No CRLF yet. Keep only the last byte: it may be the CR of a
              % CRLF that is split across two chunks.
              KeepFrom = max(byte_size(Bin) - 1, 0),
              {_Dropped, LastByte} = split_binary(Bin, KeepFrom),
              more(LastByte,
                  fun (NewBin) -> parse_boundary_eol(NewBin, Pattern) end);
          {EolPos, _} ->
              % End of line found, drop everything in front of it.
              {_Junk, FromEol} = split_binary(Bin, EolPos),
              cowboy_http:whitespace(FromEol,
                  fun (AfterLwsp) -> parse_boundary_crlf(AfterLwsp, Pattern) end)
      end.
  %% Consume the CRLF that ends the boundary line and start parsing headers.
  -spec parse_boundary_crlf(binary(), patterns()) -> more(part_result()).
  parse_boundary_crlf(Bin, Pattern) ->
      case Bin of
          <<"\r\n", AfterEol/binary>> ->
              % This function is only reached from parse_boundary_eol/2 once
              % a CRLF has been found, so the data is never too short here.
              parse_headers(AfterEol, Pattern);
          _ ->
              % Unspecified behaviour: RFC 2046 doesn't say what to do when
              % LWSP is not directly followed by a new line. Here it is
              % treated as part of the boundary line, so the end of line is
              % searched for again.
              parse_boundary_eol(Bin, Pattern)
      end.
  %% Parse the headers of a part, starting with no headers collected.
  -spec parse_headers(binary(), patterns()) -> more(part_result()).
  parse_headers(Bin, Pattern) ->
      parse_headers(Bin, Pattern, []).

  %% Decode one header per iteration. Headers are accumulated in reverse and
  %% put back in order when the end of the header section is reached.
  -spec parse_headers(binary(), patterns(), http_headers()) -> more(part_result()).
  parse_headers(Bin, Pattern, Acc) ->
      case erlang:decode_packet(httph_bin, Bin, []) of
          {more, _} ->
              % Incomplete header line, wait for the rest of it.
              more(Bin, fun (NewBin) -> parse_headers(NewBin, Pattern, Acc) end);
          {ok, http_eoh, Rest} ->
              % Empty line: the headers are complete and the body follows.
              {headers, lists:reverse(Acc), fun () -> parse_body(Rest, Pattern) end};
          {ok, {http_error, _}, _} ->
              % Skip malformed parts.
              skip(Bin, Pattern);
          {ok, {http_header, _, Name, _, Value}, Rest} ->
              parse_headers(Rest, Pattern, [{header_name(Name), Value} | Acc])
      end.

  %% Lowercase a header name; the decoder may return it as an atom or a binary.
  -spec header_name(atom() | binary()) -> binary().
  header_name(Name) when is_atom(Name) ->
      cowboy_bstr:to_lower(atom_to_binary(Name, latin1));
  header_name(Name) ->
      cowboy_bstr:to_lower(Name).
  %% Parse the body of a part. Returns a chunk of body, the end of the part,
  %% or a request for more data when the input is shorter than a delimiter.
  -spec parse_body(binary(), patterns()) -> more(body_result()).
  parse_body(Bin, Pattern = {{BoundaryPat, BoundarySize}, _})
          when byte_size(Bin) >= BoundarySize ->
      case binary:match(Bin, BoundaryPat) of
          nomatch ->
              parse_body_no_boundary(Bin, Pattern);
          {0, _} ->
              % The input starts with the delimiter: the part has ended.
              <<_:BoundarySize/binary, AfterBoundary/binary>> = Bin,
              end_of_part(AfterBoundary, Pattern);
          {Start, _} ->
              % Delimiter found further on: this is the last chunk of body
              % returned for this part.
              <<Chunk:Start/binary, _:BoundarySize/binary, AfterBoundary/binary>> = Bin,
              Finished = end_of_part(AfterBoundary, Pattern),
              {body, Chunk, fun () -> Finished end}
      end;
  parse_body(Bin, Pattern) ->
      more(Bin, fun (NewBin) -> parse_body(NewBin, Pattern) end).

  %% No complete delimiter in the input. Return as much body as is safe and
  %% hold back a trailing piece that could be the start of a delimiter.
  -spec parse_body_no_boundary(binary(), patterns()) -> more(body_result()).
  parse_body_no_boundary(Bin, Pattern) ->
      case suffix_match(Bin, Pattern) of
          nomatch ->
              % The input does not end with a delimiter prefix, so all of it
              % is body. This avoids copying the next input onto the tail of
              % the current one.
              {body, Bin, fun () -> parse_body(<<>>, Pattern) end};
          {Start, _Len} ->
              % suffix_match/2 only reports matches that reach the end of
              % the input, so everything from Start on is held back.
              {Chunk, Held} = split_binary(Bin, Start),
              {body, Chunk, fun () -> parse_body(Held, Pattern) end}
      end.
  %% Build the `end_of_part' result. Its continuation resumes parsing with the
  %% data that follows the boundary.
  -spec end_of_part(binary(), patterns()) -> end_of_part().
  end_of_part(Bin, Pattern) ->
      Resume = fun () -> parse_boundary_tail(Bin, Pattern) end,
      {end_of_part, Resume}.
  %% Discard input up to and including the next delimiter, then parse what
  %% follows it.
  -spec skip(binary(), patterns()) -> more(part_result()).
  skip(Bin, Pattern = {{BoundaryPat, BoundarySize}, _}) ->
      case binary:match(Bin, BoundaryPat) of
          nomatch ->
              % Boundary not found, need more data. Keep the last
              % BoundarySize - 1 bytes, which may hold the start of a
              % delimiter that continues in the next chunk.
              Drop = max(byte_size(Bin) - BoundarySize + 1, 0),
              {_Discarded, Kept} = split_binary(Bin, Drop),
              more(Kept, fun (NewBin) -> skip(NewBin, Pattern) end);
          {Start, _} ->
              % Boundary found, proceed with parsing of the next part.
              {_Skipped, AfterBoundary} = split_binary(Bin, Start + BoundarySize),
              parse_boundary_tail(AfterBoundary, Pattern)
      end.
  %% Build a `{more, Fun}' result. Unconsumed input, if any, is prepended to
  %% the next chunk before the continuation is called.
  -spec more(binary(), parser(T)) -> {more, parser(T)}.
  more(Pending, Cont) ->
      case Pending of
          <<>> ->
              % Nothing is buffered, the continuation can be used as is.
              {more, Cont};
          _ ->
              Resume = fun (Fresh) when is_binary(Fresh) ->
                  Cont(<<Pending/binary, Fresh/binary>>)
              end,
              {more, Resume}
      end.
  215. %% Tests.
  216. -ifdef(TEST).
  %% Each case is {Body, ExpectedParts}. The body is parsed with the boundary
  %% "boundary" and the collected parts must match the expectation.
  multipart_test_() ->
      Cases = [
          {<<"--boundary--">>, []},
          {<<"preamble\r\n--boundary--">>, []},
          {<<"--boundary--\r\nepilogue">>, []},
          {<<"\r\n--boundary\r\nA:b\r\nC:d\r\n\r\n\r\n--boundary--">>,
              [{[{<<"a">>, <<"b">>}, {<<"c">>, <<"d">>}], <<>>}]},
          {<<"--boundary\r\nX-Name:answer\r\n\r\n42"
             "\r\n--boundary\r\nServer:Cowboy\r\n\r\nIt rocks!\r\n"
             "\r\n--boundary--">>,
              [{[{<<"x-name">>, <<"answer">>}], <<"42">>},
               {[{<<"server">>, <<"Cowboy">>}], <<"It rocks!\r\n">>}]}
      ],
      [{title(Body), fun () -> Expected = acc_multipart(Body) end}
          || {Body, Expected} <- Cases].
  %% Parse a complete multipart body with the boundary "boundary" and return
  %% its parts as a list of {Headers, Body} tuples.
  acc_multipart(V) ->
      Parse = parser(<<"boundary">>),
      acc_multipart(Parse(V), []).

  %% Drive the parser to the end. The part currently being read sits at the
  %% head of the accumulator with its body chunks in reverse order.
  acc_multipart(eof, Parts) ->
      lists:reverse(Parts);
  acc_multipart({headers, Headers, Next}, Parts) ->
      acc_multipart(Next(), [{Headers, []} | Parts]);
  acc_multipart({body, Chunk, Next}, [{Headers, Chunks} | Parts]) ->
      acc_multipart(Next(), [{Headers, [Chunk | Chunks]} | Parts]);
  acc_multipart({end_of_part, Next}, [{Headers, Chunks} | Parts]) ->
      Whole = list_to_binary(lists:reverse(Chunks)),
      acc_multipart(Next(), [{Headers, Whole} | Parts]).
  %% Each case is {HeaderValue, ExpectedDisposition}.
  content_disposition_test_() ->
      Cases = [
          {<<"form-data; name=id">>,
              {<<"form-data">>, [{<<"name">>, <<"id">>}]}},
          {<<"inline">>,
              {<<"inline">>, []}},
          {<<"attachment; \tfilename=brackets-slides.pdf">>,
              {<<"attachment">>, [{<<"filename">>, <<"brackets-slides.pdf">>}]}}
      ],
      [{title(Value), fun () -> Expected = content_disposition(Value) end}
          || {Value, Expected} <- Cases].
  %% Make a test title out of the input by replacing tab, CR and LF with the
  %% two-character sequences \t, \r and \n.
  title(Bin) ->
      Escape = fun (Subject, Char, Escaped) ->
          re:replace(Subject, Char, Escaped, [global])
      end,
      NoTabs = Escape(Bin, "\t", "\\\\t"),
      NoCrs = Escape(NoTabs, "\r", "\\\\r"),
      NoLfs = Escape(NoCrs, "\n", "\\\\n"),
      iolist_to_binary(NoLfs).
  %% Each case is {Expected, Input, Boundary}. Expected is what suffix_match/2
  %% must return for Input with the patterns built from Boundary.
  suffix_test_() ->
      Cases = [
          {nomatch, <<>>, <<"ABC">>},
          {{0, 1}, <<"\r">>, <<"ABC">>},
          {{0, 2}, <<"\r\n">>, <<"ABC">>},
          {{0, 4}, <<"\r\n--">>, <<"ABC">>},
          {{0, 5}, <<"\r\n--A">>, <<"ABC">>},
          {{0, 6}, <<"\r\n--AB">>, <<"ABC">>},
          {{0, 7}, <<"\r\n--ABC">>, <<"ABC">>},
          {nomatch, <<"\r\n--AB1">>, <<"ABC">>},
          {{1, 1}, <<"1\r">>, <<"ABC">>},
          {{2, 2}, <<"12\r\n">>, <<"ABC">>},
          {{3, 4}, <<"123\r\n--">>, <<"ABC">>}
      ],
      [fun () -> Expected = suffix_match(Input, pattern(Boundary)) end
          || {Expected, Input, Boundary} <- Cases].
  281. -endif.