syn_benchmark.erl

%% ==========================================================================================================
%% Syn - A global Process Registry and Process Group manager.
%%
%% The MIT License (MIT)
%%
%% Copyright (c) 2019-2021 Roberto Ostinelli <roberto@ostinelli.net> and Neato Robotics, Inc.
%%
%% Permission is hereby granted, free of charge, to any person obtaining a copy
%% of this software and associated documentation files (the "Software"), to deal
%% in the Software without restriction, including without limitation the rights
%% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
%% copies of the Software, and to permit persons to whom the Software is
%% furnished to do so, subject to the following conditions:
%%
%% The above copyright notice and this permission notice shall be included in
%% all copies or substantial portions of the Software.
%%
%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
%% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
%% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
%% THE SOFTWARE.
%% ==========================================================================================================
-module(syn_benchmark).

%% API
-export([
    start/0,
    start_processes/1,
    process_loop/0,
    register_on_node/4,
    unregister_on_node/4,
    join_on_node/3,
    leave_on_node/3,
    wait_registry_propagation/1,
    wait_groups_propagation/1
]).
-export([
    start_profiling/0,
    stop_profiling/0
]).

%% macros
-define(TEST_GROUP_NAME, <<"test-group">>).

%% ===================================================================
%% API
%% ===================================================================

%% example run: `PROCESS_COUNT=100000 WORKERS_PER_NODE=100 NODES_COUNT=2 make bench`
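%%
%% Recognized environment variables (defaults as read in start/0 below):
%%   PROCESS_COUNT    - total number of processes spread across the slave nodes (default 100000)
%%   WORKERS_PER_NODE - concurrent workers performing operations on each node (default 1)
%%   NODES_COUNT      - number of slave nodes to start (default 1)
%%   SKIP_REGISTRY    - when set (to any value), skip the registry benchmark
%%   SKIP_GROUPS      - when set (to any value), skip the process groups benchmark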
start() ->
    %% init
    ProcessCount = list_to_integer(os:getenv("PROCESS_COUNT", "100000")),
    WorkersPerNode = list_to_integer(os:getenv("WORKERS_PER_NODE", "1")),
    SlavesCount = list_to_integer(os:getenv("NODES_COUNT", "1")),
    SkipRegistry = case os:getenv("SKIP_REGISTRY") of false -> false; _ -> true end,
    SkipGroups = case os:getenv("SKIP_GROUPS") of false -> false; _ -> true end,
    ProcessesPerNode = round(ProcessCount / SlavesCount),
    io:format("-----> Starting benchmark~n"),
    io:format(" --> Nodes: ~w / ~w slave(s)~n", [SlavesCount + 1, SlavesCount]),
    io:format(" --> Total processes: ~w (~w / slave node)~n", [ProcessCount, ProcessesPerNode]),
    io:format(" --> Workers per node: ~w~n~n", [WorkersPerNode]),
    %% start nodes
    NodesInfo = lists:foldl(fun(I, Acc) ->
        %% start slave
        CountBin = integer_to_binary(I),
        NodeShortName = binary_to_atom(<<"slave_", CountBin/binary>>),
        {ok, Node} = ct_slave:start(NodeShortName, [
            {boot_timeout, 10},
            {monitor_master, true}
        ]),
        %% add code path
        CodePath = code:get_path(),
        true = rpc:call(Node, code, set_path, [CodePath]),
        %% start syn
        rpc:call(Node, syn, start, []),
        %% gather data
        FromName = (I - 1) * ProcessesPerNode + 1,
        ToName = FromName + ProcessesPerNode - 1,
        %% fold
        [{Node, FromName, ToName} | Acc]
    end, [], lists:seq(1, SlavesCount)),
    %% start syn locally
    ok = syn:start(),
    timer:sleep(1000),
    CollectorPid = self(),
    %% start processes
    PidsMap = lists:foldl(fun({Node, _FromName, _ToName}, Acc) ->
        Pids = rpc:call(Node, ?MODULE, start_processes, [ProcessesPerNode]),
        maps:put(Node, Pids, Acc)
    end, #{}, NodesInfo),
    case SkipRegistry of
        false ->
            %% start registration
            lists:foreach(fun({Node, FromName, _ToName}) ->
                Pids = maps:get(Node, PidsMap),
                rpc:cast(Node, ?MODULE, register_on_node, [CollectorPid, WorkersPerNode, FromName, Pids])
            end, NodesInfo),
            %% wait
            RegRemoteNodesTimes = wait_from_all_remote_nodes(nodes(), []),
            io:format("----> Remote registration times:~n"),
            io:format(" --> MIN: ~p secs.~n", [lists:min(RegRemoteNodesTimes)]),
            io:format(" --> MAX: ~p secs.~n", [lists:max(RegRemoteNodesTimes)]),
            %% timer:tc/3 returns microseconds
            {RegPropagationTimeMs, _} = timer:tc(?MODULE, wait_registry_propagation, [ProcessCount]),
            RegPropagationTime = RegPropagationTimeMs / 1000000,
            io:format("----> Additional time to propagate all to master: ~p secs.~n", [RegPropagationTime]),
            %% sum
            RegTakenTime = (lists:max(RegRemoteNodesTimes) + RegPropagationTime),
            RegistrationRate = ProcessCount / RegTakenTime,
            io:format("====> Registration rate (with propagation): ~p/sec.~n~n", [RegistrationRate]),
            %% start unregistration
            lists:foreach(fun({Node, FromName, ToName}) ->
                rpc:cast(Node, ?MODULE, unregister_on_node, [CollectorPid, WorkersPerNode, FromName, ToName])
            end, NodesInfo),
            %% wait
            UnregRemoteNodesTimes = wait_from_all_remote_nodes(nodes(), []),
            io:format("----> Remote unregistration times:~n"),
            io:format(" --> MIN: ~p secs.~n", [lists:min(UnregRemoteNodesTimes)]),
            io:format(" --> MAX: ~p secs.~n", [lists:max(UnregRemoteNodesTimes)]),
            {UnregPropagationTimeMs, _} = timer:tc(?MODULE, wait_registry_propagation, [0]),
            UnregPropagationTime = UnregPropagationTimeMs / 1000000,
            io:format("----> Additional time to propagate all to master: ~p secs.~n", [UnregPropagationTime]),
            %% sum
            UnregTakenTime = (lists:max(UnregRemoteNodesTimes) + UnregPropagationTime),
            UnregistrationRate = ProcessCount / UnregTakenTime,
            io:format("====> Unregistration rate (with propagation): ~p/sec.~n~n", [UnregistrationRate]),
            %% start re-registration
            lists:foreach(fun({Node, FromName, _ToName}) ->
                Pids = maps:get(Node, PidsMap),
                rpc:cast(Node, ?MODULE, register_on_node, [CollectorPid, WorkersPerNode, FromName, Pids])
            end, NodesInfo),
            %% wait
            ReRegRemoteNodesTimes = wait_from_all_remote_nodes(nodes(), []),
            io:format("----> Remote re-registration times:~n"),
            io:format(" --> MIN: ~p secs.~n", [lists:min(ReRegRemoteNodesTimes)]),
            io:format(" --> MAX: ~p secs.~n", [lists:max(ReRegRemoteNodesTimes)]),
            {ReRegPropagationTimeMs, _} = timer:tc(?MODULE, wait_registry_propagation, [ProcessCount]),
            ReRegPropagationTime = ReRegPropagationTimeMs / 1000000,
            io:format("----> Additional time to propagate all to master: ~p secs.~n", [ReRegPropagationTime]),
            %% sum
            ReRegTakenTime = (lists:max(ReRegRemoteNodesTimes) + ReRegPropagationTime),
            ReRegistrationRate = ProcessCount / ReRegTakenTime,
            io:format("====> Re-registration rate (with propagation): ~p/sec.~n~n", [ReRegistrationRate]),
            %% kill all processes
            maps:foreach(fun(_Node, Pids) ->
                lists:foreach(fun(Pid) -> exit(Pid, kill) end, Pids)
            end, PidsMap),
            %% wait all unregistered
            {RegKillPropagationTimeMs, _} = timer:tc(?MODULE, wait_registry_propagation, [0]),
            RegKillPropagationTime = RegKillPropagationTimeMs / 1000000,
            io:format("----> Time to propagate killed processes to master: ~p secs.~n", [RegKillPropagationTime]),
            RegKillRate = ProcessCount / RegKillPropagationTime,
            io:format("====> Unregistered after kill rate (with propagation): ~p/sec.~n~n", [RegKillRate]);
        true ->
            io:format("====> Skipping REGISTRY.~n")
    end,
    case SkipGroups of
        false ->
            %% start joining
            lists:foreach(fun({Node, _FromName, _ToName}) ->
                Pids = maps:get(Node, PidsMap),
                rpc:cast(Node, ?MODULE, join_on_node, [CollectorPid, WorkersPerNode, Pids])
            end, NodesInfo),
            %% wait
            JoinRemoteNodesTimes = wait_from_all_remote_nodes(nodes(), []),
            io:format("----> Remote join times:~n"),
            io:format(" --> MIN: ~p secs.~n", [lists:min(JoinRemoteNodesTimes)]),
            io:format(" --> MAX: ~p secs.~n", [lists:max(JoinRemoteNodesTimes)]),
            {JoinPropagationTimeMs, _} = timer:tc(?MODULE, wait_groups_propagation, [ProcessCount]),
            JoinPropagationTime = JoinPropagationTimeMs / 1000000,
            io:format("----> Additional time to propagate all to master: ~p secs.~n", [JoinPropagationTime]),
            %% sum
            JoinTakenTime = (lists:max(JoinRemoteNodesTimes) + JoinPropagationTime),
            JoinRate = ProcessCount / JoinTakenTime,
            io:format("====> Join rate (with propagation): ~p/sec.~n~n", [JoinRate]),
            %% start leaving
            lists:foreach(fun({Node, _FromName, _ToName}) ->
                Pids = maps:get(Node, PidsMap),
                rpc:cast(Node, ?MODULE, leave_on_node, [CollectorPid, WorkersPerNode, Pids])
            end, NodesInfo),
            %% wait
            LeaveRemoteNodesTimes = wait_from_all_remote_nodes(nodes(), []),
            io:format("----> Remote leave times:~n"),
            io:format(" --> MIN: ~p secs.~n", [lists:min(LeaveRemoteNodesTimes)]),
            io:format(" --> MAX: ~p secs.~n", [lists:max(LeaveRemoteNodesTimes)]),
            {LeavePropagationTimeMs, _} = timer:tc(?MODULE, wait_groups_propagation, [0]),
            LeavePropagationTime = LeavePropagationTimeMs / 1000000,
            io:format("----> Additional time to propagate all to master: ~p secs.~n", [LeavePropagationTime]),
            %% sum
            LeaveTakenTime = (lists:max(LeaveRemoteNodesTimes) + LeavePropagationTime),
            LeaveRate = ProcessCount / LeaveTakenTime,
            io:format("====> Leave rate (with propagation): ~p/sec.~n~n", [LeaveRate]),
            %% start re-joining
            lists:foreach(fun({Node, _FromName, _ToName}) ->
                Pids = maps:get(Node, PidsMap),
                rpc:cast(Node, ?MODULE, join_on_node, [CollectorPid, WorkersPerNode, Pids])
            end, NodesInfo),
            %% wait
            ReJoinRemoteNodesTimes = wait_from_all_remote_nodes(nodes(), []),
            io:format("----> Remote re-join times:~n"),
            io:format(" --> MIN: ~p secs.~n", [lists:min(ReJoinRemoteNodesTimes)]),
            io:format(" --> MAX: ~p secs.~n", [lists:max(ReJoinRemoteNodesTimes)]),
            {ReJoinPropagationTimeMs, _} = timer:tc(?MODULE, wait_groups_propagation, [ProcessCount]),
            ReJoinPropagationTime = ReJoinPropagationTimeMs / 1000000,
            io:format("----> Additional time to propagate all to master: ~p secs.~n", [ReJoinPropagationTime]),
            %% sum
            ReJoinTakenTime = (lists:max(ReJoinRemoteNodesTimes) + ReJoinPropagationTime),
            ReJoinRate = ProcessCount / ReJoinTakenTime,
            io:format("====> Re-join rate (with propagation): ~p/sec.~n~n", [ReJoinRate]),
            %% kill all processes
            maps:foreach(fun(_Node, Pids) ->
                lists:foreach(fun(Pid) -> exit(Pid, kill) end, Pids)
            end, PidsMap),
            %% wait until all have left the group
            {GroupsKillPropagationTimeMs, _} = timer:tc(?MODULE, wait_groups_propagation, [0]),
            GroupsKillPropagationTime = GroupsKillPropagationTimeMs / 1000000,
            io:format("----> Time to propagate killed processes to master: ~p secs.~n", [GroupsKillPropagationTime]),
            GroupsKillRate = ProcessCount / GroupsKillPropagationTime,
            io:format("====> Left after kill rate (with propagation): ~p/sec.~n~n", [GroupsKillRate]);
        true ->
            io:format("====> Skipping GROUPS.~n")
    end,
    %% stop node
    init:stop().
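
%% Each *_on_node function below runs on a slave node (invoked via rpc:cast from start/0):
%% it splits its share of names/pids across WorkersPerNode workers, each worker reports its
%% own wall-clock time to the local coordinator as {done, Time}, and the coordinator forwards
%% the slowest worker time to the master collector as {done, node(), Time}, which is consumed
%% by wait_from_all_remote_nodes/2.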
register_on_node(CollectorPid, WorkersPerNode, FromName, Pids) ->
    %% split pids among workers
    PidsPerNode = round(length(Pids) / WorkersPerNode),
    {WorkerInfo, []} = lists:foldl(fun(I, {WInfo, RPids}) ->
        {WorkerPids, RestOfPids} = case I of
            WorkersPerNode ->
                %% last in the loop, get remaining pids
                {RPids, []};
            _ ->
                %% get portion of pids
                lists:split(PidsPerNode, RPids)
        end,
        WorkerFromName = FromName + (PidsPerNode * (I - 1)),
        {[{WorkerFromName, WorkerPids} | WInfo], RestOfPids}
    end, {[], Pids}, lists:seq(1, WorkersPerNode)),
    %% spawn workers
    ReplyPid = self(),
    lists:foreach(fun({WorkerFromName, WorkerPids}) ->
        spawn(fun() ->
            StartAt = os:system_time(millisecond),
            worker_register_on_node(WorkerFromName, WorkerPids),
            Time = (os:system_time(millisecond) - StartAt) / 1000,
            ReplyPid ! {done, Time}
        end)
    end, WorkerInfo),
    %% wait
    Time = wait_done_on_node(CollectorPid, 0, WorkersPerNode),
    io:format("----> Registered on node ~p in ~p secs.~n", [node(), Time]).

worker_register_on_node(_Name, []) -> ok;
worker_register_on_node(Name, [Pid | PidsTail]) ->
    ok = syn:register(Name, Pid),
    worker_register_on_node(Name + 1, PidsTail).

unregister_on_node(CollectorPid, WorkersPerNode, FromName, ToName) ->
    %% split names among workers
    ProcessesPerNode = ToName - FromName + 1,
    ProcessesPerWorker = round(ProcessesPerNode / WorkersPerNode),
    WorkerInfo = lists:foldl(fun(I, Acc) ->
        {WorkerFromName, WorkerToName} = case I of
            WorkersPerNode ->
                %% last in the loop
                {FromName + (I - 1) * ProcessesPerWorker, ToName};
            _ ->
                {FromName + (I - 1) * ProcessesPerWorker, FromName + I * ProcessesPerWorker - 1}
        end,
        [{WorkerFromName, WorkerToName} | Acc]
    end, [], lists:seq(1, WorkersPerNode)),
    %% spawn workers
    ReplyPid = self(),
    lists:foreach(fun({WorkerFromName, WorkerToName}) ->
        spawn(fun() ->
            StartAt = os:system_time(millisecond),
            worker_unregister_on_node(WorkerFromName, WorkerToName),
            Time = (os:system_time(millisecond) - StartAt) / 1000,
            ReplyPid ! {done, Time}
        end)
    end, WorkerInfo),
    %% wait
    Time = wait_done_on_node(CollectorPid, 0, WorkersPerNode),
    io:format("----> Unregistered on node ~p in ~p secs.~n", [node(), Time]).

worker_unregister_on_node(FromName, ToName) when FromName > ToName -> ok;
worker_unregister_on_node(Name, ToName) ->
    ok = syn:unregister(Name),
    worker_unregister_on_node(Name + 1, ToName).

join_on_node(CollectorPid, WorkersPerNode, Pids) ->
    %% split pids among workers
    PidsPerNode = round(length(Pids) / WorkersPerNode),
    {PidsPerWorker, []} = lists:foldl(fun(I, {P, RPids}) ->
        {WPids, RestOfPids} = case I of
            WorkersPerNode ->
                %% last in the loop, get remaining pids
                {RPids, []};
            _ ->
                %% get portion of pids
                lists:split(PidsPerNode, RPids)
        end,
        {[WPids | P], RestOfPids}
    end, {[], Pids}, lists:seq(1, WorkersPerNode)),
    %% spawn workers
    ReplyPid = self(),
    lists:foreach(fun(WorkerPids) ->
        spawn(fun() ->
            StartAt = os:system_time(millisecond),
            worker_join_on_node(WorkerPids),
            Time = (os:system_time(millisecond) - StartAt) / 1000,
            ReplyPid ! {done, Time}
        end)
    end, PidsPerWorker),
    %% wait
    Time = wait_done_on_node(CollectorPid, 0, WorkersPerNode),
    io:format("----> Joined on node ~p in ~p secs.~n", [node(), Time]).

worker_join_on_node([]) -> ok;
worker_join_on_node([Pid | PidsTail]) ->
    ok = syn:join(?TEST_GROUP_NAME, Pid),
    worker_join_on_node(PidsTail).

leave_on_node(CollectorPid, WorkersPerNode, Pids) ->
    %% split pids among workers
    PidsPerNode = round(length(Pids) / WorkersPerNode),
    {PidsPerWorker, []} = lists:foldl(fun(I, {P, RPids}) ->
        {WPids, RestOfPids} = case I of
            WorkersPerNode ->
                %% last in the loop, get remaining pids
                {RPids, []};
            _ ->
                %% get portion of pids
                lists:split(PidsPerNode, RPids)
        end,
        {[WPids | P], RestOfPids}
    end, {[], Pids}, lists:seq(1, WorkersPerNode)),
    %% spawn workers
    ReplyPid = self(),
    lists:foreach(fun(WorkerPids) ->
        spawn(fun() ->
            StartAt = os:system_time(millisecond),
            worker_leave_on_node(WorkerPids),
            Time = (os:system_time(millisecond) - StartAt) / 1000,
            ReplyPid ! {done, Time}
        end)
    end, PidsPerWorker),
    %% wait
    Time = wait_done_on_node(CollectorPid, 0, WorkersPerNode),
    io:format("----> Left on node ~p in ~p secs.~n", [node(), Time]).

worker_leave_on_node([]) -> ok;
worker_leave_on_node([Pid | PidsTail]) ->
    ok = syn:leave(?TEST_GROUP_NAME, Pid),
    worker_leave_on_node(PidsTail).

wait_done_on_node(CollectorPid, Time, 0) ->
    CollectorPid ! {done, node(), Time},
    Time;
wait_done_on_node(CollectorPid, Time, WorkersRemainingCount) ->
    receive
        {done, WorkerTime} ->
            Time1 = lists:max([WorkerTime, Time]),
            wait_done_on_node(CollectorPid, Time1, WorkersRemainingCount - 1)
    end.

start_processes(Count) ->
    start_processes(Count, []).

start_processes(0, Pids) ->
    Pids;
start_processes(Count, Pids) ->
    Pid = spawn(fun process_loop/0),
    start_processes(Count - 1, [Pid | Pids]).

process_loop() ->
    receive
        _ -> ok
    end.

wait_from_all_remote_nodes([], Times) -> Times;
wait_from_all_remote_nodes([RemoteNode | Tail], Times) ->
    receive
        {done, RemoteNode, Time} ->
            wait_from_all_remote_nodes(Tail, [Time | Times])
    end.

wait_registry_propagation(DesiredCount) ->
    case syn:registry_count(default) of
        DesiredCount ->
            ok;
        _ ->
            timer:sleep(50),
            wait_registry_propagation(DesiredCount)
    end.

wait_groups_propagation(DesiredCount) ->
    case length(syn:members(?TEST_GROUP_NAME)) of
        DesiredCount ->
            ok;
        _ ->
            timer:sleep(50),
            wait_groups_propagation(DesiredCount)
    end.

start_profiling() ->
    {ok, P} = eprof:start(),
    eprof:start_profiling(erlang:processes() -- [P]).

stop_profiling() ->
    eprof:stop_profiling(),
    eprof:analyze(total).
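
%% Example of profiling a small registration burst from an Erlang shell, using only
%% functions from this module and the same syn calls it already makes. This is a
%% sketch, not part of the benchmark run; it assumes syn is already started on the
%% local node and that the integer names 1..1000 are free:
%%
%%   syn_benchmark:start_profiling(),
%%   Pids = syn_benchmark:start_processes(1000),
%%   lists:foreach(
%%       fun({Name, Pid}) -> ok = syn:register(Name, Pid) end,
%%       lists:zip(lists:seq(1, 1000), Pids)
%%   ),
%%   syn_benchmark:stop_profiling().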