syn_netsplits.erl 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. %% ==========================================================================================================
  2. %% Syn - A global process registry.
  3. %%
  4. %% Copyright (C) 2015, Roberto Ostinelli <roberto@ostinelli.net>.
  5. %% All rights reserved.
  6. %%
  7. %% The MIT License (MIT)
  8. %%
  9. %% Copyright (c) 2015 Roberto Ostinelli
  10. %%
  11. %% Permission is hereby granted, free of charge, to any person obtaining a copy
  12. %% of this software and associated documentation files (the "Software"), to deal
  13. %% in the Software without restriction, including without limitation the rights
  14. %% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  15. %% copies of the Software, and to permit persons to whom the Software is
  16. %% furnished to do so, subject to the following conditions:
  17. %%
  18. %% The above copyright notice and this permission notice shall be included in
  19. %% all copies or substantial portions of the Software.
  20. %%
  21. %% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  22. %% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  23. %% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  24. %% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  25. %% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  26. %% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  27. %% THE SOFTWARE.
  28. %% ==========================================================================================================
  29. -module(syn_netsplits).
  30. -behaviour(gen_server).
  31. %% API
  32. -export([start_link/0]).
  33. %% gen_server callbacks
  34. -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]).
  35. %% internal
  36. -export([get_processes_info_of_node/1]).
  37. -export([write_processes_info_to_node/2]).
  38. %% records
  39. -record(state, {}).
  40. %% include
  41. -include("syn.hrl").
  42. %% ===================================================================
  43. %% API
  44. %% ===================================================================
  %% Starts the netsplit handler as a gen_server registered locally under the
  %% module name, linked to the calling process. No init arguments and no
  %% gen_server options are passed.
  -spec start_link() -> {ok, pid()} | {error, any()}.
  start_link() ->
      gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
  49. %% ===================================================================
  50. %% Callbacks
  51. %% ===================================================================
  52. %% ----------------------------------------------------------------------------------------------------------
  53. %% Init
  54. %% ----------------------------------------------------------------------------------------------------------
  %% gen_server init callback: enables exit trapping and subscribes this
  %% process to mnesia system events, which arrive later through handle_info/2
  %% as {mnesia_system_event, Event} messages.
  -spec init([]) ->
      {ok, #state{}} |
      {ok, #state{}, Timeout :: non_neg_integer()} |
      ignore |
      {stop, Reason :: any()}.
  init([]) ->
      %% exit signals from linked processes are turned into messages
      process_flag(trap_exit, true),
      %% NOTE(review): the result of the subscription is not checked here;
      %% confirm that mnesia is always running before this server starts.
      mnesia:subscribe(system),
      InitialState = #state{},
      {ok, InitialState}.
  66. %% ----------------------------------------------------------------------------------------------------------
  67. %% Call messages
  68. %% ----------------------------------------------------------------------------------------------------------
  %% gen_server call callback. This server defines no synchronous API, so
  %% every call is logged as unknown and answered with `undefined`; the state
  %% is returned unchanged.
  -spec handle_call(Request :: any(), From :: any(), #state{}) ->
      {reply, Reply :: any(), #state{}} |
      {reply, Reply :: any(), #state{}, Timeout :: non_neg_integer()} |
      {noreply, #state{}} |
      {noreply, #state{}, Timeout :: non_neg_integer()} |
      {stop, Reason :: any(), Reply :: any(), #state{}} |
      {stop, Reason :: any(), #state{}}.
  handle_call(Request, From, State) ->
      %% argument order follows the format string: sender first, then message
      error_logger:warning_msg("Received from ~p an unknown call message: ~p~n", [From, Request]),
      {reply, undefined, State}.
  79. %% ----------------------------------------------------------------------------------------------------------
  80. %% Cast messages
  81. %% ----------------------------------------------------------------------------------------------------------
  %% gen_server cast callback. No casts are part of this server's protocol:
  %% whatever arrives is logged as unknown and the state is kept as is.
  -spec handle_cast(Msg :: any(), #state{}) ->
      {noreply, #state{}} |
      {noreply, #state{}, Timeout :: non_neg_integer()} |
      {stop, Reason :: any(), #state{}}.
  handle_cast(UnexpectedMsg, State) ->
      error_logger:warning_msg("Received an unknown cast message: ~p~n", [UnexpectedMsg]),
      {noreply, State}.
  89. %% ----------------------------------------------------------------------------------------------------------
  90. %% All non Call / Cast messages
  91. %% ----------------------------------------------------------------------------------------------------------
  %% gen_server info callback. Mnesia system events are dispatched to
  %% handle_mnesia_system_event/1; any other message is logged as unknown.
  %% The state is never modified.
  -spec handle_info(Info :: any(), #state{}) ->
      {noreply, #state{}} |
      {noreply, #state{}, Timeout :: non_neg_integer()} |
      {stop, Reason :: any(), #state{}}.
  handle_info({mnesia_system_event, MnesiaEvent}, State) ->
      handle_mnesia_system_event(MnesiaEvent),
      {noreply, State};
  handle_info(UnknownInfo, State) ->
      error_logger:warning_msg("Received an unknown info message: ~p~n", [UnknownInfo]),
      {noreply, State}.

  %% Reacts to a single mnesia system event:
  %%  - inconsistent_database: log and run the automerge against the node;
  %%  - mnesia_down of another node: log and drop that node's entries;
  %%  - anything else (including mnesia_down of the local node): ignored.
  -spec handle_mnesia_system_event(MnesiaEvent :: any()) -> any().
  handle_mnesia_system_event({inconsistent_database, Context, Node}) ->
      error_logger:warning_msg("MNESIA signalled an inconsistent database on node: ~p with context: ~p, initiating automerge~n", [Node, Context]),
      automerge(Node);
  handle_mnesia_system_event({mnesia_down, Node}) when Node =/= node() ->
      error_logger:warning_msg("Received a MNESIA down event, removing all pids of node ~p~n", [Node]),
      delete_pids_of_disconnected_node(Node);
  handle_mnesia_system_event(_IgnoredEvent) ->
      ok.
  110. %% ----------------------------------------------------------------------------------------------------------
  111. %% Terminate
  112. %% ----------------------------------------------------------------------------------------------------------
  %% gen_server terminate callback: logs the shutdown reason and returns the
  %% atom `terminated`.
  -spec terminate(Reason :: any(), #state{}) -> terminated.
  terminate(ShutdownReason, _State) ->
      error_logger:info_msg("Terminating syn with reason: ~p~n", [ShutdownReason]),
      terminated.
  117. %% ----------------------------------------------------------------------------------------------------------
  118. %% Convert process state when code is changed.
  119. %% ----------------------------------------------------------------------------------------------------------
  %% gen_server code_change callback: the state needs no conversion, so it is
  %% handed back untouched.
  -spec code_change(OldVsn :: any(), #state{}, Extra :: any()) -> {ok, #state{}}.
  code_change(_PreviousVsn, CurrentState, _Extra) ->
      {ok, CurrentState}.
  123. %% ===================================================================
  124. %% Internal
  125. %% ===================================================================
  %% Removes every syn_processes_table entry whose node field equals Node.
  %% The work runs in a separate, unlinked process so the gen_server is not
  %% blocked while the table is scanned; the pid of that process is returned.
  -spec delete_pids_of_disconnected_node(Node :: atom()) -> pid().
  delete_pids_of_disconnected_node(Node) ->
      spawn(fun() ->
          %% select the keys of all entries registered on Node
          Pattern = #syn_processes_table{key = '$1', node = '$2', _ = '_'},
          MatchSpec = [{Pattern, [{'=:=', '$2', Node}], ['$1']}],
          Keys = mnesia:dirty_select(syn_processes_table, MatchSpec),
          %% and delete them one by one
          lists:foreach(
              fun(Key) -> mnesia:dirty_delete({syn_processes_table, Key}) end,
              Keys
          )
      end).
  %% Runs the merge with RemoteNode inside a global:trans/2 transaction keyed
  %% on {?MODULE, automerge}, with this process as the lock requester. The
  %% start and the end of the merge are logged from inside the transaction.
  -spec automerge(RemoteNode :: atom()) -> ok.
  automerge(RemoteNode) ->
      LockId = {{?MODULE, automerge}, self()},
      MergeUnderLock = fun() ->
          error_logger:warning_msg("AUTOMERGE starting for remote node ~s (global lock is set)~n", [RemoteNode]),
          check_stitch(RemoteNode),
          error_logger:warning_msg("AUTOMERGE done (global lock will be unset)~n")
      end,
      global:trans(LockId, MergeUnderLock).
  %% Stitches RemoteNode back in unless mnesia already lists it among its
  %% running db nodes, in which case there is nothing to do. Always returns ok.
  -spec check_stitch(RemoteNode :: atom()) -> ok.
  check_stitch(RemoteNode) ->
      RunningNodes = mnesia:system_info(running_db_nodes),
      case lists:member(RemoteNode, RunningNodes) of
          false ->
              %% the result of the stitch itself is deliberately not returned
              stitch(RemoteNode),
              ok;
          true ->
              ok
      end.
  %% Reconnects mnesia to RemoteNode through mnesia_controller:connect_nodes/2.
  %% The callback receives a merge function, applies it to syn_processes_table
  %% and, only when that reports {merged, _, _}, reconciles the table contents
  %% with stitch_tab/1. The merge function's result is returned unchanged.
  -spec stitch(RemoteNode :: atom()) -> {'ok', any()} | {'error', any()}.
  stitch(RemoteNode) ->
      OnConnect = fun(MergeF) ->
          MergeResult = MergeF([syn_processes_table]),
          case MergeResult of
              {merged, _, _} ->
                  stitch_tab(RemoteNode),
                  MergeResult;
              _NotMerged ->
                  MergeResult
          end
      end,
      mnesia_controller:connect_nodes([RemoteNode], OnConnect).
  %% Reconciles the process entries of the local node and of RemoteNode:
  %% fetches both sides' {Key, Pid} lists, resolves keys present on both sides
  %% (the local process loses), then writes each side's entries to the other.
  -spec stitch_tab(RemoteNode :: atom()) -> ok.
  stitch_tab(RemoteNode) ->
      %% entries as seen by the remote node, then entries of this node
      RemoteInfo = rpc:call(RemoteNode, ?MODULE, get_processes_info_of_node, [RemoteNode]),
      LocalInfo = get_processes_info_of_node(node()),
      %% drop (and kill) local processes whose key is also registered remotely
      {LocalInfoNoDoubles, RemoteInfoNoDoubles} =
          purge_double_processes_from_local_node(LocalInfo, RemoteInfo),
      %% cross-write the two sets
      write_remote_processes_to_local(RemoteNode, RemoteInfoNoDoubles),
      write_local_processes_to_remote(RemoteNode, LocalInfoNoDoubles).
  %% Resolves keys that are registered on both sides of a netsplit.
  %%
  %% LocalProcessesInfo and RemoteProcessesInfo are lists of {Key, Pid} tuples.
  %% For every remote key that also exists locally, the local entry is deleted
  %% from syn_processes_table and the local process is killed. Returns the
  %% local list without those doubles together with the untouched remote list.
  %%
  %% A temporary ETS set is used as the lookup index; it is deleted in an
  %% `after` clause so that it is released even if an exception is raised
  %% while the lists are processed.
  -spec purge_double_processes_from_local_node(LocalProcessesInfo :: list(), RemoteProcessesInfo :: list()) ->
      {LocalProcessesInfo :: list(), RemoteProcessesInfo :: list()}.
  purge_double_processes_from_local_node(LocalProcessesInfo, RemoteProcessesInfo) ->
      %% create ETS table
      Tab = ets:new(syn_automerge_doubles_table, [set]),
      try
          %% insert local processes info
          ets:insert(Tab, LocalProcessesInfo),
          %% find doubles
          F = fun({Key, _RemoteProcessPid}) ->
              case ets:lookup(Tab, Key) of
                  [] -> ok;
                  [{Key, LocalProcessPid}] ->
                      error_logger:warning_msg("Found a double process for ~s, killing it on local node~n", [Key]),
                      %% remove it from local mnesia table
                      mnesia:dirty_delete(syn_processes_table, Key),
                      %% remove it from ETS
                      ets:delete(Tab, Key),
                      %% kill the process
                      exit(LocalProcessPid, kill)
              end
          end,
          lists:foreach(F, RemoteProcessesInfo),
          %% what is left in the table are the local processes without doubles
          {ets:tab2list(Tab), RemoteProcessesInfo}
      after
          %% always release the temporary table
          ets:delete(Tab)
      end.
  %% Writes the {Key, Pid} entries of RemoteNode into the table from the local
  %% node, tagging each entry with RemoteNode as its owner.
  -spec write_remote_processes_to_local(RemoteNode :: atom(), RemoteProcessesInfo :: list()) -> ok.
  write_remote_processes_to_local(RemoteNode, RemoteProcessesInfo) ->
      OwnerNode = RemoteNode,
      write_processes_info_to_node(OwnerNode, RemoteProcessesInfo).
  %% Asks RemoteNode, via rpc, to write this node's {Key, Pid} entries tagged
  %% with the local node name. Anything other than `ok` coming back from the
  %% rpc (for instance a badrpc tuple) fails the match and crashes the caller.
  -spec write_local_processes_to_remote(RemoteNode :: atom(), LocalProcessesInfo :: list()) -> ok.
  write_local_processes_to_remote(RemoteNode, LocalProcessesInfo) ->
      RemoteArgs = [node(), LocalProcessesInfo],
      ok = rpc:call(RemoteNode, ?MODULE, write_processes_info_to_node, RemoteArgs).
  %% Returns the {Key, Pid} pairs of all syn_processes_table entries whose node
  %% field equals Node, read with a dirty select.
  -spec get_processes_info_of_node(Node :: atom()) -> list().
  get_processes_info_of_node(Node) ->
      %% build match specs; `_ = '_'` wildcards any record field not listed
      %% here, matching the head used in delete_pids_of_disconnected_node/1,
      %% so unlisted fields are not pinned to their default values
      MatchHead = #syn_processes_table{key = '$1', pid = '$2', node = '$3', _ = '_'},
      Guard = {'=:=', '$3', Node},
      %% double braces: a match spec body needs them to build a tuple
      ProcessInfoFormat = {{'$1', '$2'}},
      %% select
      mnesia:dirty_select(syn_processes_table, [{MatchHead, [Guard], [ProcessInfoFormat]}]).
  %% Dirty-writes one syn_processes_table record per {Key, Pid} pair in
  %% ProcessesInfo, each tagged with Node as its owning node. Exported because
  %% it is also invoked on other nodes through rpc.
  -spec write_processes_info_to_node(Node :: atom(), ProcessesInfo :: list()) -> ok.
  write_processes_info_to_node(Node, ProcessesInfo) ->
      ToRecord = fun({Key, ProcessPid}) ->
          #syn_processes_table{key = Key, pid = ProcessPid, node = Node}
      end,
      lists:foreach(
          fun(ProcessInfo) -> mnesia:dirty_write(ToRecord(ProcessInfo)) end,
          ProcessesInfo
      ).
  231. lists:foreach(FWrite, ProcessesInfo).