Просмотр исходного кода

Consider process being alive during process resolution.

Roberto Ostinelli 5 лет назад
Родитель
Сommit
3a8437e64d
2 измененных файлов с 78 добавлено и 13 удалено
  1. 47 11
      src/syn_registry.erl
  2. 31 2
      test/syn_registry_SUITE.erl

+ 47 - 11
src/syn_registry.erl

@@ -217,10 +217,12 @@ handle_cast({inconsistent_name_data, OriginatingNode, Name, RemotePid, RemoteMet
                     resolve_conflict(Name, {TablePid, TableMeta}, {RemotePid, RemoteMeta},
                         %% keep currently in table
                         fun() ->
+                            %% overwrite
                             ok = rpc:call(OriginatingNode, syn_registry, add_to_local_table, [Name, TablePid, TableMeta, undefined])
                         end,
                         %% keep remote
                         fun() ->
+                            %% overwrite
                             add_to_local_table(Name, RemotePid, RemoteMeta, undefined)
                         end,
                         State
@@ -290,7 +292,12 @@ handle_info({nodeup, RemoteNode}, State) ->
                 case find_process_entry_by_name(Name) of
                     undefined ->
                         %% no conflict
-                        register_on_node(Name, RemotePid, RemoteMeta);
+                        case rpc:call(node(RemotePid), erlang, is_process_alive, [RemotePid]) of
+                            true ->
+                                register_on_node(Name, RemotePid, RemoteMeta);
+                            _ ->
+                                ok
+                        end;
 
                     Entry ->
                         LocalPid = Entry#syn_registry_table.pid,
@@ -469,16 +476,40 @@ handle_process_down(Name, Pid, Meta, Reason, #state{
     KeepRemoteFun :: fun(),
     #state{}
 ) -> ok.
-resolve_conflict(Name, {TablePid, TableMeta}, {RemotePid, RemoteMeta}, KeepTableFun, KeepRemoteFun, #state{
-    custom_event_handler = CustomEventHandler
-}) ->
-    %% call conflict resolution
-    {PidToKeep, KillOther} = syn_event_handler:do_resolve_registry_conflict(
-        Name,
-        {TablePid, TableMeta},
-        {RemotePid, RemoteMeta},
-        CustomEventHandler
-    ),
+resolve_conflict(
+    Name,
+    {TablePid, TableMeta},
+    {RemotePid, RemoteMeta},
+    KeepTableFun,
+    KeepRemoteFun,
+    #state{custom_event_handler = CustomEventHandler}
+) ->
+    TablePidAlive = rpc:call(node(TablePid), erlang, is_process_alive, [TablePid]),
+    RemotePidAlive = rpc:call(node(RemotePid), erlang, is_process_alive, [RemotePid]),
+
+    %% check if pids are alive (race conditions if pid dies during resolution)
+    {PidToKeep, KillOther} = case {TablePidAlive, RemotePidAlive} of
+        {true, true} ->
+            %% call conflict resolution
+            syn_event_handler:do_resolve_registry_conflict(
+                Name,
+                {TablePid, TableMeta},
+                {RemotePid, RemoteMeta},
+                CustomEventHandler
+            );
+
+        {true, false} ->
+            %% keep only alive process
+            {TablePid, false};
+
+        {false, true} ->
+            %% keep only alive process
+            {RemotePidAlive, false};
+
+        {false, false} ->
+            %% remove both
+            {none, false}
+    end,
 
     %% keep chosen one
     case PidToKeep of
@@ -506,6 +537,11 @@ resolve_conflict(Name, {TablePid, TableMeta}, {RemotePid, RemoteMeta}, KeepTable
             end,
             KeepRemoteFun();
 
+        none ->
+            remove_from_local_table(Name),
+            RemoteNode = node(RemotePid),
+            ok = rpc:call(RemoteNode, syn_registry, remove_from_local_table, [Name]);
+
         Other ->
             error_logger:error_msg(
                 "Syn(~p): Custom handler returned ~p, valid options were ~p and ~p~n",

+ 31 - 2
test/syn_registry_SUITE.erl

@@ -45,7 +45,8 @@
 -export([
     two_nodes_register_monitor_and_unregister/1,
     two_nodes_registry_count/1,
-    two_nodes_registration_race_condition_conflict_resolution/1
+    two_nodes_registration_race_condition_conflict_resolution/1,
+    two_nodes_registration_race_condition_conflict_resolution_when_process_died/1
 ]).
 -export([
     three_nodes_partial_netsplit_consistency/1,
@@ -110,7 +111,8 @@ groups() ->
         {two_nodes_process_registration, [shuffle], [
             two_nodes_register_monitor_and_unregister,
             two_nodes_registry_count,
-            two_nodes_registration_race_condition_conflict_resolution
+            two_nodes_registration_race_condition_conflict_resolution,
+            two_nodes_registration_race_condition_conflict_resolution_when_process_died
         ]},
         {three_nodes_process_registration, [shuffle], [
             three_nodes_partial_netsplit_consistency,
@@ -514,6 +516,33 @@ two_nodes_registration_race_condition_conflict_resolution(Config) ->
             ok = no_process_is_registered_with_conflicting_name
     end.
 
+two_nodes_registration_race_condition_conflict_resolution_when_process_died(Config) ->
+    ConflictingName = "COMMON",
+    %% get slaves
+    SlaveNode = proplists:get_value(slave_node, Config),
+    %% use customer handler
+    syn_test_suite_helper:use_custom_handler(),
+    rpc:call(SlaveNode, syn_test_suite_helper, use_custom_handler, []),
+    %% start syn on nodes
+    ok = syn:start(),
+    ok = rpc:call(SlaveNode, syn, start, []),
+    timer:sleep(100),
+    %% start processes
+    Pid0 = syn_test_suite_helper:start_process(),
+    Pid1 = syn_test_suite_helper:start_process(SlaveNode),
+    %% inject into syn to simulate concurrent registration
+    syn_registry:add_to_local_table(ConflictingName, Pid0, keep_this_one, undefined),
+    %% kill process
+    syn_test_suite_helper:kill_process(Pid0),
+    %% register to trigger conflict resolution
+    ok = rpc:call(SlaveNode, syn, register, [ConflictingName, Pid1, SlaveNode]),
+    timer:sleep(1000),
+    %% check
+    {Pid1, SlaveNode} = syn:whereis(ConflictingName, with_meta),
+    {Pid1, SlaveNode} = rpc:call(SlaveNode, syn, whereis, [ConflictingName, with_meta]),
+    %% check that process is alive
+    true = rpc:call(SlaveNode, erlang, is_process_alive, [Pid1]).
+
 three_nodes_partial_netsplit_consistency(Config) ->
     %% get slaves
     SlaveNode1 = proplists:get_value(slave_node_1, Config),