问题的引起:
最近同事的一个项目遇到一个奇怪的问题,代码如下:
G节点上运行类似如下代码:[代码片段1]
%% Snippet 1 (runs on node G): 1000 sequential iterations, each sending a
%% notification to the globally registered role_manager and then doing a
%% synchronous create_role call to the globally registered account_server.
%% (RoleID / A are bound in the surrounding project code, not shown here.)
%% Fix vs. the original quote: the gen_server:call/2 was missing its
%% closing parenthesis.
[begin
    global:send(role_manager, {role_online, RoleID}),
    gen_server:call({global, account_server}, {create_role, A})
end || _ <- lists:seq(1, 1000)].
role_manager, account_server都跑在W节点上
role_manager收到{role_online, RoleID}消息的处理:[代码片段2]
start_child(role_sup, {role_server, {role_server, start_link, []},transient, 30000, worker, [role_server]})
role_server的start_link: [代码片段3]
gen_server:start_link({global, role_XXXX}, role_server, [], []).
代码段1是很简单的一段代码,但是非常奇怪的就是这段代码居然用了N秒,注意是秒!
翻了源码没问题(蛋疼,我直接翻看的是R15B01的源码),后来跟同事聊天忽然想起来R15更新日志中优化了关于safe_whereis_name的代码,赶快去看看R14B02的源码:
两个文件gen.erl(gen_server:call最终会走到这里)和global.erl
[gen.erl]:
%% Global by name
%% gen.erl (R14): the {global, Name} clause of call/4 first resolves the
%% name to a pid via where/1 -- which in R14 is the expensive
%% global:safe_whereis_name/1 path -- then performs the actual call.
call({global, _Name}=Process, Label, Request, Timeout)
when Timeout =:= infinity;
is_integer(Timeout), Timeout >= 0 ->
case where(Process) of
Pid when is_pid(Pid) ->
Node = node(Pid),
try do_call(Pid, Label, Request, Timeout)
catch
exit:{nodedown, Node} ->
%% A nodedown not yet detected by global,
%% pretend that it was.
exit(noproc)
end;
undefined ->
%% Name not registered: same observable failure as a dead process.
exit(noproc)
end;
global名字首先where查询PID,发现用的是global:safe_whereis_name(Name)
%% Name resolution: global names are looked up through the global name
%% server (safe_whereis_name/1, a serialized gen_server:call in R14),
%% local names use the plain whereis/1 BIF.
where({global, Name}) -> global:safe_whereis_name(Name);
where({local, Name}) -> whereis(Name).
跳转到[global.erl]:
-spec safe_whereis_name(term()) -> pid() | 'undefined'.
%% Synchronous lookup routed through the single global_name_server
%% process -- every lookup serializes here (this path was removed from
%% gen.erl in R15).
safe_whereis_name(Name) ->
gen_server:call(global_name_server, {whereis, Name}, infinity).
首先是call,但是这个肯定不是性能低下的关键,再往下看:
%% global_name_server callback: the reply is produced later by
%% do_whereis/2 via gen_server:reply/2, hence {noreply, S}.
handle_call({whereis, Name}, From, S) ->
do_whereis(Name, From),
{noreply, S};
%% Answer a pending {whereis, Name} request. If the global resource lock
%% is currently held, the reply is deferred: the request is re-queued via
%% send_again/1, which sleeps a random interval before retrying -- this
%% retry/sleep loop is the source of the multi-second latency described
%% above. Fix vs. the original quote: the C-style "//" annotation is not
%% valid Erlang comment syntax; replaced with a proper "%%" comment.
do_whereis(Name, From) ->
case is_global_lock_set() of
false ->
gen_server:reply(From, where(Name)); %% note: reply happens here
true ->
send_again({whereis, Name, From})
end.
好吧,有锁,还有sleep:
%% Re-deliver Msg to the calling server after a randomized delay; a
%% throwaway process does the sleeping so the server never blocks.
send_again(Msg) ->
Me = self(),
spawn(fun() -> timer(Me, Msg) end).
%% random_sleep(5) sleeps a random interval before forwarding the
%% message -- this sleep is where the observed seconds-long delay lives.
timer(Pid, Msg) ->
random_sleep(5),
Pid ! Msg.
到了这里已经搞明白问题了,还在用R15以下版本的同学们赶快升级吧!
接下来顺便理理global lock的逻辑:
%% True when the global resource lock is currently taken, checked via
%% membership in the global_locks ETS table.
is_global_lock_set() ->
is_lock_set(?GLOBAL_RID).
is_lock_set(ResourceId) ->
ets:member(global_locks, ResourceId).
看到了一个关键的表 global_locks,看看register_name的过程:
-spec register_name(term(), pid()) -> 'yes' | 'no'.
%% Register Name for Pid using the default name-clash resolver
%% (random_exit_name/3).
register_name(Name, Pid) when is_pid(Pid) ->
register_name(Name, Pid, fun random_exit_name/3).
-type method() :: fun((term(), pid(), pid()) -> pid() | 'none').
-spec register_name(term(), pid(), method()) -> 'yes' | 'no'.
%% Builds a closure that performs the registration on all known nodes;
%% the closure is run by the registrar helper process while the global
%% lock is held (see trans_all_known/1 below).
register_name(Name, Pid, Method) when is_pid(Pid) ->
Fun = fun(Nodes) ->
case (where(Name) =:= undefined) andalso check_dupname(Name, Pid) of
true ->
%% Broadcast the registration to every known node.
gen_server:multi_call(Nodes,
global_name_server,
{register, Name, Pid, Method}),
yes;
_ ->
no
end
end,
?trace({register_name, self(), Name, Pid, Method}),
gen_server:call(global_name_server, {registrar, Fun}, infinity).
%% The server delegates registration work to the dedicated registrar
%% process; the caller gets its reply later from that process (noreply).
handle_call({registrar, Fun}, From, S) ->
S#state.the_registrar ! {trans_all_known, Fun, From},
{noreply, S};
S#state.the_registrar是什么? 看注释:
%% The registrar is a helper process that registers and unregisters
%% names. Since it never dies it assures that names are registered and
%% unregistered on all known nodes. It is started by and linked to
%% global_name_server.
start_the_registrar() ->
spawn_link(fun() -> loop_the_registrar() end).
%% Registrar main loop: serializes all register/unregister requests,
%% running each Fun under the global lock via trans_all_known/1 and
%% replying to the original caller.
loop_the_registrar() ->
receive
{trans_all_known, Fun, From} ->
?trace({loop_the_registrar, self(), Fun, From}),
gen_server:reply(From, trans_all_known(Fun));
Other ->
%% Drain unexpected messages so the mailbox cannot grow unbounded.
unexpected_message(Other, register)
end,
loop_the_registrar().
围观trans_all_known(Fun):
%% Run Fun(Nodes) while holding the global resource lock on all known
%% nodes; try...after guarantees the lock is released even if Fun crashes.
trans_all_known(Fun) ->
Id = {?GLOBAL_RID, self()},
Nodes = set_lock_known(Id, 0),
try
Fun(Nodes)
after
delete_global_lock(Id, Nodes)
end.
%% Acquire the global lock on every known node. A single "boss" node is
%% locked first as an arbiter; on any failure the attempt backs off with
%% random_sleep(Times) (longer for larger Times) and recurses to retry.
set_lock_known(Id, Times) ->
Known = get_known(),
Nodes = [node() | Known],
Boss = the_boss(Nodes),
%% Use the same convention (a boss) as lock_nodes_safely. Optimization.
case set_lock_on_nodes(Id, [Boss]) of
true ->
case lock_on_known_nodes(Id, Known, Nodes) of
true ->
Nodes;
false ->
%% The known-node set changed while locking: release the
%% boss lock, back off, and start over.
del_lock(Id, [Boss]),
random_sleep(Times),
set_lock_known(Id, Times+1)
end;
false ->
%% Boss lock unavailable: back off and retry.
random_sleep(Times),
set_lock_known(Id, Times+1)
end.
%% Lock every node in Nodes; succeed only if the set of known nodes did
%% not grow while the locks were being taken. Fix vs. the original
%% quote: the list-subtraction operator "--" had been garbled into an
%% em-dash by the copy/paste.
lock_on_known_nodes(Id, Known, Nodes) ->
case set_lock_on_nodes(Id, Nodes) of
true ->
%% No node may have joined since Known was sampled.
(get_known() -- Known) =:= [];
false ->
false
end.
%% Take the lock on every node in Nodes with a multi_call broadcast; an
%% empty node list trivially succeeds. The local probe short-circuits
%% the broadcast when the lock is already taken on this node.
set_lock_on_nodes(_Id, []) ->
true;
set_lock_on_nodes(Id, Nodes) ->
case local_lock_check(Id, Nodes) of
true ->
Msg = {set_lock, Id},
{Replies, _} =
gen_server:multi_call(Nodes, global_name_server, Msg),
?trace({set_lock,{me,self()},Id,{nodes,Nodes},{replies,Replies}}),
check_replies(Replies, Id, Replies);
false=Reply ->
Reply
end.
%% Probe lock on local node to see if one should go on trying other nodes.
local_lock_check(_Id, [_] = _Nodes) ->
true;
local_lock_check(Id, Nodes) ->
not lists:member(node(), Nodes) orelse (can_set_lock(Id) =/= false).
中间通过boss方法获得一个裁决node,先在boss node上获得锁,成功则锁住其他节点,失败则timer sleep;很多同学看到sleep可能会下意识的觉得蛋疼,但是实际上这是erlang天生分布式的特性决定,由于sleep的存在才会有自动重试,上层才会通常不用关心分布式的细节。继续走:
%% Server-side handler for {set_lock, Id}: grants the lock when the
%% resource is free or already held by the same requester (re-entrant
%% per requesting Pid).
handle_set_lock(Id, Pid, S) ->
?trace({handle_set_lock, Id, Pid}),
case can_set_lock(Id) of
{true, PidRefs} ->
case pid_is_locking(Pid, PidRefs) of
true ->
%% This Pid already holds the lock -- nothing to record.
{true, S};
false ->
{true, insert_lock(Id, Pid, PidRefs, S)}
end;
false=Reply ->
{Reply, S}
end.
%% The lock can be granted when the resource is unlocked, or when it is
%% already locked by the same requester (matching LockRequesterId).
can_set_lock({ResourceId, LockRequesterId}) ->
case ets:lookup(global_locks, ResourceId) of
[{ResourceId, LockRequesterId, PidRefs}] ->
{true, PidRefs};
[{ResourceId, _LockRequesterId2, _PidRefs}] ->
%% Held by a different requester.
false;
[] ->
{true, []}
end.
%% Record the granted lock in the global_locks ETS table and monitor the
%% holder so the lock can be reclaimed if the holder dies.
insert_lock({ResourceId, LockRequesterId}=Id, Pid, PidRefs, S) ->
{RPid, Ref} = do_monitor(Pid),
true = ets:insert(global_pid_ids, {Pid, ResourceId}),
true = ets:insert(global_pid_ids, {Ref, ResourceId}),
Lock = {ResourceId, LockRequesterId, [{Pid,RPid,Ref} | PidRefs]},
true = ets:insert(global_locks, Lock),
trace_message(S, {ins_lock, node(Pid)}, [Id, Pid]).
我们看到了其实就是用protected ets来实现的一个内存锁。
总的来说global的实现是比较简单的,是erlang dist的基础物件,中间的一些细节也是非常不错、非常巧妙的,不过也看到了中间存在的巨大的损耗和隐患(call/sleep),因此global不要滥用,适度就好。
网友评论已有0条评论, 我也要评论