聚合国内IT技术精华文章,分享IT技术精华,帮助IT从业人士成长

Erlang R14以及之前版本的global模块的隐患

2013-03-14 21:16 浏览: 2736896 次 我要评论(0 条) 字号:

问题的引起:

最近同事的一个项目遇到一个奇怪的问题,代码如下:

 

G节点上运行类似如下代码:[代码片段1]

%% Illustrative snippet from the article: 1000 iterations, each sending an
%% asynchronous global:send/2 followed by a synchronous gen_server:call/2
%% to a globally registered server.
%% NOTE(review): RoleID and A are not bound in this snippet; presumably they
%% come from the surrounding (omitted) code -- confirm.
[begin
global:send(role_manager, {role_online, RoleID}),
gen_server:call({global, account_server}, {create_role, A})

end || _ <- lists:seq(1, 1000)].

 

role_manager, account_server都跑在W节点上

role_manager收到{role_online, RoleID}消息的处理:[代码片段2]

start_child(role_sup, {role_server, {role_server, start_link, []},transient, 30000, worker, [role_server]})

 

role_server:start_link: [代码片段3]

gen_server:start_link({global, role_XXXX}, role_server, [], []).

 

代码片段1是很简单的一段代码,但非常奇怪的是,这段代码居然用了N秒,注意是秒!

 

翻了源码没问题(蛋疼,我直接翻看的是R15B01的源码),后来跟同事聊天忽然想起来R15更新日志中优化了关于safe_whereis_name的代码,赶快去看看R14B02的源码:

两个文件gen.erl(gen_server:call最终会走到这里)global.erl

 

[gen.erl]

 

%% Global by name
call({global, _Name}=Process, Label, Request, Timeout)
when Timeout =:= infinity;
is_integer(Timeout), Timeout >= 0 ->
case where(Process) of
Pid when is_pid(Pid) ->
Node = node(Pid),
try do_call(Pid, Label, Request, Timeout)
catch
exit:{nodedown, Node} ->
%% A nodedown not yet detected by global,
%% pretend that it was.
exit(noproc)
end;
undefined
->
exit(noproc)
end;

 

对于global名字,首先通过where查询PID,发现用的是global:safe_whereis_name(Name)

%% Resolves a process designator to a pid.
%% {global, Name} is looked up via global:safe_whereis_name/1;
%% {local, Name} is looked up with whereis/1.
where({global, Name}) -> global:safe_whereis_name(Name);                                                                                                 

where({local, Name})  -> whereis(Name).

跳转到[global.erl]:

%% Looks up Name by issuing a synchronous {whereis, Name} call to the
%% process registered as global_name_server, waiting without a timeout.
-spec safe_whereis_name(term()) -> pid() | 'undefined'.                                                                                                  

safe_whereis_name(Name) ->

gen_server:call(global_name_server, {whereis, Name}, infinity).

首先是call,但是这个肯定不是性能低下的关键,再往下看:

handle_call({whereis, Name}, From, S) ->
do_whereis(
Name, From),
{noreply, S};

%% Answers a {whereis, Name} request for the caller From.
%% If the global lock is not set, the reply is sent immediately with the
%% result of where(Name). If the lock is set, the request is re-queued via
%% send_again/1 instead of being answered now.
do_whereis(Name, From) ->
    case is_global_lock_set() of
        false ->
            %% Note this line (highlighted by the article): the reply is
            %% only sent on this branch, i.e. when no global lock is held.
            gen_server:reply(From, where(Name));
        true ->
            send_again({whereis, Name, From})
    end.

 

好吧,有锁,还有sleep

%% Re-queues Msg for the calling process: captures self() and spawns a
%% helper process that runs timer/2 with that pid and Msg.
send_again(Msg) ->
Me = self(),
spawn(fun() -> timer(Me, Msg) end).

%% Calls random_sleep(5) and then sends Msg to Pid.
%% NOTE(review): random_sleep/1 is not shown here; presumably a randomized
%% delay -- confirm against global.erl.
timer(Pid, Msg) ->
random_sleep(
5),
Pid ! Msg.

 

到了这里已经搞明白问题了,还在用R15以下版本的同学们赶快升级吧!

 

接下来顺便理理global lock的逻辑:

 

%% Reports whether a lock on the ?GLOBAL_RID resource is currently set,
%% by delegating to is_lock_set/1.
is_global_lock_set() ->
is_lock_set(
?GLOBAL_RID).

%% Returns true when the global_locks ETS table has an entry keyed by
%% ResourceId, false otherwise.
is_lock_set(ResourceId) ->
ets:member(global_locks, ResourceId).

 

看到了一个关键的表 global_locks,看看register_name的过程:

 

%% Registers Name for Pid by calling register_name/3 with
%% random_exit_name/3 as the Method argument.
-spec register_name(term(), pid()) -> 'yes' | 'no'.

 register_name(Name, Pid) when is_pid(Pid) ->

     register_name(Name, Pid, fun random_exit_name/3).

 

%% A fun that takes a term and two pids and returns a pid or 'none'.
%% NOTE(review): presumably the name-clash resolver passed to
%% register_name/3 -- confirm against the global module documentation.
-type method() :: fun((term(), pid(), pid()) -> pid() | 'none').

 

%% Registers Name for Pid. The check-and-register step is wrapped in a fun
%% and handed to global_name_server with a {registrar, Fun} call that waits
%% without a timeout; the call's result is returned to the caller.
-spec register_name(term(), pid(), method()) -> 'yes' | 'no'.

 register_name(Name, Pid, Method) when is_pid(Pid) ->

     Fun = fun(Nodes) ->

         %% Proceed only if where(Name) is undefined and check_dupname/2
         %% returns true; any other outcome yields 'no'.
         case (where(Name) =:= undefined) andalso check_dupname(Name, Pid) of

             true ->

                 %% Send the register request to global_name_server on every
                 %% node in Nodes; the multi_call result itself is not inspected.
                 gen_server:multi_call(Nodes,

                                       global_name_server,

                                       {register, Name, Pid, Method}),

                 yes;

             _ ->

                 no

         end

     end,

     ?trace({register_name, self(), Name, Pid, Method}),                                                                                                   

gen_server:call(global_name_server, {registrar, Fun}, infinity).

 

handle_call({registrar, Fun}, From, S) ->                                                                                                                

    S#state.the_registrar ! {trans_all_known, Fun, From},

 {noreply, S};

 

S#state.the_registrar是什么? 看注释:

%% The registrar is a helper process that registers and unregisters

%% names. Since it never dies it assures that names are registered and

%% unregistered on all known nodes. It is started by and linked to

%% global_name_server.

%% Spawns a process linked to the caller that runs loop_the_registrar/0
%% and returns its pid.
start_the_registrar() ->

    spawn_link(fun() -> loop_the_registrar() end).

                      

%% Receive loop of the registrar process: handles one message per
%% iteration and then calls itself again.
loop_the_registrar() ->

    receive

        {trans_all_known, Fun, From} ->

            ?trace({loop_the_registrar, self(), Fun, From}),

            %% Run Fun through trans_all_known/1 and send its result to the
            %% caller identified by From.
            gen_server:reply(From, trans_all_known(Fun));

    %% Any other message is passed to unexpected_message/2.
    Other ->

            unexpected_message(Other, register)

    end,

loop_the_registrar().

 

围观trans_all_known(Fun)

 

%% Runs Fun(Nodes) while holding the lock identified by
%% {?GLOBAL_RID, self()}, where Nodes is the list returned by
%% set_lock_known/2. Returns the result of Fun.
trans_all_known(Fun) ->

    Id = {?GLOBAL_RID, self()},

    Nodes = set_lock_known(Id, 0),

    try

        Fun(Nodes)

    after

        %% Runs whether Fun returns or raises, so the lock is always released.
        delete_global_lock(Id, Nodes)

    end.

 

%% Acquires lock Id on the local node plus the nodes returned by
%% get_known(), and returns that node list once it succeeds.
%% The lock is first set on the single node chosen by the_boss/1, then on
%% all nodes. On failure the function calls random_sleep(Times) and recurses
%% with Times+1; there is no upper bound on the number of retries.
set_lock_known(Id, Times) ->

    Known = get_known(),

    Nodes = [node() | Known],

    Boss = the_boss(Nodes),

    %% Use the  same convention (a boss) as lock_nodes_safely. Optimization.

    case set_lock_on_nodes(Id, [Boss]) of

        true ->

            case lock_on_known_nodes(Id, Known, Nodes) of

                true ->

                    Nodes;

                false ->

                    %% Locking all nodes failed: release the boss lock
                    %% before sleeping and retrying.
                    del_lock(Id, [Boss]),

                    random_sleep(Times),

                    set_lock_known(Id, Times+1)

            end;

        false ->

            %% The boss lock could not be set: sleep and retry.
            random_sleep(Times),

            set_lock_known(Id, Times+1)

    end.

 

%% Tries to set lock Id on Nodes and then confirms that the set of known
%% nodes has not grown.
%% Returns true only if set_lock_on_nodes/2 succeeded and get_known()
%% contains no node that is absent from Known; otherwise returns false.
lock_on_known_nodes(Id, Known, Nodes) ->
    case set_lock_on_nodes(Id, Nodes) of
        true ->
            %% List subtraction: anything left over is a node that became
            %% known after Known was read by the caller.
            (get_known() -- Known) =:= [];
        false ->
            false
    end.

%% Sets lock Id on every node in Nodes. An empty node list succeeds
%% trivially. Otherwise, once local_lock_check/2 passes, a {set_lock, Id}
%% multi_call is sent to global_name_server on Nodes and the replies are
%% evaluated by check_replies/3.
set_lock_on_nodes(_Id, []) ->

    true;

set_lock_on_nodes(Id, Nodes) ->

    case local_lock_check(Id, Nodes) of

        true ->

            Msg = {set_lock, Id},

            %% The second element of the multi_call result is ignored here.
            {Replies, _} =

                gen_server:multi_call(Nodes, global_name_server, Msg),

            ?trace({set_lock,{me,self()},Id,{nodes,Nodes},{replies,Replies}}),

            check_replies(Replies, Id, Replies);

        %% The local probe failed: return false without contacting any node.
        false=Reply ->

            Reply

    end.

 

%% Probe lock on local node to see if one should go on trying other nodes.

%% A single-node list always passes the probe.
local_lock_check(_Id, [_] = _Nodes) ->

    true;

%% Passes when the local node is not in Nodes, or when can_set_lock/1 does
%% not return false for Id.
local_lock_check(Id, Nodes) ->

    not lists:member(node(), Nodes) orelse (can_set_lock(Id) =/= false).

 

中间通过boss方法获得一个裁决node,先在boss node上获得锁,成功则锁住其他节点,失败则random_sleep后重试;很多同学看到sleep可能会下意识地觉得蛋疼,但实际上这是由erlang天生分布式的特性决定的:正是由于sleep的存在才会有自动重试,上层通常才不用关心分布式的细节。继续走:

 

%% Handles a request to set lock Id on behalf of Pid and returns a
%% {Result, State} pair.
%% When can_set_lock/1 allows the lock: if Pid is already among the lock's
%% holders the state S is returned unchanged, otherwise the second element
%% is whatever insert_lock/4 returns. When the lock cannot be set the
%% result is {false, S}.
handle_set_lock(Id, Pid, S) ->

    ?trace({handle_set_lock, Id, Pid}),

    case can_set_lock(Id) of

        {true, PidRefs} ->

        case pid_is_locking(Pid, PidRefs) of

        true ->

                    {true, S};

        false ->

                    {true, insert_lock(Id, Pid, PidRefs, S)}

        end;

        false=Reply ->                                                                                                                                   

            {Reply, S}

end.

 

%% Checks the global_locks ETS table for ResourceId.
%% Returns {true, PidRefs} when the existing entry belongs to
%% LockRequesterId, {true, []} when there is no entry, and false when the
%% entry belongs to a different requester.
can_set_lock({ResourceId, LockRequesterId}) ->

    case ets:lookup(global_locks, ResourceId) of

    %% LockRequesterId is already bound, so this clause only matches an
    %% entry held by the same requester.
    [{ResourceId, LockRequesterId, PidRefs}] ->

            {true, PidRefs};

    [{ResourceId, _LockRequesterId2, _PidRefs}] ->

            false;

    [] ->

            {true, []}

    end.

 

%% Records Pid as a holder of the lock on ResourceId.
%% Monitors Pid via do_monitor/1, stores both Pid and the returned Ref
%% under ResourceId in the global_pid_ids table, and writes the lock entry
%% to global_locks with {Pid, RPid, Ref} prepended to PidRefs.
%% Returns the result of trace_message/3.
insert_lock({ResourceId, LockRequesterId}=Id, Pid, PidRefs, S) ->

    {RPid, Ref} = do_monitor(Pid),

    %% Each insert is matched against true, so a different return value
    %% raises badmatch.
    true = ets:insert(global_pid_ids, {Pid, ResourceId}),

    true = ets:insert(global_pid_ids, {Ref, ResourceId}),

    Lock = {ResourceId, LockRequesterId, [{Pid,RPid,Ref} | PidRefs]},

    true = ets:insert(global_locks, Lock),

    trace_message(S, {ins_lock, node(Pid)}, [Id, Pid]).

 

我们看到了其实就是用protected ets来实现的一个内存锁。

 

总的来说global的实现是比较简单的,是erlang dist的基础物件,中间的一些细节也是非常不错、非常巧妙的,不过也看到了中间存在的巨大的损耗和隐患(call/sleep),因此global不要滥用,适度就好。



网友评论已有0条评论, 我也要评论

发表评论

*

* (保密)

Ctrl+Enter 快捷回复