function [W_hat,b_hat,PL,err_W,err_b] = MPL_Boltzmann_SESOP(S,W0,b0,deg,show_results,W,b)
%MPL_Boltzmann_SESOP - learning the Boltzmann parameters by MPL via SESOP
% MPL - Maximum Pseudo-Likelihood
% SESOP - Sequential Subspace Optimization
%
% [W_hat,b_hat] = MPL_Boltzmann_SESOP(S) approximates the MPL estimator of the 
% Boltzmann parameters using the SESOP-2 speed-up.
%
% [W_hat,b_hat] = MPL_Boltzmann_SESOP(S,W0,b0) specifies the initial values
% for the Boltzmann parameters.
% 
% [W_hat,b_hat] = MPL_Boltzmann_SESOP(S,W0,b0,deg) also specifies the number of recent 
% steps used in the SESOP speed-up. For deg=0 the algorithm is reduced to gradient 
% ascent. The default is deg=2. 
%
% [W_hat,b_hat] = MPL_Boltzmann_SESOP(S,W0,b0,deg,show_results) also specifies if 
% messages are printed during the process. The default is show_results=0.
%
% [W_hat,b_hat,PL] = MPL_Boltzmann_SESOP(...) also returns the values of the log-PL 
% function for each iteration.
%
% [W_hat,b_hat,PL,err_W,err_b] = MPL_Boltzmann_SESOP(S,W0,b0,deg,show_results,W,b) 
% also specifies the true values of the Boltzmann parameters and returns the estimation 
% errors with respect to them.  This format can be used only in synthetic experiments 
% where we have access to the true Boltzmann parameters. 
%
% Any optional argument may be passed as [] to request its default (for W and b, []
% means "not available", and the corresponding error output stays empty).
% =====================================================================================
% Input:
% S - a matrix of size m-by-N consisting of N sparsity patterns.
% W0,b0 - the initial values for the Boltzmann parameters: an interaction matrix of 
% size m-by-m and a bias vector of size m-by-1. Defaults: W0=zeros(m) and
% b0=atanh(mean(S,2)), with infinite entries replaced by +/-5.
% deg - the number of recent steps used by SESOP (default 2).
% show_results - flag for displaying results (default 0).
% W,b - the true values for the Boltzmann parameters.
% =====================================================================================
% Output:
% W_hat,b_hat - the recovered Boltzmann parameters
% PL - the values of the log-PL function (up to an additive constant) in each iteration
% err_W,err_b - estimation errors for the Boltzmann parameters with respect to their
% true values (Frobenius norm for W, Euclidean norm for b). Empty when the true
% values are not supplied.
% =====================================================================================
% Notes:
% - A waitbar figure is always shown while the optimization runs. It is closed on
%   return, including when an error interrupts the run, and the optimization keeps
%   going if the user closes it manually.
% - compute_PL_props is defined outside this file section. Judging by the calls made
%   here, its 4th argument selects what is returned (0: value only, 1: value and
%   gradient, 2: value, gradient and a Hessian restricted to the columns of Q), and
%   the value it returns is presumably the NEGATIVE log-PL, since this routine
%   minimizes it and flips the sign of PL before returning. TODO confirm.
% =====================================================================================
% Tomer Faktor
% Department of Electrical Engineering
% Technion, Haifa 32000 Israel
% tomerfa@tx.technion.ac.il
%
% August 2011
% =====================================================================================
[m,N]=size(S);
% Check input parameters ([] is treated the same as an omitted argument)
if nargin<2 || isempty(W0)
    W0=zeros(m);
end
if nargin<3 || isempty(b0)
    mean_S=mean(S,2);
    b0=atanh(mean_S); % the MPL estimator for W=0
    % atanh(+/-1) is infinite, so saturated rows get a finite bias instead
    b0(mean_S==-1)=-5;
    b0(mean_S==1)=5;
end
if nargin<4 || isempty(deg)
    deg=2;
end
if nargin<5 || isempty(show_results)
    show_results=0;
end
have_true_W = nargin>=6 && ~isempty(W);
have_true_b = nargin>=7 && ~isempty(b);
% Set parameters for MPL via SESOP run
lambda=N;                % multiple of the identity added to the subspace Hessian
u_iter=conv_Wb2u(W0,b0); % parameters stacked into a single vector
p=length(u_iter);
epsilon=10;              % stopping threshold for the inner Newton iterations
grad_thr=sqrt(p*N/50);   % stopping threshold on the gradient norm
max_iter=50;
max_newton_iter=10;      % cap on inner Newton iterations per outer iteration
% Candidate step-size multipliers tried along each Newton direction
t_vec=[0.05:0.05:0.5,0.6:0.1:1,1.25:0.25:2.5];
PL=zeros(1,max_iter+1);
% Compute log-PL value and gradient for initial values of the Boltzmann parameters
[PL(1),grad_u_iter] = compute_PL_props(S,W0,b0,1);
if show_results
    disp(['MPL optimization - initial Boltzmann parameters: norm of gradient vector=',...
        num2str(norm(grad_u_iter))]);
end
err_W=[];
err_b=[];
if have_true_W
    err_W(1)=norm(W0-W,'fro');
end
if have_true_b
    err_b(1)=norm(b0-b);
end
% Perform MPL via SESOP
num_iter=1;
u_prev=u_iter;      % starting point of the current outer iteration
u_prev_mat=u_iter;  % columns hold the most recent iterates (at most deg+1 of them)
W_hat=W0;
b_hat=b0;
L=1;                % number of search directions in the current subspace
if deg==0
    hh = waitbar(0,'Performing MPL estimation via GA');    
else
    hh = waitbar(0,['Performing MPL estimation via SESOP-',num2str(deg)]);
end
% Make sure the waitbar is closed even if an error interrupts the optimization
waitbar_guard = onCleanup(@() close_waitbar_if_open(hh)); %#ok<NASGU>
while norm(grad_u_iter)>grad_thr && num_iter<=max_iter
    if ishandle(hh) % the user may have closed the progress window
        waitbar(num_iter/max_iter,hh)
    end
    % Search subspace: current negative gradient plus differences of recent iterates
    if L==1
        Q=-grad_u_iter;
    else
        Q=[-grad_u_iter,u_prev_mat(:,2:L)-u_prev_mat(:,1:L-1)];
    end
    d=sqrt(diag(Q'*Q));
    if any(d<1e-5)
        % A (near-)zero direction cannot be normalized - stop the optimization
        break;
    end
    Q=Q*diag(1./d); % normalize directions
    v=zeros(L,1);   % coefficients of the step within the subspace spanned by Q
    k=0;
    while 1
        k=k+1;
        [PL(num_iter+1),grad_u_iter,hessian_mat]=compute_PL_props(S,W_hat,b_hat,2,Q);
        grad_vec=Q'*grad_u_iter;
        % Inner optimization stage - find step sizes using Newton iterations.
        % Solve the regularized Newton system once (backslash instead of inv) and
        % reuse the solution for both the stopping value and the step direction.
        newton_sol=(hessian_mat+lambda*eye(L))\grad_vec;
        stop_val=0.5*grad_vec'*newton_sol;
        if stop_val<epsilon || k==max_newton_iter
            % Accept the current point and keep only the most recent iterates
            u_prev_mat=[u_prev_mat,u_iter]; %#ok<AGROW>
            L=min(num_iter+1,deg+1);
            u_prev_mat=u_prev_mat(:,end-(L-1):end);
            u_prev=u_iter;
            break;
        end
        delta_v=-newton_sol;
        % Grid line search along the Newton direction: evaluate every candidate
        % multiplier and keep the one with the smallest objective value
        PL_iter_temp=zeros(1,length(t_vec));
        for i=1:length(t_vec)
            t=t_vec(i);
            v_temp=v+t*delta_v;
            u_temp=u_prev+Q*v_temp;
            [W_hat,b_hat]=conv_u2Wb(u_temp);
            PL_iter_temp(i)=compute_PL_props(S,W_hat,b_hat,0);
        end
        ind_min=find(PL_iter_temp==min(PL_iter_temp),1,'first');
        t=t_vec(ind_min);
        v=v+t*delta_v;
        u_iter=u_prev+Q*v;
        [W_hat,b_hat]=conv_u2Wb(u_iter);
    end
    if have_true_W
        err_W(num_iter+1)=norm(W_hat-W,'fro'); %#ok<AGROW>
    end
    if have_true_b
        err_b(num_iter+1)=norm(b_hat-b); %#ok<AGROW>
    end
    if show_results
        disp(['MPL optimization - iter. #',num2str(num_iter),': norm of gradient vector=',...
            num2str(norm(grad_u_iter))]);
    end
    num_iter=num_iter+1;
end
if ishandle(hh)
    waitbar(1,hh)
end
close_waitbar_if_open(hh);
% Keep only the computed entries and flip the sign of the stored values
PL=PL(1:num_iter);
PL=-PL;

%================================================
function close_waitbar_if_open(h)
% Close the waitbar figure h unless it has already been closed.

if ishandle(h)
    close(h);
end

return;

%================================================
function [W,b] = conv_u2Wb(u)
% conv_u2Wb - unpack a stacked parameter vector into (W,b).
% u holds the strictly-upper-triangular entries of W (in column-major order)
% followed by the m entries of b, so numel(u) = m*(m-1)/2 + m = m*(m+1)/2.
% W is returned symmetric with a zero diagonal.

n_params=numel(u);
% Solve m*(m+1)/2 = n_params for m
m=0.5*(sqrt(1+8*n_params)-1);
upper_idx=find(triu(ones(m),1));
W=zeros(m);
% Fill the upper triangle in one vectorized assignment, then mirror it
W(upper_idx)=u(1:numel(upper_idx));
W=W+W';
b=u(end-m+1:end);
%================================================
function u = conv_Wb2u(W,b)
% conv_Wb2u - stack (W,b) into a single parameter vector.
% u consists of the strictly-upper-triangular entries of W, taken in
% column-major order, followed by the entries of b. The diagonal and the
% lower triangle of W are not read.

n_units=numel(b);
upper_mask=triu(ones(n_units),1);
upper_idx=find(upper_mask);
u=[W(upper_idx(:));b];