mirror of https://gitcode.com/gh_mirrors/ope/OpenFace.git
Packaging CCNF code with OpenFace.
125
model_training/CCNF/CCNF/lib/CCNF_training_bfgs.m
Normal file
@@ -0,0 +1,125 @@
function [ alphas, betas, thetas, final_likelihood] = CCNF_training_bfgs(thresholdX, thresholdFun, x, y, alphas, betas, thetas, lambda_a, lambda_b, lambda_th, similarityFNs, sparsityFNs, varargin)
%CCNF_training_bfgs Performs CCNF training using BFGS (or LBFGS)

if(sum(strcmp(varargin,'const')))
    ind = find(strcmp(varargin,'const')) + 1;
    const = varargin{ind};
else
    const = false;
end

if(iscell(x))
    num_seqs = numel(x);

    x = cell2mat(x)';
    % add a bias term
    x = cat(1, ones(1,size(x,2)), x);

    % If all of the sequences are of the same length we can flatten them
    % into the same matrix
    if(const)
        y = cell2mat(y);
        y = reshape(y, numel(y)/num_seqs, num_seqs);
    end
else
    % if not a cell it has already been flattened, and is constant
    % (most likely)
    num_seqs = varargin{find(strcmp(varargin, 'num_seqs'))+1};
end

% Should we try a number of seeds for initialising theta?
if(sum(strcmp(varargin,'reinit')))
    ind = find(strcmp(varargin,'reinit')) + 1;
    reinit = varargin{ind};
else
    reinit = false;
end

% It is possible to predefine the components B^(k) and C^(k) required
% to compute the B and C terms and the partial derivatives (from equations
% 30 and 31 in Appendix B); yB^(k)y and yC^(k)y can also be predefined,
% as they do not change through the iterations.
% In the constant case Precalc_Bs are the same across the sequences, and so
% is PrecalcBsFlat; however, yB^(k)y is defined per sequence.
if(sum(strcmp(varargin,'PrecalcBs')) && sum(strcmp(varargin,'PrecalcBsFlat'))...
        && sum(strcmp(varargin,'Precalc_yBys')))
    ind = find(strcmp(varargin,'PrecalcBs')) + 1;
    Precalc_Bs = varargin{ind};

    ind = find(strcmp(varargin,'PrecalcBsFlat')) + 1;
    Precalc_Bs_flat = varargin{ind};

    ind = find(strcmp(varargin,'Precalc_yBys')) + 1;
    Precalc_yBys = varargin{ind};
else
    % if these are not provided calculate them
    [ ~, Precalc_Bs, Precalc_Bs_flat, Precalc_yBys ] = CalculateSimilarities( num_seqs, x, similarityFNs, sparsityFNs, y, const);
end

% Reinitialisation attempts to find a better starting point for the
% model training (sometimes helps, sometimes doesn't)
if(reinit)

    rng(0);

    % By default try 200 times, but this can be overridden
    num_reinit = 200;

    if(sum(strcmp(varargin,'num_reinit')))
        num_reinit = varargin{find(strcmp(varargin,'num_reinit')) + 1};
    end

    thetas_good = cell(num_reinit, 1);
    lhoods = zeros(num_reinit, 1);
    for i=1:num_reinit
        initial_Theta = randInitializeWeights(size(thetas,2)-1, numel(alphas));
        lhoods(i) = LogLikelihoodCCNF(y, x, alphas, betas, initial_Theta, lambda_a, lambda_b, lambda_th, Precalc_Bs_flat, [], [], [], [], const, num_seqs);
        thetas_good{i} = initial_Theta;
    end
    [~,ind_max] = max(lhoods);
    thetas = thetas_good{ind_max};
end

params = [alphas; betas; thetas(:)];

if(any(strcmp(varargin,'lbfgs')))
    options = optimset('Algorithm','interior-point','GradObj','on', 'Hessian', 'lbfgs', 'TolX', thresholdX, 'TolFun', thresholdFun, 'display', 'off');
else
    options = optimset('Algorithm','interior-point','GradObj','on', 'Hessian', 'bfgs', 'TolX', thresholdX, 'TolFun', thresholdFun, 'display', 'off');
end

if(any(strcmp(varargin,'max_iter')))
    options.MaxIter = varargin{find(strcmp(varargin,'max_iter')) + 1};
end

objectiveFun = @(params)objectiveFunction(params, numel(alphas), numel(betas), size(thetas), lambda_a, lambda_b, lambda_th, Precalc_Bs, x, y, Precalc_yBys, Precalc_Bs_flat, const);

lowerBound = [zeros(numel(alphas)+numel(betas),1); -Inf(numel(thetas),1)];

upperBound = Inf(numel(params),1);
params = fmincon(objectiveFun, params, [], [],[],[], lowerBound, upperBound, [], options);
alphas = params(1:numel(alphas));
betas = params(numel(alphas)+1:numel(alphas)+numel(betas));
thetas = reshape(params(numel(alphas) + numel(betas) + 1:end), size(thetas));

final_likelihood = LogLikelihoodCCNF(y, x, alphas, betas, thetas, lambda_a, lambda_b, lambda_th, Precalc_Bs_flat, [], [], [], [], const, num_seqs);

end

function [loss, gradient] = objectiveFunction(params, numAlpha, numBeta, sizeTheta, lambda_a, lambda_b, lambda_th, PrecalcQ2s, x, y, PrecalcYqDs, PrecalcQ2sFlat, const)

alphas = params(1:numAlpha);
betas = params(numAlpha+1:numAlpha+numBeta);
thetas = reshape(params(numAlpha + numBeta + 1:end), sizeTheta);

num_seqs = size(PrecalcYqDs,1);

[gradient, SigmaInvs, CholDecomps, Sigmas, bs, all_x_resp] = gradientCCNF(params, numAlpha, numBeta, sizeTheta, lambda_a, lambda_b, lambda_th, PrecalcQ2s, x, y, PrecalcYqDs, PrecalcQ2sFlat, const, num_seqs);

% as bfgs does gradient descent rather than ascent, negate the results
gradient = -gradient;
loss = -LogLikelihoodCCNF(y, x, alphas, betas, thetas, lambda_a, lambda_b, lambda_th, PrecalcQ2sFlat, SigmaInvs, CholDecomps, Sigmas, bs, const, num_seqs, all_x_resp);
end
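For orientation, a minimal usage sketch (not part of this commit; the toy data, sizes, and regularisation values are illustrative assumptions):

    % Toy problem: 10 sequences of length 20 with 5 features each
    rng(0);
    x = cell(10,1); y = cell(10,1);
    for q = 1:10
        x{q} = rand(20, 5);
        y{q} = rand(20, 1);
    end
    % one similarity linking neighbouring frames; no sparsity terms
    simFNs = {@(xq) similarityNeighbor(xq, 1)};
    n_neurons = 7;
    alphas = ones(n_neurons, 1);
    betas  = ones(numel(simFNs), 1);
    thetas = randInitializeWeights(5, n_neurons);  % 7 x 6, includes the bias column
    [alphas, betas, thetas, logL] = CCNF_training_bfgs(1e-8, 1e-4, x, y, ...
        alphas, betas, thetas, 1e-2, 1e-2, 1e-2, simFNs, {}, 'const', true);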
19
model_training/CCNF/CCNF/lib/CalcSigmaCCNFflat.m
Normal file
@@ -0,0 +1,19 @@
function [ SigmaInv] = CalcSigmaCCNFflat(alphas, betas, n, precalc_B_without_beta, precalc_eye, precalc_zeros)
%CALCSIGMACCNFflat Computing SigmaInv matrices (represented as a vector as
%it is a symmetric matrix)

A = sum(alphas) .* precalc_eye;

% calculating the B + C from the paper (here referred to as B)
Btmp = precalc_B_without_beta * betas;

B = precalc_zeros;
on = tril(true(n,n));
B(on) = Btmp;
B = B';
B(on) = Btmp;

SigmaInv = 2 * (A + B);

end
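The precalc_B_without_beta argument stores only the lower triangle of each symmetric B^(k), one column per k; a small standalone illustration of the unflattening trick used above (values are arbitrary):

    n = 3;
    on = tril(true(n,n));
    Bfull = magic(n) + magic(n)';          % any symmetric matrix
    flat = Bfull(on);                      % keep the lower triangle only
    B = zeros(n); B(on) = flat; B = B'; B(on) = flat;
    assert(isequal(B, Bfull));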
14
model_training/CCNF/CCNF/lib/CalcbCCNF.m
Normal file
@@ -0,0 +1,14 @@
function b = CalcbCCNF( alpha, theta, x, resps)
%CALCBCCNF Compute the b from the CCNF equation

% Either the responses from the neural layers are precomputed and provided
% in resps, or they need to be computed here
if(nargin < 4)
    X = [ones(size(x,1),1), x];
    h1 = 1./(1 + exp(-theta * X'));
    b = (2 * alpha' * h1)';
else
    b = (2 * alpha' * resps)';
end
end
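A quick consistency sketch (illustrative sizes only): the two branches agree when resps holds the precomputed sigmoid responses:

    theta = rand(4, 3); x = rand(6, 2); alpha = rand(4, 1);
    X = [ones(size(x,1),1), x];
    resps = 1./(1 + exp(-theta * X'));
    assert(norm(CalcbCCNF(alpha, theta, x) - CalcbCCNF(alpha, [], [], resps)) < 1e-12);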
155
model_training/CCNF/CCNF/lib/CalculateSimilarities.m
Normal file
@@ -0,0 +1,155 @@
function [ Similarities, B_without_beta, B_without_beta_flat, y_B_y ] = CalculateSimilarities( n_sequences, x, similarityFNs, sparsityFNs, y, const)
%CALCULATESIMILARITIES Precompute per-sequence similarity/sparsity matrices,
%   the B^(k) terms (full and flattened lower-triangular) and y'*B^(k)*y

K = numel(similarityFNs);
K2 = numel(sparsityFNs);

%calculate similarity measures for each of the sequences
Similarities = cell(n_sequences, 1);
B_without_beta = cell(n_sequences,1);
B_without_beta_flat = cell(n_sequences,1);

y_B_y = zeros(n_sequences, K + K2);

if(~const)

    similarities = cell(K, 1);
    sparsities = cell(K2, 1);

    % y can either be in cell format (different length sequences) or in
    % matrix format (same length sequences)
    beg_ind = 1;

    if(iscell(y))
        end_ind = numel(y{1});
        y_cell = true;
    else
        end_ind = size(y,1);
        y_cell = false;
    end

    for q = 1 : n_sequences

        % don't take the bias term
        xq = x(2:end, beg_ind:end_ind);

        sample_length = end_ind - beg_ind + 1;

        Similarities{q} = zeros([sample_length, sample_length, K+K2]);

        B_without_beta{q} = cell(K+K2,1);

        B_without_beta_flat{q} = zeros((sample_length*(sample_length+1))/2,K+K2);

        % go over all of the similarity metrics and construct the
        % similarity matrices

        if(y_cell)
            yq = y{q};
        else
            yq = y(:,q);
        end
        for k=1:K

            if(q==1)
                similarities{k} = similarityFNs{k}(xq');
            end

            Similarities{q}(:,:,k) = similarities{k};
            S = Similarities{q}(:,:,k);
            D = diag(sum(S));

            B_without_beta{q}{k} = D - S;
            B = D - S;

            B_without_beta_flat{q}(:,k) = B(logical(tril(ones(size(S)))));

            y_B_y(q,k) = -yq'*B*yq;

        end

        for k=1:K2

            % this is constant so don't need to recalc
            if(q==1)
                sparsities{k} = sparsityFNs{k}(xq');
            end

            Similarities{q}(:,:,K+k) = sparsities{k};
            S = Similarities{q}(:,:,K+k);
            D = diag(sum(S));

            B_without_beta{q}{K+k} = D + S;
            B = D + S;

            B_without_beta_flat{q}(:,K+k) = B(logical(tril(ones(size(S)))));

            y_B_y(q,K+k) = -yq'*B*yq;

        end

        % Update the references to sequence start/end
        if(q ~= n_sequences)
            beg_ind = end_ind + 1;
            if(iscell(y))
                end_ind = end_ind + numel(y{q+1});
            else
                end_ind = end_ind + size(y,1);
            end
        end
    end
else
    sample_length = size(x,2)/n_sequences;

    similarities = cell(K, 1);
    sparsities = cell(K2, 1);

    B_without_beta = {cell(K+K2,1)};
    B_without_beta_flat = {zeros((sample_length*(sample_length+1))/2,K+K2)};
    Similarities = {zeros([sample_length, sample_length, K+K2])};

    beg_ind = 1;
    end_ind = sample_length;

    % don't take the bias term
    xq = x(2:end, beg_ind:end_ind);

    % go over all of the similarity metrics and construct the
    % similarity matrices
    for k=1:K
        similarities{k} = similarityFNs{k}(xq');

        Similarities{1}(:,:,k) = similarities{k};

        S = Similarities{1}(:,:,k);
        D = diag(sum(S));

        B_without_beta{1}{k} = D - S;

        B = D - S;
        % flatten the symmetric matrix to save space
        B_without_beta_flat{1}(:,k) = B(logical(tril(ones(size(S)))));

        y_B_y(:,k) = diag(-y'*B*y);

    end
    for k=1:K2
        % this is constant so don't need to recalc
        sparsities{k} = sparsityFNs{k}(xq');

        Similarities{1}(:,:,K+k) = sparsities{k};
        S = Similarities{1}(:,:,K+k);
        D = diag(sum(S));

        B_without_beta{1}{K+k} = D + S;
        B = D + S;

        B_without_beta_flat{1}(:,K+k) = B(logical(tril(ones(size(S)))));

        y_B_y(:,K+k) = diag(-y'*B*y);

    end
end
end
139
model_training/CCNF/CCNF/lib/LogLikelihoodCCNF.m
Normal file
@@ -0,0 +1,139 @@
function logL = LogLikelihoodCCNF(ys, xs, alphas, betas, thetas,...
    lambda_a, lambda_b, lambda_th, Precalc_Bs_flat,...
    SigmaInvs, ChDecomps, Sigmas, bs, const, num_seq, all_X_resp)
% Calculating the log likelihood of the CCNF

logL = 0;

% If sequences are of different lengths
if(~const)

    % y can either be in cell format (different length sequences) or in
    % matrix format (same length sequences)
    beg_ind = 1;

    if(iscell(ys))
        end_ind = numel(ys{1});
        y_cell = true;
    else
        end_ind = size(ys,1);
        y_cell = false;
    end

    for q=1:num_seq

        % Don't take the bias term
        xq = xs(2:end, beg_ind:end_ind);

        if(y_cell)
            yq = ys{q};
        else
            yq = ys(:,q);
        end

        n = size(xq, 2);

        % Compute b if not provided (they might be, as
        % calculation of gradient involves these terms)
        if(~isempty(bs))
            b = bs(beg_ind:end_ind)';
        else
            b = CalcbCCNF(alphas, thetas, xq');
        end

        % Same goes for the inverse of Sigma
        if(isempty(SigmaInvs))
            precalc_eye = eye(n);
            precalc_zeros = zeros(n);

            [SigmaInv] = CalcSigmaCCNFflat(alphas, betas, n, Precalc_Bs_flat{q}, precalc_eye, precalc_zeros);
            mu = SigmaInv \ b;

            % Used for normalisation term
            L = chol(SigmaInv);
        else
            SigmaInv = SigmaInvs{q};
            Sigma = Sigmas{q};
            mu = Sigma * b;

            % Used for normalisation term
            L = ChDecomps{q};
        end

        % normalisation = 1/((2*pi)^(n/2)*sqrt(det(Sigma)));
        % Removing the division by pi, as it is constant
        % normalisation = 1/(sqrt(det(Sigma)));
        % flipping around the determinant of SigmaInv, as det(inv(Sigma)) = 1/det(Sigma)
        % normalisation = log(sqrt(det(SigmaInv)));

        % log of normalisation using Cholesky decomposition (faster and more
        % numerically stable)
        log_normalisation = sum(log(diag(L))); % no times 2 here as we calculate the square root of the determinant

        % prob_q = normalisation * exp(-0.5 * (y - mu)'*SigmaInv*(y-mu));
        % applying a logarithm to this leads to
        % logLq = log(normalisation) + (-0.5 * (yq - mu)'*SigmaInv*(yq-mu));
        logLq = log_normalisation + (-0.5 * (yq - mu)'*SigmaInv*(yq-mu));

        % Add the current likelihood to the running sum
        logL = logL + logLq;

        % Update the references to sequence start/end
        if(q ~= num_seq)
            beg_ind = end_ind + 1;
            if(iscell(ys))
                end_ind = end_ind + numel(ys{q+1});
            else
                end_ind = end_ind + size(ys,1);
            end
        end
    end
else

    % A version where each sequence is the same length and has the same
    % connections
    seq_length = size(ys,1);
    num_seqs = size(ys,2);
    if(isempty(SigmaInvs))

        % If not provided compute the neuron activation (Response)
        if(nargin < 16)
            all_X_resp = 1./(1 + exp(-thetas * xs));
        end

        % Combine the neuron responses to b
        all_bs = 2*alphas' * all_X_resp;

        precalc_eye = eye(seq_length);
        precalc_zeros = zeros(seq_length);

        % Compute Sigma for one of the sequences (same for all so can
        % reuse)
        [SigmaInv] = CalcSigmaCCNFflat(alphas, betas, seq_length, Precalc_Bs_flat{end}, precalc_eye, precalc_zeros);

        % A faster way of inverting a symmetric matrix
        CholDecomp = chol(SigmaInv);
        Sigma = CholDecomp\(CholDecomp'\precalc_eye);

        % mu values associated with each time step
        mus = Sigma * reshape(all_bs, seq_length, num_seqs);

    else
        SigmaInv = SigmaInvs;
        CholDecomp = ChDecomps;
        Sigma = Sigmas;
        mus = Sigma * reshape(bs, seq_length, num_seqs);
    end

    log_normalisation = num_seqs * sum(log(diag(CholDecomp)));

    % Compute the sum across every sequence of
    % (yq - mu)'*SigmaInv*(yq-mu) and add to the normalisation term
    ymu = (ys - mus);
    y1 = SigmaInv * ymu;

    logL = log_normalisation - 0.5 * ymu(:)'* y1(:);
end

% add the regularisation term
logL = logL - lambda_b * (betas'*betas)/2 - lambda_a * (alphas'*alphas)/2 - lambda_th * (thetas(:)'*thetas(:))/2;
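The Cholesky-based normalisation above relies on det(SigmaInv) = prod(diag(chol(SigmaInv)))^2; a standalone numerical check (arbitrary SPD matrix, not from this commit):

    A = rand(5); SigmaInv = A*A' + 5*eye(5);   % symmetric positive definite
    L = chol(SigmaInv);
    assert(abs(sum(log(diag(L))) - 0.5*log(det(SigmaInv))) < 1e-10);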
BIN
model_training/CCNF/CCNF/lib/appendix.pdf
Normal file
Binary file not shown.
112
model_training/CCNF/CCNF/lib/evaluate_CCNF_model.m
Normal file
@@ -0,0 +1,112 @@
function [ correlations, rms, mean_correlation, mean_RMSE, long_correlation, long_RMSE, predictions, gts ] = evaluate_CCNF_model( alphas, betas, thetas, x, y, similarityFNs, sparsityFNs, offset, scaling, verbose, PrecalcQ2sFlat)
%evaluate_CCNF_model Evaluate the trained model on test (or training) data

% For visualising time series predictions
num_x_plots = 8;
num_y_plots = 10;

total_plots = num_x_plots * num_y_plots;

if(iscell(x))
    num_seqs = numel(x);
    x = cell2mat(x)';
    % add a bias term
    x = cat(1, ones(1,size(x,2)), x);
else
    % if not a cell it has already been flattened, and is constant
    % (most likely)
    num_seqs = size(y,2);
end

% if not sure about const assume it is not
const = false;

if(nargin < 11)
    [ ~, ~, PrecalcQ2sFlat, ~ ] = CalculateSimilarities( num_seqs, x, similarityFNs, sparsityFNs, y, const);
end

correlations = zeros(num_seqs, 1);
rms = zeros(num_seqs, 1);

% concatenated data for an alternative correlation
y_predConcat = [];
y_trueConcat = [];

% Predict each sequence
for q=1:num_seqs

    if(iscell(y))
        seq_length = size(y{q},1);
        yq = y{q};
    else
        seq_length = size(y,1);
        yq = y(:,q);
    end

    X = x(:,(q-1)*seq_length+1:q*seq_length);

    h1 = 1./(1 + exp(-thetas * X));
    b = (2 * alphas' * h1)';

    PrecalcQ2flat = PrecalcQ2sFlat{q};

    precalc_eye = eye(seq_length);
    precalc_zeros = zeros(seq_length);

    SigmaInv = CalcSigmaCCNFflat(alphas, betas, seq_length, PrecalcQ2flat, precalc_eye, precalc_zeros);

    y_est = SigmaInv \ b;

    % Can optionally supply the scaling and offset used on the training
    % labels to be applied inversely
    y_est = y_est/scaling + offset;

    if(numel(y_est) > 1)
        R = corrcoef(y_est, yq);
        correlations(q) = R(1,2);
    end

    rms(q) = sqrt( mean((y_est - yq).^2) );

    y_predConcat = cat(1, y_predConcat, y_est);
    y_trueConcat = cat(1, y_trueConcat, yq);

    if(verbose)

        if(mod(q,total_plots) == 1)
            figure;
            remainingPlots = num_seqs - q;
            if(remainingPlots < total_plots)
                num_y_plots = ceil(remainingPlots / num_x_plots);
            end
        end

        subplot(num_y_plots,num_x_plots,mod(q-1,total_plots)+1);
        t = 1:seq_length;
        plot(t,yq,'g',t,y_est,'b');
        title(sprintf('C %.2f, R %.2f', correlations(q), rms(q)));
        set(gca, 'XTick', [], 'YTick', []);

    end

end

% Compute the error metrics
mean_correlation = mean(correlations);
mean_RMSE = mean(rms);
long_correlation = corr(y_predConcat, y_trueConcat).^2;

long_RMSE = sqrt(mean((y_predConcat - y_trueConcat).^2));
predictions = y_predConcat;
gts = y_trueConcat;

if(verbose)
    figure
    plot([1:numel(y_trueConcat)],y_trueConcat,'g',[1:numel(y_trueConcat)],y_predConcat,'b');
    title(sprintf('C %.2f, R %.2f', long_correlation, long_RMSE));
    set(gca, 'XTick', [], 'YTick', []);
end

end
185
model_training/CCNF/CCNF/lib/gradientCCNF.m
Normal file
@@ -0,0 +1,185 @@
function [ gradientParams, SigmaInvs, CholDecomps, Sigmas, bs, allXresp] = gradientCCNF( params, num_alpha, numBeta, sizeTheta, lambda_a, lambda_b, lambda_th, Precalc_Bs, x, y, Precalc_yBys, Precalc_Bs_flat, constant, num_seqs)
%gradientCCNF Compute the CCNF gradient (and the reusable Sigma terms)
%   over all of the sequences

% pick out the relevant terms (unpack)
alphas_init = params(1:num_alpha);
betasInit = params(num_alpha+1:num_alpha+numBeta);
thetasInit = reshape(params(num_alpha+numBeta+1:end), sizeTheta);

% Compute the response from the neural layers
allXresp = 1./(1 + exp(-thetasInit * x));
Xt = x;

bs = 2*alphas_init' * allXresp;

% This is precalculated for the next step and is basically the
% feedforward step of the neural net
Z_precalc = 2 * (allXresp .* (1-allXresp));

% These are the outputs weighted by the alphas (see eq TODO)
db2_precalc = bsxfun(@times, Z_precalc, alphas_init);

num_feats = sizeTheta(2);

if(constant)

    seq_length = size(x,2)/num_seqs;

    % As the similarities are the same across all series we can reuse our
    % Sigma and SigmaInv calculations

    I = eye(seq_length);
    [SigmaInv] = CalcSigmaCCNFflat(alphas_init, betasInit, seq_length, Precalc_Bs_flat{1}, I, zeros(seq_length));
    CholDecomp = chol(SigmaInv);

    % This is a faster way of inverting a symmetric matrix
    Sigma = CholDecomp\(CholDecomp'\I);
    Sigma_trace = trace(Sigma);

    % mu values associated with each time step
    mus = Sigma * reshape(bs, seq_length, num_seqs);

    % difference between actual and prediction (error)
    diff = (y - mus);

    db_precalc_mult = bsxfun(@times, db2_precalc, diff(:)');

    % Equation 46 from the appendix
    gradientThetasT = Xt * db_precalc_mult';

    % Reshape into the correct format
    gradientThetasT = gradientThetasT(:)';

    gradientThetasT = reshape(gradientThetasT, sizeTheta(2), sizeTheta(1))';
    gradientThetasT = gradientThetasT(:);

    % Some useful precalculations

    % for every sequence get a dot product with itself
    yy = dot(y,y);

    % same goes for the mu
    mumu = dot(mus,mus);

    % calculating the derivative of L with respect to alpha_k (Equation 27)
    % gradientAlphas = (-yq'*yq +(2*yq'*D')' -2 * D * mu + sum(mu.^2) + trace(Sigma));
    % allXresp is D
    gradient_alphas_add = -sum(yy) + sum(mumu) + num_seqs * Sigma_trace;
    gradient_alphas = 2 * allXresp * (y(:) - mus(:)) + gradient_alphas_add;

    gradient_betas = zeros(numBeta, 1);

    % calculating the derivative of log(L) with respect to the betas
    for k=1:numBeta

        % From Equation 38 (and 39 for gamma)
        % gradient = -yq'*B^(k)*yq + mu'*B^(k)*mu + Vec(Sigma)'*Vec(B^(k))

        % We precalculate B^(k) (equation 30), as it does not change
        % over the course of optimisation
        B_k = Precalc_Bs{1}{k};

        % the precalculated -yq'*B_k*yq can be used as well, as it does not
        % change (stored in Precalc_yBys)
        yq_B_k_yq = sum(Precalc_yBys(:,k));

        % A vectorised version of mu'*B^(k)*mu
        B_k_mu = B_k*mus;
        mu_B_k_mu = mus(:)' * B_k_mu(:);

        % Vec(Sigma)'*Vec(B^(k)) can be computed as follows:
        partition_term = num_seqs * Sigma(:)'*B_k(:);

        % Equations 38 and 39 basically
        dLdb = yq_B_k_yq + mu_B_k_mu + partition_term;

        gradient_betas(k) = dLdb;
    end

    gradientParams = [gradient_alphas;gradient_betas;gradientThetasT];

    SigmaInvs = SigmaInv;
    CholDecomps = CholDecomp;
    Sigmas = Sigma;

else

    SigmaInvs = cell(num_seqs, 1);
    CholDecomps = cell(num_seqs, 1);
    Sigmas = cell(num_seqs, 1);
    gradients = zeros(num_seqs, numel(params));

    a_precalc = zeros(sizeTheta(2)*sizeTheta(1), size(allXresp,2));

    for i=1:size(db2_precalc,1)
        a_precalc((i-1)*num_feats+1:i*num_feats,:) = bsxfun(@times, Xt, db2_precalc(i,:));
    end

    % y can either be in cell format (different length sequences) or in
    % matrix format (same length sequences)
    beg_ind = 1;

    if(iscell(y))
        end_ind = numel(y{1});
        y_cell = true;
    else
        end_ind = size(y,1);
        y_cell = false;
    end

    % Go through every sequence summing the gradients
    for q = 1 : num_seqs

        currResp = allXresp(:,beg_ind:end_ind);
        currB = bs(beg_ind:end_ind)';

        PrecalcB = Precalc_Bs{q};
        PrecalcBFlat = Precalc_Bs_flat{q};

        if(y_cell)
            yq = y{q};
        else
            yq = y(:,q);
        end

        xq = x(2:end, beg_ind:end_ind);

        % Used for the equation 46 computation
        a_precalc_curr = a_precalc(:,beg_ind:end_ind);

        precalc_eye = eye(numel(yq));
        precalc_zeros = zeros(numel(yq));

        [ gradientsAlphas, gradientsBetas, gradientsThetas, SigmaInv, CholDecomp, Sigma ] = gradientCCNF_per_seq(alphas_init, betasInit, thetasInit, PrecalcB, xq, yq, currResp, currB, Precalc_yBys(q, :), PrecalcBFlat, a_precalc_curr, precalc_eye, precalc_zeros);

        gradients(q,:) = [gradientsAlphas; gradientsBetas; gradientsThetas(:)];
        SigmaInvs{q} = SigmaInv;
        CholDecomps{q} = CholDecomp;
        Sigmas{q} = Sigma;

        % Update the references to sequence start/end
        if(q ~= num_seqs)
            beg_ind = end_ind + 1;
            if(iscell(y))
                end_ind = end_ind + numel(y{q+1});
            else
                end_ind = end_ind + size(y,1);
            end
        end

    end

    gradientParams = sum(gradients,1)';

end

% Add the regularisation term
regAlpha = alphas_init * lambda_a;
regBeta = betasInit * lambda_b;
regTheta = thetasInit * lambda_th;

gradientParams = gradientParams - [regAlpha; regBeta; regTheta(:)];
end
53
model_training/CCNF/CCNF/lib/gradientCCNF_per_seq.m
Normal file
@@ -0,0 +1,53 @@
function [ gradient_alphas, gradient_betas, gradient_thetas, SigmaInv, CholDecomp, Sigma ] = gradientCCNF_per_seq( alphas, betas, thetas, precalc_Bk, xq, yq, curr_resp, b, precalc_y_B_y, Precalc_Bk_flat, a_precalc, precalc_eye, precalc_zeros)
%gradientCCNF_per_seq Compute the partial derivatives for a single sequence

% This is an optimised version as it does not use the whole matrix but
% a lower diagonal part due to symmetry
n = size(xq, 2);
[SigmaInv] = CalcSigmaCCNFflat(alphas, betas, n, Precalc_Bk_flat, precalc_eye, precalc_zeros);

% Get the actual Sigma from our SigmaInv
% Optimised for symmetric matrices
CholDecomp = chol(SigmaInv);
Sigma = CholDecomp\(CholDecomp'\precalc_eye);

% mu = SigmaInv \ b = Sigma * b;
% as we've calculated Sigma already, this is equivalent to the above
mu = Sigma * b;

% calculating the derivative of L with respect to alpha_k (Equation 27)
% gradientAlphas = (-yq'*yq +(2*yq'*D')' -2 * D * mu + sum(mu.^2) + trace(Sigma));
% curr_resp is D from the paper

yqq = -yq'*yq;
curr_resp_yq = (2*curr_resp*yq);
gradient_alphas = yqq + curr_resp_yq + -2 * curr_resp * mu + mu' * mu + sum(diag(Sigma));

gradient_betas = zeros(size(betas));

K2 = numel(betas);

% calculating the derivative of log(L) with respect to the betas
for k=1:K2
    % From Equation 38 (and 39 for gamma)
    % gradient = -yq'*B^(k)*yq + mu'*B^(k)*mu + Vec(Sigma)'*Vec(B^(k))

    % We precalculate B^(k) (equation 30), as it does not change
    % over the course of optimisation
    B_k = precalc_Bk{k};

    % Vec(Sigma)'*Vec(B^(k)) can be computed as follows:
    partition_gradient = Sigma(:)'*B_k(:);

    % Equations 38 and 39 basically
    dLdb = precalc_y_B_y(k) + mu'*B_k*mu + partition_gradient;

    gradient_betas(k) = dLdb;
end

% Equation 46 from the appendix
gradient_thetas = (yq - mu)' * a_precalc';

gradient_thetas = (reshape(gradient_thetas, size(thetas')))';

end
35
model_training/CCNF/CCNF/lib/randInitializeWeights.m
Normal file
@@ -0,0 +1,35 @@
function W = randInitializeWeights(L_in, L_out)
%RANDINITIALIZEWEIGHTS Randomly initialize the weights of a layer with L_in
%incoming connections and L_out outgoing connections
%   W = RANDINITIALIZEWEIGHTS(L_in, L_out) randomly initializes the weights
%   of a layer with L_in incoming connections and L_out outgoing
%   connections.
%
%   Note that W should be set to a matrix of size(L_out, 1 + L_in) as
%   the first column of W handles the "bias" terms
%

% epsilon_init = 0.12;
epsilon_init = 1/sqrt(L_in);
W = rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init;

% ====================== YOUR CODE HERE ======================
% Instructions: Initialize W randomly so that we break the symmetry while
%               training the neural network.
%
% Note: The first row of W corresponds to the parameters for the bias units
%
% =========================================================================

end
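A quick sketch of the resulting shape and scale (illustrative sizes): weights are uniform in [-1/sqrt(L_in), 1/sqrt(L_in)] and include a bias column:

    W = randInitializeWeights(5, 7);   % 7 x 6, the first column multiplies the bias
    assert(all(size(W) == [7, 6]) && max(abs(W(:))) <= 1/sqrt(5));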
15
model_training/CCNF/CCNF/lib/similarityNeighbor.m
Normal file
@@ -0,0 +1,15 @@
function [ SimilarityMatrix ] = similarityNeighbor( x, n, ~)
%similarityNeighbor Create a link for the n'th neighbour

sz = size(x,1);
SimilarityMatrix = eye(sz);

i = 1:sz-n;
SimilarityMatrix(sub2ind([sz, sz], i+n,i)) = 1;
SimilarityMatrix(sub2ind([sz, sz], i,i+n)) = 1;

DiagMask = ones(size(x, 1)) - eye(size(x,1));
SimilarityMatrix = SimilarityMatrix .* DiagMask;
SimilarityMatrix = SimilarityMatrix + eye(size(x, 1));

end
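For a 4-frame sequence with n = 1 this produces the chain below (unit diagonal plus first-neighbour links); the feature values themselves do not matter, only the number of rows:

    S = similarityNeighbor(zeros(4, 1), 1);
    % S = [1 1 0 0
    %      1 1 1 0
    %      0 1 1 1
    %      0 0 1 1]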
59
model_training/CCNF/CCNF/lib/similarity_neighbor_grid.m
Normal file
@@ -0,0 +1,59 @@
function [ SimilarityMatrix ] = similarity_neighbor_grid( x, side, types)
%similarity_neighbor_grid Create neighbourhood similarities (for a grid)

% this assumes that the patch is laid out with first column, then second
% column, ... final column (column major)

SimilarityMatrix = eye(side*side);

% types - 1 - horizontal, 2 - vertical, 3 - diagonal (bl-tr), 4 -
% diagonal (br - tl)
for t=1:numel(types)

    if(types(t) == 1)

        % for horizontal we want to link both neighbours
        % (which are offset from the points by height)
        i = 1:(side*side-side);
        % create the neighbouring links for i
        SimilarityMatrix(sub2ind([side^2, side^2], i, i+side)) = 1;
        SimilarityMatrix(sub2ind([side^2, side^2], i+side, i)) = 1;

    end
    if(types(t) == 2)

        % for vertical we want to link both neighbours except at edge
        % cases which are mod(y_loc,side) = 0 as they are at the edges
        i = 1:side*side;
        i_to_rem = i(mod(i, side) == 0);
        i_both = setdiff(i, i_to_rem);
        % create the neighbouring links for i
        SimilarityMatrix(sub2ind([side^2, side^2], i_both+1, i_both)) = 1;
        SimilarityMatrix(sub2ind([side^2, side^2], i_both, i_both+1)) = 1;

    end
    if(types(t) == 3)

        % for diagonal to top right, and bottom left don't use the right most column
        i = 1:(side^2)-side;
        i_to_rem = i(mod(i-1, side) == 0);
        i_both = setdiff(i, i_to_rem);
        % create the neighbouring links for i
        SimilarityMatrix(sub2ind([side^2, side^2], i_both+side-1, i_both)) = 1;
        SimilarityMatrix(sub2ind([side^2, side^2], i_both, i_both+side-1)) = 1;
    end
    if(types(t) == 4)

        % for diagonal to top left, and bottom right don't use the right most column
        i = 1:(side^2)-side;
        i_to_rem = i(mod(i, side) == 0);
        i_both = setdiff(i, i_to_rem);
        % create the neighbouring links for i
        SimilarityMatrix(sub2ind([side^2, side^2], i_both+side+1, i_both)) = 1;
        SimilarityMatrix(sub2ind([side^2, side^2], i_both, i_both+side+1)) = 1;

    end

end
assert(isequal(SimilarityMatrix, SimilarityMatrix'));
end
@@ -0,0 +1,69 @@
function [ SimilarityMatrix ] = similarity_neighbor_grid_further( x, side, types, dist)
%similarity_neighbor_grid_further Create neighbourhood similarities for a
%   grid, linking neighbours that are dist steps apart

% this assumes that the patch is laid out with first column, then second
% column, ... final column (column major)

% dist = 2;
SimilarityMatrix = eye(side*side);

% types - 1 - horizontal, 2 - vertical, 3 - diagonal (bl-tr), 4 -
% diagonal (br - tl)
for t=1:numel(types)

    if(types(t) == 1)

        % for horizontal we want to link both neighbours
        % (which are offset from the points by height)
        i = 1:(side*side-side*dist);
        % create the neighbouring links for i
        SimilarityMatrix(sub2ind([side^2, side^2], i, i+side*dist)) = 1;
        SimilarityMatrix(sub2ind([side^2, side^2], i+side*dist, i)) = 1;

    end
    if(types(t) == 2)

        % for vertical we want to link both neighbours except at edge
        % cases which are mod(y_loc,side) = 0 as they are at the edges
        i = 1:side*side;
        i_to_rem = [];
        for s=1:dist
            i_to_rem = union(i_to_rem, i(mod(i+s-1, side) == 0));
        end
        i_both = setdiff(i, i_to_rem);
        % create the neighbouring links for i
        SimilarityMatrix(sub2ind([side^2, side^2], i_both+dist, i_both)) = 1;
        SimilarityMatrix(sub2ind([side^2, side^2], i_both, i_both+dist)) = 1;

    end
    if(types(t) == 3)

        % for diagonal to top right, and bottom left don't use the right most column
        i = 1:(side^2)-dist * side;
        i_to_rem = [];
        for s=1:dist
            i_to_rem = union(i_to_rem,i(mod(i-s, side) == 0));
        end
        i_both = setdiff(i, i_to_rem);
        % create the neighbouring links for i
        SimilarityMatrix(sub2ind([side^2, side^2], i_both+dist*side-dist, i_both)) = 1;
        SimilarityMatrix(sub2ind([side^2, side^2], i_both, i_both+dist*side-dist)) = 1;
    end
    if(types(t) == 4)

        % for diagonal to top left, and bottom right don't use the right most column
        i = 1:(side^2)-dist*side;
        i_to_rem = [];
        for s=1:dist
            i_to_rem = union(i_to_rem, i(mod(i+s-1, side) == 0));
        end
        i_both = setdiff(i, i_to_rem);
        % create the neighbouring links for i
        SimilarityMatrix(sub2ind([side^2, side^2], i_both+dist*side+dist, i_both)) = 1;
        SimilarityMatrix(sub2ind([side^2, side^2], i_both, i_both+dist*side+dist)) = 1;

    end

end
assert(isequal(SimilarityMatrix, SimilarityMatrix'));
end
22
model_training/CCNF/CCNF/lib/sparsity_grid.m
Normal file
@@ -0,0 +1,22 @@
function [ SparsityMatrix ] = sparsity_grid( x, side, width, width_end)
%sparsity_grid Build a sparsity (inhibition) matrix for a grid

% width and width_end define the start and end for the sparsity (or
% similarity) grid, allowing control over the enforced smoothness and
% sparsity/inhibition

SimilarityMatrix = zeros(side*side);
for i=1:width
    SimilarityMatrix = (similarity_neighbor_grid_further(x, side, [1,2,3,4], i) | SimilarityMatrix);
end

SimilarityMatrix_end = zeros(side*side);
for i=1:width_end
    SimilarityMatrix_end = (similarity_neighbor_grid_further(x, side, [1,2,3,4], i) | SimilarityMatrix_end);
end

SparsityMatrix = double(SimilarityMatrix_end & (~SimilarityMatrix));

assert(isequal(SparsityMatrix, SparsityMatrix'));
end
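In other words, links at grid distances width+1 .. width_end survive, while closer links (and the diagonal) are masked out; a small illustrative call (x is unused by these grid helpers, so [] can be passed):

    Sp = sparsity_grid([], 5, 1, 2);   % 25 x 25: only distance-2 links remain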
57
model_training/CCNF/CCRF/lib/CCRF_training_bfgs.m
Normal file
@@ -0,0 +1,57 @@
function [ alphas, betas, scaling, finalLikelihood] = CCRF_training_bfgs( num_seqs, thresholdX, thresholdFun, x, y, yUnnormed, alphas, betas, lambda_a, lambda_b, similarityFNs, Precalc_Bs, Precalc_Bs_flat, Precalc_yBys, varargin)
%CCRF_TRAINING_BFGS Performs CCRF training using BFGS given the initial
%state and the optimisation parameters

% if these are not provided calculate them, TODO this might be

% It is possible to predefine the component B^(k) required
% to compute the B term and the partial derivatives; yB^(k)y can also be
% predefined, as it does not change through the iterations
if(sum(strcmp(varargin,'PrecalcBs')) && sum(strcmp(varargin,'PrecalcBsFlat'))...
        && sum(strcmp(varargin,'Precalc_yBys')))

    ind = find(strcmp(varargin,'PrecalcBs')) + 1;
    Precalc_Bs = varargin{ind};

    ind = find(strcmp(varargin,'PrecalcBsFlat')) + 1;
    Precalc_Bs_flat = varargin{ind};

    ind = find(strcmp(varargin,'Precalc_yBys')) + 1;
    Precalc_yBys = varargin{ind};
else
    % if these are not provided calculate them
    [ ~, Precalc_Bs, Precalc_Bs_flat, Precalc_yBys ] = CalculateSimilarities( num_seqs, x, similarityFNs, y);
end

params = [alphas; betas];

objectiveFun = @(params)objectiveFunction(params, numel(alphas), lambda_a, lambda_b, Precalc_Bs, x, y, Precalc_yBys, Precalc_Bs_flat);

options = optimset('Algorithm','interior-point','GradObj','on', 'TolX', thresholdX, 'TolFun', thresholdFun, 'Hessian', 'bfgs', 'display','off', 'useParallel', 'Always');

if(sum(strcmp(varargin,'max_iter')))
    options.MaxIter = varargin{find(strcmp(varargin,'max_iter')) + 1};
end

params = fmincon(objectiveFun, params, [], [],[],[], zeros(numel(params),1), Inf(numel(params), 1), [], options);
alphas = params(1:numel(alphas));
betas = params(numel(alphas)+1:end);

finalLikelihood = LogLikelihoodCCRF(y, x, alphas, betas, lambda_a, lambda_b, Precalc_Bs_flat);
% fprintf('Final log likelihood at iteration; logL %f, learning rate\n', finalLikelihood);

% establish the scaling
scaling = getScaling2(alphas, betas, x, yUnnormed, Precalc_Bs);

end

function [loss, gradient] = objectiveFunction(params, numAlpha, lambda_a, lambda_b, PrecalcBs, x, y, Precalc_yBys, PrecalcBsFlat)

alphas = params(1:numAlpha);
betas = params(numAlpha+1:end);
[gradient, SigmaInvs, CholDecomps, Sigmas] = gradientCCRFFull(params, lambda_a, lambda_b, PrecalcBs, x, y, Precalc_yBys, PrecalcBsFlat);
% as bfgs does gradient descent rather than ascent, negate the results
gradient = -gradient;
loss = -LogLikelihoodCCRF(y, x, alphas, betas, lambda_a, lambda_b, PrecalcBsFlat, SigmaInvs, CholDecomps, Sigmas);
end
122
model_training/CCNF/CCRF/lib/CCRF_training_gradient_descent.m
Normal file
@@ -0,0 +1,122 @@
function [ alphas, betas, scaling, finalLikelihood] = CCRF_training_gradient_descent( nIterations, nExamples, learningRate, threshold, x, y, yUnnormed, masks, alphas, betas, lambda_a, lambda_b, similarityFNs, useIndicators, verbose)
%GRADIENTDESCENTCCRF Performs CCRF gradient descent given the initial state
%and gradient descent parameters
%   Detailed explanation goes here

if(verbose)
    logLikelihood = zeros(round(nIterations/10)+1, 1);
    alphaTrack = zeros(nIterations, numel(alphas));
    betaTrack = zeros(nIterations, numel(betas));
end

logAlphas = log(alphas);
logBetas = log(betas);

K = numel(similarityFNs);

%calculate similarity measures for each of the sequences
Similarities = cell(nExamples, 1);
PrecalcQ2s = cell(nExamples,1);
PrecalcQ2sFlat = cell(nExamples,1);

PrecalcYqDs = zeros(nExamples, K);

for q = 1 : nExamples

    yq = y{q};
    xq = x{q};
    mask = masks{q};

    n = size(yq, 1);
    Similarities{q} = zeros([n, n, K]);
    % PrecalcQ2s{q} = zeros([n, n, K]);
    PrecalcQ2s{q} = cell(K,1);
    % PrecalcQ2sFlat{q} = cell(K,1);
    PrecalcQ2sFlat{q} = zeros((n*(n+1))/2,K);
    % go over all of the similarity metrics and construct the
    % similarity matrices
    for k=1:K
        Similarities{q}(:,:,k) = similarityFNs{k}(xq, mask);
        S = Similarities{q}(:,:,k);
        D = diag(sum(S));
        B = D - S;
        % PrecalcQ2s{q}(:,:,k) = B;
        PrecalcQ2s{q}{k} = B;
        % PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
        PrecalcQ2sFlat{q}(:,k) = B(logical(tril(ones(size(S)))));
        PrecalcYqDs(q,k) = -yq'*B*yq;
    end
end

%stochastic gradient descent
for iter = 1 : nIterations
    prevAlphas = alphas;
    prevBetas = betas;

    for q = 1 : nExamples

        yq = y{q};
        xq = x{q};
        mask = masks{q};

        PrecalcQ2 = PrecalcQ2s{q};
        PrecalcQ2Flat = PrecalcQ2sFlat{q};
        [ logGradientsAlphas, logGradientsBetas] = gradientCCRF(alphas, betas, lambda_a, lambda_b, PrecalcQ2, xq, yq, mask, PrecalcYqDs(q, :), useIndicators, PrecalcQ2Flat);

        % [logGradientAlphasAnalytical, logGradientBetasAnalytical] = gradientAnalytical(PrecalcQ2, alphas, betas, lambda, xq, yq, mask);
        %
        % diffInGradientsAlpha = mean(abs(logGradientsAlphas - logGradientAlphasAnalytical));
        % diffInGradientsBeta = mean(abs(logGradientsBetas - logGradientBetasAnalytical));

        %update log alpha
        logAlphas = logAlphas + learningRate * logGradientsAlphas;
        alphas = exp(logAlphas);

        %update log beta
        logBetas = logBetas + learningRate * logGradientsBetas;
        betas = exp(logBetas);

        if(verbose)
            %record alpha and beta values for each iteration for debug purposes
            alphaTrack(iter,:) = alphas(:);
            betaTrack(iter,:) = betas;
        end
    end

    %check for convergence
    if (norm([prevAlphas;prevBetas] - [alphas;betas])/norm([prevAlphas;prevBetas]) < threshold || norm([logGradientsAlphas;logGradientsBetas]) < threshold)
        break;
    end

    if(verbose)
        if(mod(iter, 10)==0)
            logLikelihood(iter/10 + 1) = LogLikelihoodCCRF(y, x, masks, alphas, betas, lambda_a, lambda_b, PrecalcQ2sFlat, useIndicators);
            fprintf('Iteration %d; logL %f\n', iter, logLikelihood(iter/10 + 1));
        end
    end
end

% establish the scaling
scaling = getScaling(alphas, betas, x, yUnnormed, masks, PrecalcQ2s, useIndicators);

if(verbose)
    figure
    subplot(1,3,1)
    plot(betaTrack(1:iter,:));
    title('beta');
    subplot(1,3,2)
    plot(alphaTrack(1:iter,:))
    title('alpha');
    subplot(1,3,3)
    plot(logLikelihood(1:round(iter/10),:))
    title('log likelihood');
    finalLikelihood = LogLikelihoodCCRF(y, x, masks, alphas, betas, lambda_a, lambda_b, PrecalcQ2sFlat, useIndicators);
    fprintf('Final log likelihood at iteration %d; logL %f, learning rate %f\n', iter, finalLikelihood, learningRate);
else
    finalLikelihood = LogLikelihoodCCRF(y, x, masks, alphas, betas, lambda_a, lambda_b, PrecalcQ2sFlat, useIndicators);
    fprintf('Final log likelihood at iteration %d; logL %f, learning rate %f\n', iter, finalLikelihood, learningRate);
end

end
50
model_training/CCNF/CCRF/lib/CalcSigmaCCRF.m
Normal file
@@ -0,0 +1,50 @@
function [ SigmaInv] = CalcSigmaCCRF(alphas, betas, precalcBwithoutBeta )
%CALCSIGMACCRF Construct the inverse covariance SigmaInv for a sequence
% constructing the sigma

% the number of elements in the current sequence
n = size(precalcBwithoutBeta{1},1);

q1 = sum(alphas) * eye(n);

% the above code can be simplified by the following 2 lines of the
% inner loop; we want to do that for every beta however
K2 = numel(betas);

q2 = zeros([n,n]);

% calculating the q2 from the paper
for i=1:K2

    % We're basically performing the following calculation, but use
    % the precalculated D - S instead of doing it every iteration
    % S = Similarities(:,:,i);
    % D = diag(sum(S));
    % q = betas(i) * D - betas(i) * S;
    % q2s(:,:,i) = q;
    % q2 = q2 + betas(i)*precalcQ2withoutBeta(:,:,i);
    q2 = q2 + betas(i)*precalcBwithoutBeta{i};
end
% This is another alternative, does not seem to be faster
% q2old = sum(bsxfun(@times, precalcQ2withoutBeta, reshape(betas,[1,1,K2])),3);

% q2 = sum(q2s, 3);
% % An alternative way of calculating the above could be using bsxfun,
% % but this seems to be actually slower
% S = bsxfun(@times, Similarities, -reshape(betas,[1,1,K2]));
%
% % now need the diagonals
% d = sum(Similarities);
%
% I = repmat(eye(n), [1, 1, K2]);
% I = bsxfun(@times, I, reshape(betas,[1,1,K2]));
% D = bsxfun(@times, I, d);
%
% q2s = D + S;
% q2 = sum(q2s,3);

SigmaInv = 2 * (q1 + q2);

end
26
model_training/CCNF/CCRF/lib/CalcSigmaCCRFflat.m
Normal file
@@ -0,0 +1,26 @@
function [ SigmaInv] = CalcSigmaCCRFflat(alphas, betas, n, PrecalcB_flat)
%CALCSIGMACCRFFLAT Construct the inverse covariance SigmaInv from the
% flattened B terms (laid out in an efficient way for symmetric matrices)

A = sum(alphas) * eye(n);

% calculating the B from the paper
% using the precalculated lower triangular elements of B without beta
Btmp = PrecalcB_flat * betas;

% not faster

% now make it into a square symmetric matrix
B = zeros(n,n);
on = tril(true(n,n));
B(on) = Btmp;
B = B';
B(on) = Btmp;

% Combine A and B
SigmaInv = 2 * (A + B);

end
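A consistency sketch against the unflattened CalcSigmaCCRF earlier in this commit (toy chain similarities; reusing similarityNeighbor from the CCNF lib is purely for illustration):

    n = 4; K = 2; on = tril(true(n));
    Bk = cell(K,1); flat = zeros(n*(n+1)/2, K);
    for k = 1:K
        S = similarityNeighbor(zeros(n,1), k) - eye(n);   % off-diagonal links only
        B = diag(sum(S)) - S;
        Bk{k} = B; flat(:,k) = B(on);
    end
    alphas = [0.5; 0.25]; betas = [1; 2];
    assert(norm(CalcSigmaCCRF(alphas, betas, Bk) - CalcSigmaCCRFflat(alphas, betas, n, flat)) < 1e-12);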
14
model_training/CCNF/CCRF/lib/CalcbCCRF.m
Normal file
@@ -0,0 +1,14 @@
function b = CalcbCCRF( alpha, x)
%CALCBCCRF Compute the b vector from the CCRF equation

% b = zeros(size(x,1),1);
%
% for i=1:size(x,1)
%     b(i) = 2 * x(i,:) * alpha;
% end

% vectorising the above code
b = 2 * x * alpha;
end
85
model_training/CCNF/CCRF/lib/CalculateSimilarities.m
Normal file
@@ -0,0 +1,85 @@
function [ Similarities, PrecalcQ2s, PrecalcQ2sFlat, PrecalcYqDs ] = CalculateSimilarities( n_sequences, x, similarityFNs, y)
%CALCULATESIMILARITIES Summary of this function goes here
%   Detailed explanation goes here

K = numel(similarityFNs);

%calculate similarity measures for each of the sequences
Similarities = cell(n_sequences, 1);
PrecalcQ2s = cell(n_sequences,1);
PrecalcQ2sFlat = cell(n_sequences,1);

PrecalcYqDs = zeros(n_sequences, K);

if(iscell(x))
    for q = 1 : n_sequences

        xq = x{q};

        n = size(xq, 1);
        Similarities{q} = zeros([n, n, K]);

        PrecalcQ2s{q} = cell(K,1);

        PrecalcQ2sFlat{q} = zeros((n*(n+1))/2,K);
        % go over all of the similarity metrics and construct the
        % similarity matrices

        if(nargin > 3)
            yq = y{q};
        end

        for k=1:K
            Similarities{q}(:,:,k) = similarityFNs{k}(xq);
            S = Similarities{q}(:,:,k);
            D = diag(sum(S));
            % PrecalcQ2s{q}(:,:,k) = D - S;
            PrecalcQ2s{q}{k} = D - S;
            B = D - S;
            % PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
            PrecalcQ2sFlat{q}(:,k) = B(logical(tril(ones(size(S)))));
            if(nargin > 3)
                PrecalcYqDs(q,k) = -yq'*B*yq;
            end
        end
    end
else
    sample_length = size(x,2)/n_sequences;
    for q = 1 : n_sequences

        beg_ind = (q-1)*sample_length + 1;
        end_ind = q*sample_length;

        % don't take the bias term
        xq = x(2:end, beg_ind:end_ind);

        Similarities{q} = zeros([sample_length, sample_length, K]);

        PrecalcQ2s{q} = cell(K,1);

        PrecalcQ2sFlat{q} = zeros((sample_length*(sample_length+1))/2,K);

        % go over all of the similarity metrics and construct the
        % similarity matrices

        if(nargin > 3)
            yq = y(:,q);
        end

        for k=1:K
            Similarities{q}(:,:,k) = similarityFNs{k}(xq);
            S = Similarities{q}(:,:,k);
            D = diag(sum(S));
            % PrecalcQ2s{q}(:,:,k) = D - S;
            PrecalcQ2s{q}{k} = D - S;
            B = D - S;
            % PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
            PrecalcQ2sFlat{q}(:,k) = B(logical(tril(ones(size(S)))));
            if(nargin > 3)
                PrecalcYqDs(q,k) = -yq'*B*yq;
            end
        end
    end
end
end
173
model_training/CCNF/CCRF/lib/CalculateSimilarities_sparsity.m
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
function [ Similarities, PrecalcQ2s, PrecalcQ2sFlat, PrecalcYqDs ] = CalculateSimilarities_sparsity( n_sequences, x, similarityFNs, sparsityFNs, y, const)
%CALCULATESIMILARITIES_SPARSITY Precalculates the similarity and sparsity
% matrices, their (signless) graph Laplacians, and the y'*B*y terms for
% every sequence, so they are not recomputed during training

K = numel(similarityFNs);
K2 = numel(sparsityFNs);

% calculate similarity measures for each of the sequences
Similarities = cell(n_sequences, 1);
PrecalcQ2s = cell(n_sequences,1);
PrecalcQ2sFlat = cell(n_sequences,1);

PrecalcYqDs = zeros(n_sequences, K + K2);

if(iscell(x))
    for q = 1 : n_sequences

        xq = x{q};

        n = size(xq, 1);
        Similarities{q} = zeros([n, n, K+K2]);

        PrecalcQ2s{q} = cell(K+K2,1);

        PrecalcQ2sFlat{q} = zeros((n*(n+1))/2,K+K2);

        % go over all of the similarity metrics and construct the
        % similarity matrices

        if(nargin > 4)
            yq = y{q};
        end

        for k=1:K
            Similarities{q}(:,:,k) = similarityFNs{k}(xq);
            S = Similarities{q}(:,:,k);
            D = diag(sum(S));
            % PrecalcQ2s{q}(:,:,k) = D - S;
            PrecalcQ2s{q}{k} = D - S;
            B = D - S;
            % PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
            PrecalcQ2sFlat{q}(:,k) = B(logical(tril(ones(size(S)))));
            if(nargin > 4)
                PrecalcYqDs(q,k) = -yq'*B*yq;
            end
        end
        for k=1:K2
            Similarities{q}(:,:,K+k) = sparsityFNs{k}(xq);
            S = Similarities{q}(:,:,K+k);
            D = diag(sum(S));
            % PrecalcQ2s{q}(:,:,k) = D - S;
            PrecalcQ2s{q}{K+k} = D + S;
            B = D + S;
            % PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
            PrecalcQ2sFlat{q}(:,K+k) = B(logical(tril(ones(size(S)))));
            if(nargin > 4)
                PrecalcYqDs(q,K+k) = -yq'*B*yq;
            end
        end
    end
elseif(~const)
    sample_length = size(x,2)/n_sequences;

    similarities = cell(K, 1);
    sparsities = cell(K2, 1);

    for q = 1 : n_sequences

        beg_ind = (q-1)*sample_length + 1;
        end_ind = q*sample_length;

        % don't take the bias term
        xq = x(2:end, beg_ind:end_ind);

        Similarities{q} = zeros([sample_length, sample_length, K+K2]);

        PrecalcQ2s{q} = cell(K+K2,1);

        PrecalcQ2sFlat{q} = zeros((sample_length*(sample_length+1))/2,K+K2);

        % go over all of the similarity metrics and construct the
        % similarity matrices

        if(nargin > 4)
            yq = y(:,q);
        end

        for k=1:K
            if(q==1)
                similarities{k} = similarityFNs{k}(xq);
            end
            Similarities{q}(:,:,k) = similarities{k};
            S = Similarities{q}(:,:,k);
            D = diag(sum(S));
            % PrecalcQ2s{q}(:,:,k) = D - S;
            PrecalcQ2s{q}{k} = D - S;
            B = D - S;
            % PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
            PrecalcQ2sFlat{q}(:,k) = B(logical(tril(ones(size(S)))));
            if(nargin > 4)
                PrecalcYqDs(q,k) = -yq'*B*yq;
            end
        end
        for k=1:K2
            % this is constant so don't need to recalc
            if(q==1)
                sparsities{k} = sparsityFNs{k}(xq);
            end

            Similarities{q}(:,:,K+k) = sparsities{k};
            S = Similarities{q}(:,:,K+k);
            D = diag(sum(S));
            % PrecalcQ2s{q}(:,:,k) = D - S;
            PrecalcQ2s{q}{K+k} = D + S;
            B = D + S;
            % PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
            PrecalcQ2sFlat{q}(:,K+k) = B(logical(tril(ones(size(S)))));
            if(nargin > 4)
                PrecalcYqDs(q,K+k) = -yq'*B*yq;
            end
        end

    end
else
    sample_length = size(x,2)/n_sequences;

    similarities = cell(K, 1);
    sparsities = cell(K2, 1);

    PrecalcQ2s = {cell(K+K2,1)};
    PrecalcQ2sFlat = {zeros((sample_length*(sample_length+1))/2,K+K2)};
    Similarities = {zeros([sample_length, sample_length, K+K2])};

    beg_ind = 1;
    end_ind = sample_length;

    % don't take the bias term
    xq = x(2:end, beg_ind:end_ind);

    % go over all of the similarity metrics and construct the
    % similarity matrices
    for k=1:K
        similarities{k} = similarityFNs{k}(xq);

        Similarities{1}(:,:,k) = similarities{k};
        S = Similarities{1}(:,:,k);
        D = diag(sum(S));
        PrecalcQ2s{1}{k} = D - S;
        B = D - S;
        % flatten the symmetric matrix to save space
        PrecalcQ2sFlat{1}(:,k) = B(logical(tril(ones(size(S)))));
        if(nargin > 4)
            PrecalcYqDs(:,k) = diag(-y'*B*y);
        end
    end
    for k=1:K2
        % this is constant so don't need to recalc
        sparsities{k} = sparsityFNs{k}(xq);

        Similarities{1}(:,:,K+k) = sparsities{k};
        S = Similarities{1}(:,:,K+k);
        D = diag(sum(S));
        % PrecalcQ2s{q}(:,:,k) = D - S;
        PrecalcQ2s{1}{K+k} = D + S;
        B = D + S;
        % PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
        PrecalcQ2sFlat{1}(:,K+k) = B(logical(tril(ones(size(S)))));
        if(nargin > 4)
            PrecalcYqDs(:,K+k) = diag(-y'*B*y);
        end
    end
end
end
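The sparsity (inhibition) terms use D + S, the signless Laplacian, rather than D - S; both are positive semidefinite, which is what keeps the resulting precision matrix valid. A quick numerical check of that claim on a toy adjacency matrix (illustrative only):

% similarity Laplacian D - S and signless Laplacian D + S (used for the
% sparsity terms) should both be free of negative eigenvalues
S = [0 1 1;
     1 0 1;
     1 1 0];
D = diag(sum(S));
assert(all(eig(D - S) > -1e-10));
assert(all(eig(D + S) > -1e-10));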
54
model_training/CCNF/CCRF/lib/CalculateYqDs.m
Normal file
@@ -0,0 +1,54 @@
function [ PrecalcYqDs ] = CalculateYqDs( n_sequences, x, similarityFNs, sparsityFNs, y)
%CALCULATEYQDS Precalculates the -y'*B*y terms for every sequence, where B
% is the (signless) graph Laplacian of each similarity or sparsity matrix

K = numel(similarityFNs);
K2 = numel(sparsityFNs);

PrecalcYqDs = zeros(n_sequences, K + K2);

sample_length = size(y,1);

similarities = cell(K, 1);
sparsities = cell(K2, 1);

Similarities = zeros([sample_length, sample_length, K+K2]);

Bs = zeros([sample_length, sample_length, K+K2]);

for k=1:K
    similarities{k} = similarityFNs{k}(x);
    Similarities(:,:,k) = similarities{k};
    S = Similarities(:,:,k);
    D = diag(sum(S));
    Bs(:,:,k) = D - S;
end

for k=1:K2
    % this is constant so don't need to recalc
    sparsities{k} = sparsityFNs{k}(x);

    Similarities(:,:,K+k) = sparsities{k};
    S = Similarities(:,:,K+k);
    D = diag(sum(S));
    Bs(:,:,K+k) = D + S;
end

for q = 1 : n_sequences

    % go over all of the similarity metrics and compute the quadratic terms
    yq = y(:,q);

    for k=1:K+K2
        PrecalcYqDs(q,k) = -yq'*Bs(:,:,k)*yq;
    end

end
end
48
model_training/CCNF/CCRF/lib/LogLikelihoodCCRF.m
Normal file
@@ -0,0 +1,48 @@
function logL = LogLikelihoodCCRF(y_coll, x_coll, alphas, betas,...
    lambda_a, lambda_b, PrecalcBsFlat,...
    SigmaInvs, ChDecomps, Sigmas)
% Calculating the log likelihood of the CCRF with multiple alphas and betas

Q = numel(y_coll);
logL = 0;
for q=1:Q

    yq = y_coll{q};
    xq = x_coll{q};

    n = size(xq, 1);

    b = CalcbCCRF(alphas, xq);

    % constructing the sigma inverse (reuse the precomputed
    % decompositions when all of them are provided)
    if(nargin < 10)
        [SigmaInv] = CalcSigmaCCRFflat(alphas, betas, n, PrecalcBsFlat{q});
        L = chol(SigmaInv);
        mu = SigmaInv \ b;
    else
        SigmaInv = SigmaInvs{q};
        L = ChDecomps{q};
        Sigma = Sigmas{q};
        mu = Sigma * b;
    end

    % normalisation = 1/((2*pi)^(n/2)*sqrt(det(Sigma)));
    % Removing the division by pi, as it is constant
    % normalisation = 1/(sqrt(det(Sigma)));
    % flipping around the determinant of SigmaInv, as det(inv(Sigma)) = inv(det(Sigma))
    % normalisation = log(sqrt(det(SigmaInv)));

    % normalisation 2 using the Cholesky decomposition
    normalisation2 = sum(log(diag(L))); % no times 2 here, as we calculate the square root of the determinant

    % probq = normalisation * exp(-0.5 * (y - mu)'*SigmaInv*(y-mu));
    % applying a logarithm to this leads to
    % logLq = log(normalisation) + (-0.5 * (yq - mu)'*SigmaInv*(yq-mu));
    logLq = normalisation2 + (-0.5 * (yq - mu)'*SigmaInv*(yq-mu));

    logL = logL + logLq;

end

% add regularisation term
logL = logL - lambda_b * (betas'*betas)/2 - lambda_a * (alphas'*alphas)/2;
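The normalisation term relies on the identity log det(SigmaInv) = 2*sum(log(diag(chol(SigmaInv)))); since the likelihood only needs the square root of the determinant, the factor of two drops out, which is what the "no times 2" comment refers to. A sketch verifying the identity on a random positive definite matrix (illustrative, not part of the library):

rng(0);
A = randn(5);
SigmaInv = A*A' + 5*eye(5);     % random symmetric positive definite matrix
L = chol(SigmaInv);             % upper triangular, SigmaInv = L'*L
% log of sqrt(det(SigmaInv)) computed directly and via the Cholesky factor
direct   = 0.5 * log(det(SigmaInv));
via_chol = sum(log(diag(L)));
assert(abs(direct - via_chol) < 1e-10);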
83
model_training/CCNF/CCRF/lib/evaluateCCRFmodel.m
Normal file
@@ -0,0 +1,83 @@
function [ correlations, rms, meanCorr, meanRMS, longCorr, longRMS, predictions, gt ] = evaluateCCRFmodel( alphas, betas, x, xOffsets, y, similarityFNs, scaling, verbose, PrecalcBsFlat)
%EVALUATECCRFMODEL Evaluates a trained CCRF model on a set of sequences,
% reporting per-sequence and concatenated correlations and RMS errors

num_x_plots = 8;
num_y_plots = 10;

total_plots = num_x_plots * num_y_plots;

nExamples = numel(x);

if(nargin < 9)
    [ ~, ~, PrecalcBsFlat, ~ ] = CalculateSimilarities( nExamples, x, similarityFNs);
end

correlations = zeros(nExamples, 1);
rms = zeros(nExamples, 1);

% concatenated data for an alternative correlation
y_predConcat = [];
y_trueConcat = [];

for q=1:nExamples

    X = x{q};

    nFrames = size(X,1);

    PrecalcBflat = PrecalcBsFlat{q};

    SigmaInv = CalcSigmaCCRFflat(alphas, betas, nFrames, PrecalcBflat);
    b = CalcbCCRF(alphas, x{q});
    y_est = SigmaInv \ b;

    y_est = y_est * scaling + xOffsets(q);

    R = corrcoef(y_est, y{q});
    correlations(q) = R(1,2);

    rms(q) = sqrt( (1/nFrames) * sum((y_est - y{q}).^2) );

    y_predConcat = cat(1, y_predConcat, y_est);
    y_trueConcat = cat(1, y_trueConcat, y{q});

    if(verbose)

        if(mod(q,total_plots) == 1)
            figure;
            remainingPlots = nExamples - q;
            if(remainingPlots < total_plots)
                num_y_plots = ceil(remainingPlots / num_x_plots);
            end
        end

        subplot(num_y_plots,num_x_plots,mod(q-1,total_plots)+1);
        t = 1:nFrames;
        plot(t,y{q},'g',t,y_est,'b');
        title(sprintf('C %.2f, R %.2f', correlations(q), rms(q)));
        set(gca, 'XTick', [], 'YTick', []);
        % legend('y_{true}','y_{ccrf}');

    end

end

meanCorr = mean(correlations);
meanRMS = mean(rms);
longCorr = corr(y_predConcat, y_trueConcat).^2;
longRMS = sqrt( (1/numel(y_predConcat)) * sum((y_predConcat - y_trueConcat).^2) );

predictions = y_predConcat;
gt = y_trueConcat;

if(verbose)
    figure
    plot(1:numel(y_trueConcat),y_trueConcat,'g',1:numel(y_trueConcat),y_predConcat,'b');
    title(sprintf('C %.2f, R %.2f', longCorr, longRMS));
    set(gca, 'XTick', [], 'YTick', []);
end

end
28
model_training/CCNF/CCRF/lib/getScaling.m
Normal file
@@ -0,0 +1,28 @@
function [ scaling ] = getScaling( alphas, betas, x, y, masks, PrecalcQ2s, useIndicator)
%GETSCALING Estimates the factor that scales raw CCRF predictions to the
% range of the ground truth, averaged over all sequences

nExamples = numel(x);

scalings = zeros(1,nExamples);

for q=1:nExamples

    mask = masks{q};

    PrecalcQ2 = PrecalcQ2s{q};
    SigmaInv = CalcSigmaCCRF(alphas, betas, PrecalcQ2, mask, useIndicator);

    b = CalcbCCRF(alphas, x{q}, mask, useIndicator);
    y_est = SigmaInv \ b;

    sc = std(y{q}) / std(y_est);
    scalings(q) = sc;
end

scaling = mean(scalings);

end
30
model_training/CCNF/CCRF/lib/getScaling2.m
Normal file
@@ -0,0 +1,30 @@
function [ scaling ] = getScaling2( alphas, betas, x, y, PrecalcBs)
%GETSCALING2 Estimates the prediction-to-ground-truth scaling factor from
% all sequences concatenated and mean-centred, rather than per sequence

nExamples = numel(x);

cat_y = [];
cat_y_pred = [];

for q=1:nExamples

    PrecalcB = PrecalcBs{q};
    SigmaInv = CalcSigmaCCRF(alphas, betas, PrecalcB);

    b = CalcbCCRF(alphas, x{q});
    y_est = SigmaInv \ b;

    cat_y = cat(1, cat_y, y{q} - mean(y{q}));
    % cat_y = cat(1, cat_y, y{q});
    cat_y_pred = cat(1, cat_y_pred, y_est);

end

% scaling = (max(cat_y) - min(cat_y)) / (max(cat_y_pred) - min(cat_y_pred));
scaling = std(cat_y) / std(cat_y_pred);

end
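Both scaling estimators match the spread of the raw CCRF prediction to the spread of the ground truth; getScaling2 does it once over the concatenated, mean-centred data rather than per sequence. A toy sketch of the underlying idea (hypothetical values, not taken from the library):

y_true = [1; 3; 5; 7];          % ground truth
y_pred = [0.1; 0.3; 0.5; 0.7];  % unscaled prediction with the right shape
scaling = std(y_true) / std(y_pred);   % 10 for these values
y_scaled = y_pred * scaling;
assert(abs(std(y_scaled) - std(y_true)) < 1e-10);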
92
model_training/CCNF/CCRF/lib/gradientCCRF.m
Normal file
@@ -0,0 +1,92 @@
function [ logGradientAlphas, logGradientBetas, SigmaInv, ChDecomp ] = gradientCCRF( alphas, betas, lambda_a, lambda_b, precalcQ2withoutBeta, xq, yq, mask, precalcYQ, useIndicator, PrecalcQ2Flat)
%GRADIENTCCRF Computes the gradients of the regularised CCRF log-likelihood
% with respect to the alphas and betas for a single sequence

% Calculate the Sigma inverse now
% [SigmaInv2] = CalcSigmaCCRF(alphas, betas, precalcQ2withoutBeta, mask);

% This is an optimised version, as it does not use the whole matrix but
% only the lower triangular part, due to symmetry
numElemsInSeq = size(precalcQ2withoutBeta{1}, 1);
[SigmaInv] = CalcSigmaCCRFflat(alphas, betas, numElemsInSeq, PrecalcQ2Flat, mask, useIndicator);

% Get the actual Sigma from our SigmaInv

% Sigma = inv(SigmaInv);
% Below is an optimised version of the above using the Cholesky decomposition,
% which decomposes a matrix into an upper triangular (R) and its
% conjugate transpose R'; A = R'*R for real numbers, thus
% inv(A) = inv(R)inv(R')
ChDecomp=chol(SigmaInv);
I=eye(size(SigmaInv));

% Rinv = (R\I);
% Sigma = Rinv*Rinv';
% This is a very slightly faster version of the above
Sigma=ChDecomp\(ChDecomp'\I);

b = CalcbCCRF(alphas, xq, mask, useIndicator);

% mu = SigmaInv \ b = Sigma * b;
% as we've calculated Sigma already, this is equivalent to the above
mu = Sigma * b;

logGradientAlphas = zeros(size(alphas));
logGradientBetas = zeros(size(betas));

K1 = numel(alphas);
K2 = numel(betas);

% calculating the derivative of L with respect to alpha_k
for k = 1:K1

    if(useIndicator)
        dQ1da = diag(mask(:,k));
        dbda = xq(:,k).*mask(:,k);
        gaussGradient = -yq'*dQ1da*yq +2*yq'*dbda -2 * dbda' * mu + mu'*dQ1da*mu;
        zGradient = Sigma(:)'*dQ1da(:);
    else
        % if we don't use the masks here's a speedup
        gaussGradient = -yq'*yq +2*yq'*xq(:,k) -2 * xq(:,k)' * mu + sum(mu.^2);

        % simplification as trace(Sigma * I) = trace(Sigma)
        zGradient = trace(Sigma);
    end

    % add the Z derivative now
    dLda = zGradient + gaussGradient;

    % add regularisation
    dLda = dLda - lambda_a * alphas(k);

    logGradientAlphas(k) = alphas(k) * dLda;

end

% This was done for gradient checking
% [alphasG, betaG] = gradientAnalytical(nFrames, S, alphas, beta, xq, yq, mask);

% calculating the derivative of log(L) with respect to the betas
for k=1:K2

    % Bs = Bs(:,:,k);
    % dSdb = q2./betas(k); we precalculate this, as it does not change
    % over the course of optimisation (dSdb - dSigma/dbeta)
    dSdb = precalcQ2withoutBeta{k};

    % -yq'*dSdb*yq can be precalculated, as it doesn't change through
    % iterations (precalcYQ holds these values)
    % gaussGradient = -yq'*dSdb*yq + mu'*dSdb*mu;
    % this does the above line
    gaussGradient = precalcYQ(k) + mu'*dSdb*mu;

    % zGradient = trace(Sigma*dSdb);
    zGradient = Sigma(:)'*dSdb(:); % equivalent to, but faster than, the line above
    dLdb = gaussGradient + zGradient;

    % add regularisation term
    dLdb = dLdb - lambda_b * betas(k);

    logGradientBetas(k) = betas(k) * dLdb;
end
end
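The zGradient line exploits the identity trace(A*B) = A(:)'*B(:), which holds when B is symmetric (both Sigma and dSdb are), and avoids forming the full matrix product. A quick check on random symmetric matrices (illustrative only):

rng(1);
A = randn(6); A = A + A';       % symmetric
B = randn(6); B = B + B';       % symmetric
assert(abs(trace(A*B) - A(:)'*B(:)) < 1e-10);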
39
model_training/CCNF/CCRF/lib/gradientCCRFFull.m
Normal file
@@ -0,0 +1,39 @@
function [ gradientParams, SigmaInvs, CholDecomps, Sigmas ] = gradientCCRFFull( params, lambda_a, lambda_b, PrecalcBs, x, y, Precalc_yBys, PrecalcBsFlat)
%GRADIENTCCRFFULL Computes the regularised gradient over all sequences by
% summing the per-sequence gradients from gradientCCRF_withoutReg

nExamples = numel(x);

numBetas = size(PrecalcBsFlat{1},2);
numAlphas = numel(params) - numBetas;

alphasInit = params(1:numAlphas);
betasInit = params(numAlphas+1:end);
gradientParams = zeros(size(params));

% These might be used to calculate the LogLikelihood, don't want to
% recompute them
SigmaInvs = cell(nExamples, 1);
CholDecomps = cell(nExamples, 1);
Sigmas = cell(nExamples, 1);
gradients = zeros(nExamples, numel(params));
for q = 1 : nExamples

    yq = y{q};
    xq = x{q};

    PrecalcB = PrecalcBs{q};
    PrecalcB_flat = PrecalcBsFlat{q};

    [ logGradientsAlphas, logGradientsBetas, SigmaInv, CholDecomp, Sigma ] = gradientCCRF_withoutReg(alphasInit, betasInit, PrecalcB, xq, yq, Precalc_yBys(q, :), PrecalcB_flat);
    SigmaInvs{q} = SigmaInv;
    CholDecomps{q} = CholDecomp;
    Sigmas{q} = Sigma;

    gradients(q,:) = [logGradientsAlphas; logGradientsBetas];
end
gradientParams = sum(gradients,1)';
regAlpha = alphasInit * lambda_a;
regBeta = betasInit * lambda_b;
gradientParams = gradientParams - [regAlpha; regBeta];
end
76
model_training/CCNF/CCRF/lib/gradientCCRF_withoutReg.m
Normal file
@@ -0,0 +1,76 @@
function [ logGradientAlphas, logGradientBetas, SigmaInv, CholDecomp, Sigma ] = gradientCCRF_withoutReg( alphas, betas, precalcQ2withoutBeta, xq, yq, Precalc_yBy, PrecalcB_flat)
%GRADIENTCCRF_WITHOUTREG Computes the unregularised CCRF gradients for a
% single sequence; regularisation is added by the caller

% Calculate the Sigma inverse now

% This is an optimised version, as it does not use the whole matrix but
% only the lower triangular part, due to symmetry
n = size(xq, 1);
[SigmaInv] = CalcSigmaCCRFflat(alphas, betas, n, PrecalcB_flat);

% Get the actual Sigma from our SigmaInv

% Sigma = inv(SigmaInv);
% Below is an optimised version of the above using the Cholesky decomposition,
% which decomposes a matrix into an upper triangular (R) and its
% conjugate transpose R'; A = R'*R for real numbers, thus
% inv(A) = inv(R)inv(R')

CholDecomp=chol(SigmaInv);
I=eye(size(SigmaInv));

% This is a way of calculating it faster than just inv(SigmaInv)
Sigma=CholDecomp\(CholDecomp'\I);
b = CalcbCCRF(alphas, xq);

% mu = SigmaInv \ b = Sigma * b;
% as we've calculated Sigma already, this is equivalent to the above
mu = Sigma * b;

logGradientAlphas = zeros(size(alphas));
logGradientBetas = zeros(size(betas));

K1 = numel(alphas);
K2 = numel(betas);

% calculating the derivative of L with respect to alpha_k
for k = 1:K1

    gaussGradient = -yq'*yq +2*yq'*xq(:,k) -2 * xq(:,k)' * mu + sum(mu.^2);

    % simplification as trace(Sigma * I) = trace(Sigma)
    zGradient = trace(Sigma);

    % add the Z (partition function) derivative now
    dLda = zGradient + gaussGradient;

    logGradientAlphas(k) = dLda;

end

% This was done for gradient checking
% [alphasG, betaG] = gradientAnalytical(nFrames, S, alphas, beta, xq, yq, mask);

% calculating the derivative of log(L) with respect to the betas
for k=1:K2

    % Bs = Bs(:,:,k);
    % dSdb = q2./betas(k); we precalculate this, as it does not change
    % over the course of optimisation (dSdb - dSigma/dbeta)
    dSdb = precalcQ2withoutBeta{k};

    % -yq'*dSdb*yq can be precalculated, as it doesn't change through
    % iterations (Precalc_yBy holds these values)
    % gaussGradient = -yq'*dSdb*yq + mu'*dSdb*mu;
    % this does the above line
    gaussGradient = Precalc_yBy(k) + mu'*dSdb*mu;

    % zGradient = trace(Sigma*dSdb);
    zGradient = Sigma(:)'*dSdb(:); % equivalent to, but faster than, the line above

    dLdb = gaussGradient + zGradient;

    logGradientBetas(k) = dLdb;
end
end
35
model_training/CCNF/CCRF/lib/randInitializeWeights.m
Normal file
@@ -0,0 +1,35 @@
function W = randInitializeWeights(L_in, L_out)
%RANDINITIALIZEWEIGHTS Randomly initialize the weights of a layer with L_in
%incoming connections and L_out outgoing connections
%   W = RANDINITIALIZEWEIGHTS(L_in, L_out) randomly initializes the weights
%   of a layer with L_in incoming connections and L_out outgoing
%   connections.
%
%   Note that W is a matrix of size(L_out, 1 + L_in), as the first
%   column of W handles the "bias" terms. The random initialisation
%   breaks the symmetry between the hidden units during training.

% epsilon_init = 0.12;
epsilon_init = 1/sqrt(L_in);
W = rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init;

end
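The 1/sqrt(L_in) scaling keeps the initial pre-activations at a similar magnitude regardless of the input size. A usage sketch, assuming the function above is on the MATLAB path (the sizes are illustrative):

rng(0);
W = randInitializeWeights(121, 7);     % e.g. 7 gates over an 11x11 patch, plus bias
assert(isequal(size(W), [7, 122]));    % one extra column for the bias term
assert(max(abs(W(:))) <= 1/sqrt(121)); % entries stay inside [-epsilon, epsilon]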
8
model_training/CCNF/CCRF/lib/similarityEuclidean.m
Normal file
@@ -0,0 +1,8 @@
function SimilarityMatrix = similarityEuclidean(x)
% spatial distance measure
Distances = sqrt(pdist(x)+3e-6).^-1; % 0.05 best so far
SimilarityMatrix = squareform(Distances) + eye(size(x, 1));
end
25
model_training/CCNF/CCRF/lib/similarityGauss.m
Normal file
@@ -0,0 +1,25 @@
function SimilarityMatrix = similarityGauss(x, sigma, range, mask)
% spatial distance measure, based on exponential decay, creates a matrix of
% similarities

% get the euclidean distance for each pair
if(numel(range) > 0)
    Distances = exp(-pdist(x(:,range))/sigma); % 0.05 best so far
else
    Distances = exp(-pdist(x)/sigma); % 0.05 best so far
end
SimilarityMatrix = squareform(Distances);

% invalidate the illegal values from the mask (if at least one element is
% not present in the mask set similarity to 0)
if(numel(mask) ~= 0)
    invalidInds = sum(mask(:,range),2) < numel(range);

    SimilarityMatrix(invalidInds,:) = 0;
    SimilarityMatrix(:,invalidInds) = 0;
end

SimilarityMatrix = SimilarityMatrix + eye(size(x, 1));

end
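similarityGauss turns pairwise Euclidean distances into similarities with exp(-d/sigma) and then adds the identity, so each frame is maximally similar to itself. A toy call, assuming the function above is on the path (the arguments are illustrative):

x = [0 0; 1 0; 4 0];                 % three 2-D samples
S = similarityGauss(x, 1.0, [], []); % no feature range, no mask
assert(all(diag(S) == 1));           % self-similarity is 1
assert(S(1,2) > S(1,3));             % closer pairs are more similar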
25
model_training/CCNF/CCRF/lib/similarityNeighbor.m
Normal file
@@ -0,0 +1,25 @@
function [ SimilarityMatrix ] = similarityNeighbor( x, n, range)
%SIMILARITYNEIGHBOR Builds a similarity matrix that connects each frame to
% its n-th neighbour on either side

sz = size(x,1);
SimilarityMatrix = eye(sz);

i = 1:sz-n;
SimilarityMatrix(sub2ind([sz, sz], i+n,i)) = 1;
SimilarityMatrix(sub2ind([sz, sz], i,i+n)) = 1;

% invalidate the illegal values from the mask (if at least one element is
% not present in the mask set similarity to 0)
% if(numel(mask)~=0)
%     invalidInds = sum(mask(:,range),2) < numel(range);
%
%     SimilarityMatrix(invalidInds,:) = 0;
%     SimilarityMatrix(:,invalidInds) = 0;
% end

DiagMask = ones(size(x, 1)) - eye(size(x,1));
SimilarityMatrix = SimilarityMatrix .* DiagMask;
SimilarityMatrix = SimilarityMatrix + eye(size(x, 1));

end
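similarityNeighbor links each frame to its n-th neighbour on either side, producing a banded matrix with ones on the main diagonal; for n = 1 the result is tridiagonal. A small check, assuming the function above is on the path (the inputs are illustrative):

x = rand(5, 3);                      % 5 frames; the feature values are unused here
S = similarityNeighbor(x, 1, []);
expected = eye(5) + diag(ones(4,1), 1) + diag(ones(4,1), -1);
assert(isequal(S, expected));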
@@ -0,0 +1,78 @@
function [ responses ] = CCNF_ncc_response( patches, patch_experts, normalisation_options, window_size, patch_length)
%CCNF_NCC_RESPONSE Computing a patch response from a CCNF patch expert
% Using convolution, for testing purposes and not for actual speed

SigmaInv = patch_experts.SigmaInv;

patchSize = normalisation_options.patchSize;

if(~iscell(patches))
    patches = {patches};
end

num_modalities = numel(patches);

responses = zeros(size(patches{1},1), patch_length);

% prepare the patches by normalising them to zscore (if used)
if(normalisation_options.zscore)
    for i=1:num_modalities
        patches{i} = zscore(patches{i});
    end
end

for i = 1:size(patches{1},1)

    norm_cross_corr = normalisation_options.useNormalisedCrossCorr == 1;

    b = zeros(patch_length,1);

    hl_per_modality = size(patch_experts.thetas,1);

    for p=1:num_modalities
        smallRegionVec = patches{p}(i,:);
        smallRegion = reshape(smallRegionVec, window_size(1), window_size(2));

        for hls = 1:hl_per_modality

            % because the normalised cross correlation calculates the
            % responses from a normalised template and a normalised image,
            % normalise the thetas here and then apply the normalisation to
            % the response

            w = patch_experts.thetas(hls, 2:end, p);
            norm_w = norm(w);
            w = w/norm(w);
            w = reshape(w, patchSize);

            response = -norm_w * Cross_corr_resp(smallRegion, w, norm_cross_corr, patchSize) - patch_experts.thetas(hls,1,p);

            % here we include the bias term as well, as it wasn't added
            % during the response calculation
            h1 = 1./(1 + exp(response(:)));
            b = b + (2 * patch_experts.alphas((p-1)*hl_per_modality + hls) * h1);

        end
    end
    response = SigmaInv \ b;

    responses(i,:) = response(:);

end
responses = responses';
responses = responses(:);
end

function response = Cross_corr_resp(region, patchExpert, normalise_x_corr,patchSize)

if(normalise_x_corr)
    [response] = normxcorr2(patchExpert, region);
    response = response(patchSize(1):end-patchSize(1)+1,patchSize(2):end-patchSize(2)+1);
else
    % this assumes that the patch is already normed, so just use
    % cross-correlation
    template = rot90(patchExpert,2);
    response = conv2(region, template, 'valid');

end
end
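When normalised cross-correlation is disabled, the helper emulates plain cross-correlation by rotating the template 180 degrees and convolving. This sketch checks that trick against filter2, MATLAB's direct correlation (toy data, illustrative only):

rng(0);
region = rand(19);
w = rand(11);
via_conv = conv2(region, rot90(w, 2), 'valid');  % the trick used above
via_corr = filter2(w, region, 'valid');          % direct cross-correlation
assert(max(abs(via_conv(:) - via_corr(:))) < 1e-12);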
@@ -0,0 +1,77 @@
function [alphas, betas, thetas, similarities, sparsities] = Create_CCNF_Regressor(samples, labels, patch_length, similarity_types, sparsity_types, normalisation_options)
%CREATE_CCNF_REGRESSOR Creates a CCNF (LNF) patch expert given labelled
%training samples

% Add the CCNF library
addpath('../../CCNF/lib');

%% Preparing the similarity and sparsity functions

% this will create a family of similarity neighbour node connections
similarities = {};

for i=1:size(similarity_types, 1)
    type = similarity_types{i};
    neighFn = @(x) similarity_neighbor_grid(x, sqrt(patch_length), type);
    similarities = [similarities; {neighFn}];
end

sparsities = {};

% this will create a family of sparsity (inhibition) neighbour node
% connections
for i=1:size(sparsity_types, 1)

    spFn = @(x) sparsity_grid(x, sqrt(patch_length), sparsity_types(i,1), sparsity_types(i,2));
    sparsities = [sparsities; {spFn}];
end

%% Default training hyper-parameters
thresholdX = 1e-8;
thresholdFn = 1e-4;
max_iter = 200;

input_layer_size = size(samples, 1)-1;

% Some rule-of-thumb hyper-parameters, depending on whether similarities are defined
if(numel(similarities) == 0)
    best_lambda_a = 10000;
    best_lambda_b = 0;
    best_lambda_th = 0.1;
    best_num_layer = 5;
else
    best_lambda_a = 100;
    best_lambda_b = 1000;
    best_lambda_th = 0.1;
    best_num_layer = 5;
end

% Checking if hyper-parameters are specified to be overridden
if(isfield(normalisation_options, 'lambda_a'))
    best_lambda_a = normalisation_options.lambda_a;
end

if(isfield(normalisation_options, 'lambda_b'))
    best_lambda_b = normalisation_options.lambda_b;
end

if(isfield(normalisation_options, 'lambda_th'))
    best_lambda_th = normalisation_options.lambda_th;
end

if(isfield(normalisation_options, 'num_layers'))
    best_num_layer = normalisation_options.num_layers;
end

% Initial parameter values
alphas = 1 * ones(best_num_layer,1);
betas = 1 * ones(numel(similarities) + numel(sparsities), 1);
initial_Theta = randInitializeWeights(input_layer_size, best_num_layer);

num_seqs = size(samples, 2)/patch_length;
labels = reshape(labels, patch_length, num_seqs);

% Actual training
[alphas, betas, thetas] = CCNF_training_bfgs(thresholdX, thresholdFn, samples, labels, alphas, betas, initial_Theta, best_lambda_a, best_lambda_b, best_lambda_th, similarities, sparsities, 'const', true, 'reinit', true, 'num_reinit', 20, 'num_seqs', num_seqs, 'lbfgs', 'max_iter', max_iter);

end
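A hypothetical call sketch, assuming the model_training/CCNF tree is on the MATLAB path and using random data, so the trained expert is meaningless and only the calling convention is shown (an 11x11 support over a 9x9 response region gives patch_length = 81; all values here are illustrative):

patch_length = 81;                 % 9x9 response region
n_seqs = 4;                        % illustrative; real training uses far more
% one bias row on top of 121 pixel features, one column per response pixel
samples = [ones(1, n_seqs*patch_length); rand(121, n_seqs*patch_length)];
labels  = rand(n_seqs*patch_length, 1);
opts = struct('lambda_a', 100, 'lambda_b', 1000, 'lambda_th', 0.1, 'num_layers', 5);
[alphas, betas, thetas] = Create_CCNF_Regressor(samples, labels, patch_length, ...
    {[1,2]}, [4,6], opts);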
@@ -0,0 +1,13 @@
function [ meanSquaredError, correlation, predictions ] = EvaluatePatchExpert( samples, labels, alphas, betas, thetas, similarities, sparsities, normalisationOptions, region_length )
%EVALUATEPATCHEXPERT Evaluates a trained CCNF patch expert on held-out
% samples, returning the mean squared error, correlation and predictions

num_seqs = size(samples, 2)/region_length;

% arrange the labels into one column per response region
labels = reshape(labels, region_length, num_seqs);

[~,~,~, ~, correlation, meanSquaredError, predictions] = evaluate_CCNF_model(alphas, betas, thetas, samples, labels, similarities, sparsities, 0, 1, false);

end
@@ -0,0 +1,132 @@
function [samples, labels, samples_unnormed, imgs_used] = ExtractTrainingSamples(examples, landmarkLoc, img_names, sigma, numSamples, landmark, normalisation_options)

%%
% for an area of interest of 19x19 and a patch support region of 11x11, we
% would have 9x9=81 samples (9 is the single_input_size, 11 is
% patch_expert_support_size, 19x19 is normalisation_size, 9 would be the
% normalisation_side_size)
evaluation_size = normalisation_options.normalisationRegion;
patch_expert_support_size = normalisation_options.patchSize;

normalisation_side_size = (evaluation_size - 1)/2;
single_input_size = evaluation_size - patch_expert_support_size + 1;

% Determine the ratio of images to be sampled (most likely not all of them will be)
samples_per_img = (numSamples / (size(examples,1) * (1 + normalisation_options.rate_negative))) / (single_input_size(1)^2);

num_samples = int32(samples_per_img * (1 + normalisation_options.rate_negative) * size(examples,1) * (single_input_size(1)^2));

%% Initialise the samples and labels
samples = zeros(num_samples, patch_expert_support_size(1) * patch_expert_support_size(2));
labels = zeros(num_samples, 1);

%% Initialise the unnormed versions of the images

% This is done in order to assert our use of algorithms for calculating
% the responses, as for training we might use regular ML procedures,
% whereas for fitting normalised cross-correlation or just
% cross-correlation will be used, so keep some unnormed samples
samples_unnormed = zeros(int32(num_samples/300), evaluation_size(1)^2);

img_size = [size(examples,2), size(examples,3)];

% Extract only images of differing face shapes, to get more diverse
% training samples
to_keep = FindDistantLandmarks(landmarkLoc, landmark, round(samples_per_img*size(examples,1)));

inds_all = 1:size(examples,1);
samples_to_use = inds_all(to_keep);

% Keep track of how many samples have been computed already
samples_filled = 1;
samples_unnormed_filled = 1;

%% parse the image names for reporting purposes
imgs_used = img_names(samples_to_use);
for i=1:numel(imgs_used)
    [~,name,ext] = fileparts(imgs_used{i});
    imgs_used{i} = [name, ext];
end
for i=samples_to_use

    % Do rate_negative negatives and a single positive
    for p=1:normalisation_options.rate_negative+1

        % create a gaussian
        corrPoint = landmarkLoc(i,landmark,:);

        % Ignore occluded points
        if(corrPoint(1) == 0)
            break;
        end

        startX = 1 - corrPoint(1);
        startY = 1 - corrPoint(2);

        patchWidth = img_size(2);
        patchHeight = img_size(1);

        [X, Y] = meshgrid(startX:patchWidth + startX-1, startY:patchHeight + startY-1);

        response = exp(-0.5*(X.^2+Y.^2)/(sigma^2));

        % Choose positive or negative sample
        if(p==normalisation_options.rate_negative+1)
            sample_centre = squeeze(corrPoint) + round(1*randn(2,1));
        else
            sample_centre = squeeze(corrPoint) + round(10*randn(2,1));
        end

        sample_centre = round(sample_centre);

        sample_centre(sample_centre <= normalisation_side_size(1)) = normalisation_side_size(1) + 1;
        sample_centre(sample_centre > img_size(1)-normalisation_side_size(1)) = img_size(1) - normalisation_side_size(1) - 1;

        patches = squeeze(examples(i, sample_centre(2) - normalisation_side_size:sample_centre(2) + normalisation_side_size, sample_centre(1) - normalisation_side_size:sample_centre(1) + normalisation_side_size));
        side = (single_input_size - 1)/2;
        responses = response(sample_centre(2) - side(2):sample_centre(2) + side(2), sample_centre(1) - side(1):sample_centre(1) + side(1));

        if(samples_unnormed_filled <= size(samples_unnormed,1))
            % even if the correct size is not initialised, Matlab will
            % sort that out (it would only happen once anyway)
            samples_unnormed(samples_unnormed_filled,:) = patches(:);
            samples_unnormed_filled = samples_unnormed_filled + 1;
        end

        % if we want to normalise each patch individually, do it here

        patch = im2col(patches, patch_expert_support_size, 'sliding')';
        response = im2col(responses, [1,1], 'sliding');

        labels(samples_filled:samples_filled+size(patch,1)-1,:) = response;

        samples(samples_filled:samples_filled+size(patch,1)-1,:) = patch;
        samples_filled = samples_filled + size(patch,1);

    end
end

if(normalisation_options.useNormalisedCrossCorr == 1)

    mean_curr = mean(samples, 2);
    patch_normed = samples - repmat(mean_curr,1, patch_expert_support_size(1)*patch_expert_support_size(2));

    % Normalising the patches using the L2 norm
    scaling = sqrt(sum(patch_normed.^2,2));
    scaling(scaling == 0) = 1;

    patch_normed = patch_normed ./ repmat(scaling, 1, patch_expert_support_size(1)*patch_expert_support_size(2));

    samples = patch_normed;
    clear 'patch_normed';
end

% Only keep the filled samples
samples = samples(1:samples_filled-1,:);
labels = labels(1:samples_filled-1,:);

if((samples_filled-1)/(single_input_size(1)*single_input_size(2)) < size(samples_unnormed,1))
    samples_unnormed = samples_unnormed(1:(samples_filled-1)/(single_input_size(1)*single_input_size(2)),:);
end

end
@@ -0,0 +1,41 @@
function [to_keep] = FindDistantLandmarks(landmarkLoc, landmark_num, num_to_keep)

% First align all of them
a = landmarkLoc(:,:,1);
b = landmarkLoc(:,:,2);

offset_x = mean(a,2);
offset_y = mean(b,2);

landmark_loc_off = cat(3, bsxfun(@plus, a, -offset_x), bsxfun(@plus, b, -offset_y));
fixed_x = landmark_loc_off(:,:,1);
fixed_y = landmark_loc_off(:,:,2);

% Extract the relevant landmarks
fixed_x_l = fixed_x(:,landmark_num);
fixed_y_l = fixed_y(:,landmark_num);

obs = cat(2, fixed_x_l, fixed_y_l);

% Discard landmarks that are very close to each other, so that we only
% keep more diverse images
D = squareform(pdist(obs));

to_keep = true(size(landmarkLoc,1),1);

for i = 1:(size(landmarkLoc,1) - num_to_keep)

    diversity_score = mean(D,2);

    a = min(diversity_score);

    lowest = find(diversity_score == a);
    lowest = lowest(1);

    to_keep(lowest) = 0;

    D(:,~to_keep) = 0;
    D(~to_keep,:) = 200;
end

end
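FindDistantLandmarks greedily drops the sample whose landmark is, on average, closest to all the others until num_to_keep remain; removed rows are poisoned with a large constant so they never attract another removal. The same idea in a compact standalone form (illustrative, not the library function itself):

rng(0);
obs = [randn(8,2); randn(2,2) + 10];  % 8 clustered points plus 2 far outliers
D = squareform(pdist(obs));
keep = true(size(obs,1), 1);
for i = 1:(size(obs,1) - 4)           % keep roughly the 4 most spread-out points
    score = mean(D, 2);
    [~, lowest] = min(score);         % least diverse remaining point
    keep(lowest) = false;
    D(:, ~keep) = 0;                  % removed columns no longer lower the scores
    D(~keep, :) = inf;                % removed rows can never be picked again
end
assert(sum(keep) == 4);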
120
model_training/CCNF/patch_experts/ccnf_training/Parse_settings.m
Normal file
@@ -0,0 +1,120 @@
function [ normalisation_options ] = Parse_settings( sigma, ratio_neg, num_samples, varargin)
%PARSE_SETTINGS Parses the key-value training options in varargin into a
% normalisation_options struct, filling in defaults where not specified

% creating the parameters to use when training colour (intensity) patches
normalisation_options = struct;

% this is what is currently expected (although we could potentially have
% bigger or smaller patches, this should not be bigger than the patch
% available in examples and negExamples)
normalisation_options.patchSize = [11 11];

% The size of the region taken for training around an aligned or
% misaligned landmark
if(sum(strcmp(varargin,'normalisation_size')))
    ind = find(strcmp(varargin,'normalisation_size')) + 1;
    normalisation_options.normalisationRegion = [varargin{ind}, varargin{ind}];
else
    normalisation_options.normalisationRegion = [21 21];
end

% This specifies the data split ratio
normalisation_options.ccnf_ratio = 0.9; % proportion of data used for cross-validating CCNFs
% the rest is used for testing and provides the F1 and accuracy scores

if(any(strcmp(varargin, 'patch_types')))
    ind = find(strcmp(varargin,'patch_types')) + 1;
    normalisation_options.patch_type = varargin{ind};
else
    normalisation_options.patch_type = {'reg'};
end

if(any(strcmp(varargin, 'sparsity_types')))
    ind = find(strcmp(varargin,'sparsity_types')) + 1;
    if(~isempty( varargin{ind}))
        normalisation_options.sparsity = 1;
        normalisation_options.sparsity_types = varargin{ind};
    else
        normalisation_options.sparsity = 0;
        normalisation_options.sparsity_types = [];
    end
else
    normalisation_options.sparsity = 0;
    normalisation_options.sparsity_types = [];
end

if(any(strcmp(varargin, 'lambda_a')))
    ind = find(strcmp(varargin,'lambda_a')) + 1;
    normalisation_options.lambda_a = varargin{ind};
end

if(any(strcmp(varargin, 'lambda_b')))
    ind = find(strcmp(varargin,'lambda_b')) + 1;
    normalisation_options.lambda_b = varargin{ind};
end

if(any(strcmp(varargin, 'lambda_th')))
    ind = find(strcmp(varargin,'lambda_th')) + 1;
    normalisation_options.lambda_th = varargin{ind};
end

if(any(strcmp(varargin, 'num_layers')))
    ind = find(strcmp(varargin,'num_layers')) + 1;
    normalisation_options.num_layers = varargin{ind};
end

if(any(strcmp(varargin, 'num_bins')))
    ind = find(strcmp(varargin,'num_bins')) + 1;
    normalisation_options.num_hog_bins = varargin{ind};
else
    normalisation_options.num_hog_bins = 9;
end

normalisation_options.numSamples = num_samples;

normalisation_options.useZeroMeanPerPatch = 1;
normalisation_options.useNormalisedCrossCorr = 1;
normalisation_options.zscore = 0;

% Should invalid pixels be taken into account when normalising (yes in
% case of depth and no in case of colour)
normalisation_options.ignoreInvalidInMeanStd = 0; % we don't care about invalid pixels at this time (black is valid here) TODO background simulation?
normalisation_options.setIllegalToPost = 0;

if(sum(strcmp(varargin,'use_bu')))
    ind = find(strcmp(varargin,'use_bu')) + 1;
    normalisation_options.bu = varargin{ind};
else
    normalisation_options.bu = 1;
end

if(sum(strcmp(varargin,'use_mpie')))
    ind = find(strcmp(varargin,'use_mpie')) + 1;
    normalisation_options.mpie = varargin{ind};
else
    normalisation_options.mpie = 1;
end

if(sum(strcmp(varargin,'use_wild')))
    ind = find(strcmp(varargin,'use_wild')) + 1;
    normalisation_options.wild = varargin{ind};
else
    normalisation_options.wild = 0;
end

normalisation_options.sigma = sigma;

normalisation_options.rate_negative = ratio_neg;

% the similarities need to be tested separately (1,2,3 and 4) and
% together: all, vs hor/ver and diags, and none of course
if(any(strcmp(varargin, 'similarity_types')))
    ind = find(strcmp(varargin,'similarity_types')) + 1;
    normalisation_options.similarity_types = varargin{ind};
else
    normalisation_options.similarity_types = [];
end

end
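Parse_settings follows the key-value varargin idiom used throughout the repository: scan for the flag name, read the following cell if present, and otherwise fall back to a default. A condensed standalone sketch of just that pattern (the function and field names are illustrative):

function opts = parse_demo(varargin)
% minimal key-value parsing in the style of Parse_settings (illustrative)
if(any(strcmp(varargin, 'lambda_a')))
    ind = find(strcmp(varargin, 'lambda_a')) + 1;
    opts.lambda_a = varargin{ind};
else
    opts.lambda_a = 100;   % default
end
end

% usage:
% opts = parse_demo('lambda_a', 250);   % opts.lambda_a == 250
% opts = parse_demo();                  % opts.lambda_a == 100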
@@ -0,0 +1,38 @@
clear
% define the root name of the database
root = '../data_preparation/prepared_data/';

% which scales we're doing
sigma = 1;
num_samples = 5e5;

scales = [0.25,0.35,0.5];
frontalView = 1;

profileViewInds = [];

version = 'cofw';
ratio_neg = 5;
norm = 1;

data_loc = 'cofw_';
rng(0);

similarities = {};
sparsity = 1;
sparsity_types = [];

lambda_a = 100;
lambda_b = 1000;
lambda_th = 1;
num_layers = 7;

for s=scales

    Train_all_patch_experts(root, frontalView, profileViewInds,...
        s, sigma, version, 'ratio_neg', ratio_neg,...
        'num_samples', num_samples, 'data_loc', data_loc,...
        'normalisation_size', 19, 'similarity_types', similarities, 'sparsity', sparsity,...
        'sparsity_types', sparsity_types, 'lambda_a', lambda_a, 'lambda_b', lambda_b, 'lambda_th', lambda_th, 'num_layers', num_layers);
end
@@ -0,0 +1,38 @@
clear
% define the root name of the database
root = '../data_preparation/prepared_data/';

% which scales we're doing
sigma = 1;
num_samples = 2.5e6;

scales = [0.25,0.35,0.5,1.0];
frontalView = 1;

profileViewInds = [2,3,4];

version = 'general';
ratio_neg = 10;
norm = 1;

data_loc = 'combined_';
rng(0);

similarities = {[1,2]; [3, 4]};
sparsity = 1;
sparsity_types = [4,6];

lambda_a = 200;
lambda_b = 7500;
lambda_th = 1;
num_layers = 7;

for s=scales

    Train_all_patch_experts(root, frontalView, profileViewInds,...
        s, sigma, version, 'ratio_neg', ratio_neg,...
        'num_samples', num_samples, 'data_loc', data_loc,...
        'normalisation_size', 19, 'similarity_types', similarities, 'sparsity', sparsity,...
        'sparsity_types', sparsity_types, 'lambda_a', lambda_a, 'lambda_b', lambda_b, 'lambda_th', lambda_th, 'num_layers', num_layers);
end
@@ -0,0 +1,38 @@
clear
% define the root name of the database
root = '../data_preparation/prepared_data/';

% which scales we're doing
sigma = 1;
num_samples = 5e5;

scales = [0.25,0.35,0.5];
frontalView = 1;

profileViewInds = [2,3,4];

version = 'multi_pie';
ratio_neg = 5;
norm = 1;

data_loc = 'mpie_';
rng(0);

similarities = {[1,2]; [3, 4]};
sparsity = 1;
sparsity_types = [4,6];

lambda_a = 100;
lambda_b = 1000;
lambda_th = 1;
num_layers = 7;

for s=scales

    Train_all_patch_experts(root, frontalView, profileViewInds,...
        s, sigma, version, 'ratio_neg', ratio_neg,...
        'num_samples', num_samples, 'data_loc', data_loc,...
        'normalisation_size', 19, 'similarity_types', similarities, 'sparsity', sparsity,...
        'sparsity_types', sparsity_types, 'lambda_a', lambda_a, 'lambda_b', lambda_b, 'lambda_th', lambda_th, 'num_layers', num_layers);
end
@@ -0,0 +1,38 @@
clear
% define the root name of the database
root = '../data_preparation/prepared_data/';

% which scales we're doing
sigma = 1;
num_samples = 2e6;

scales = [0.25,0.35,0.5,1.0];
frontalView = 1;

profileViewInds = [2];

version = 'wild';
ratio_neg = 5;
norm = 1;

data_loc = 'wild_';
rng(0);

similarities = {[1,2]; [3, 4]};
sparsity = 1;
sparsity_types = [4,6];

lambda_a = 200;
lambda_b = 7500;
lambda_th = 1;
num_layers = 7;

for s=scales

    Train_all_patch_experts(root, frontalView, profileViewInds,...
        s, sigma, version, 'ratio_neg', ratio_neg,...
        'num_samples', num_samples, 'data_loc', data_loc,...
        'normalisation_size', 19, 'similarity_types', similarities, 'sparsity', sparsity,...
        'sparsity_types', sparsity_types, 'lambda_a', lambda_a, 'lambda_b', lambda_b, 'lambda_th', lambda_th, 'num_layers', num_layers);
end
@@ -0,0 +1,152 @@
function [correlations, rmsErrors, patchExperts, visiIndex, centres, imgs_used, normalisation_options] = Train_CCNF_patches(training_loc, view, scale, sigma, ratio_neg, num_samples, varargin)
%% creating the model
% creating the regression models

normalisation_options = Parse_settings( sigma, ratio_neg, num_samples, varargin{:});

if(sum(strcmp(varargin,'data_loc')))
    ind = find(strcmp(varargin,'data_loc')) + 1;
    data_loc = varargin{ind};
    data_loc = sprintf(['%s\\' data_loc '%s_%s.mat'], training_loc, num2str(scale), num2str(view));
else
    data_loc = sprintf('%s\\wild_%s_%s.mat', training_loc, num2str(scale), num2str(view));
end

load(data_loc);
examples = all_images;
landmark_loc = landmark_locations;
clear 'all_images'

numPoints = size(landmark_loc,2);

correlations = zeros(1, numPoints);
rmsErrors = zeros(1, numPoints);
patchExperts = cell(1, numPoints);

for j=1:numPoints
    pause(0);
    % can only do mirroring if there is no yaw
    if((numPoints == 68 || numPoints == 29) && centres(2) == 0)
        % Do not redo a mirrored feature (just flip it)
        if(numPoints == 68)
            mirrorInds = [1,17;2,16;3,15;4,14;5,13;6,12;7,11;8,10;18,27;19,26;20,25;21,24;22,23;...
                32,36;33,35;37,46;38,45;39,44;40,43;41,48;42,47;49,55;50,54;51,53;60,56;59,57;...
                61,65;62,64;68,66];
        else
            mirrorInds = [1,2; 3,4; 5,7; 6,8; 9,10; 11,12; 13,15; 14,16; 17,18; 19,20; 23,24];
        end

        mirror_idx = j;
        if(any(mirrorInds(:,1)==j))
            mirror_idx = mirrorInds(mirrorInds(:,1)==j,2);
        elseif(any(mirrorInds(:,2)==j))
            mirror_idx = mirrorInds(mirrorInds(:,2)==j,1);
        end

        if(mirror_idx~=j && correlations(1,mirror_idx) ~= 0)

            correlations(1,j) = correlations(1,mirror_idx);
            rmsErrors(1, j) = rmsErrors(1,mirror_idx);
            patchExperts{1, j} = patchExperts{1,mirror_idx};

            num_hl = size(patchExperts{1,mirror_idx}.thetas, 1);
            num_mod = size(patchExperts{1,mirror_idx}.thetas, 3);
            for m=1:num_mod
                for hl=1:num_hl
                    w = reshape(patchExperts{1,mirror_idx}.thetas(hl, 2:end, m),11,11);
                    w = fliplr(w);
                    w = reshape(w, 121,1);
                    patchExperts{1, j}.thetas(hl, 2:end, m) = w;
                end
            end

            fprintf('Feature %d done\n', j);

            continue;

        end
    end

    imgs_used = {};
    if(visiIndex(j))

        tic;
        % instead of loading the patches compute them here:
        num_samples = normalisation_options.numSamples;

        [samples, labels, unnormed_samples, imgs_used_n] = ExtractTrainingSamples(examples, landmark_loc, actual_imgs_used, sigma, num_samples, j, normalisation_options);
        imgs_used = union(imgs_used, imgs_used_n);
        % add the bias term
        samples = [ones(1,size(samples,1)); samples'];

        region_length = normalisation_options.normalisationRegion - normalisation_options.patchSize + 1;
        region_length = region_length(1) * region_length(2);

        num_examples = size(samples, 2);

        % this part sets the split boundaries for the training and test subsets
        train_ccnf_start = 1;
        train_ccnf_end = int32(normalisation_options.ccnf_ratio * num_examples - 1);
        % make sure we don't split a full training region apart
        train_ccnf_end = train_ccnf_end - mod(train_ccnf_end, region_length);

        test_start = train_ccnf_end + 1;
        test_end = size(samples,2);

        samples_train = samples(:,train_ccnf_start:train_ccnf_end);
        labels_train = labels(train_ccnf_start:train_ccnf_end);

        samples_test = samples(:,test_start:test_end);
        labels_test = labels(test_start:test_end);

        % Set up the patch expert
        similarity_types = normalisation_options.similarity_types;
        patch_expert.similarity_types = similarity_types;
        patch_expert.sparsity = normalisation_options.sparsity;
        patch_expert.sparsity_types = normalisation_options.sparsity_types;
        patch_expert.patch_expert_type = 'CCNF';

        % The actual regressor training
        [alpha, betas, thetas, similarities, sparsities] = Create_CCNF_Regressor(samples_train, labels_train, region_length, similarity_types, normalisation_options.sparsity_types, normalisation_options);

        % The learned patch expert
        patch_expert.alphas = alpha;
        patch_expert.betas = betas;
        patch_expert.thetas = thetas;

        % if we have a SigmaInv, we don't need the betas anymore (or the
        % similarity and sparsity functions for that matter), so compute
        % it on a single sample for efficiency
        [ ~, ~, Precalc_Bs_flat, ~ ] = CalculateSimilarities( 1, zeros(size(samples,1),region_length), similarities, sparsities, labels_train(1:region_length), true);
        Precalc_B_flat = Precalc_Bs_flat{1};
        SigmaInv = CalcSigmaCCNFflat(patch_expert.alphas, patch_expert.betas, region_length, Precalc_B_flat, eye(region_length), zeros(region_length));
        patch_expert.SigmaInv = SigmaInv;

        % Evaluate the patch expert
        [rmsError, corr,~] = EvaluatePatchExpert(samples_test, labels_test, alpha, betas, thetas, similarities, sparsities, normalisation_options, region_length);

        fprintf('Rms error %.3f, correlation %.3f\n', rmsError, corr);

        % Assert that our normalisation and different fitting approaches are equivalent
        % normed_samples = samples(:,1:size(unnormed_samples,1)*region_length);
        % [~, ~, responses_ccnf] = EvaluatePatchExpert(normed_samples, labels(1:size(unnormed_samples,1)*region_length), alpha, betas, thetas, similarities, sparsities, normalisation_options, region_length);
        % [responses_ccnf_ncc] = CCNF_ncc_response(unnormed_samples, patch_expert, normalisation_options, normalisation_options.normalisationRegion, region_length);
        % assert(norm(responses_ccnf-responses_ccnf_ncc)< 10e-1);

        correlations(1,j) = corr;
        rmsErrors(1, j) = rmsError;
        patchExperts{1, j} = patch_expert(:);

        fprintf('Landmark %d done\n', j);
        clear samples
        clear samples_test
        clear samples_train
        clear labels
        clear unnormed_samples
        clear imgs_used_n
        toc;
    end
end

end
@@ -0,0 +1,155 @@
function Train_all_patch_experts(trainingLoc, frontalView, profile_views, scaling, sigma, version, varargin)
% Train CCNF patch experts for the frontal view and all profile views at a
% given scale, mirror the left/right profiles, and write the result out.

    if(sum(strcmp(varargin,'ratio_neg')))
        ind = find(strcmp(varargin,'ratio_neg')) + 1;
        ratio_neg = varargin{ind};
    else
        ratio_neg = 20;
    end

    if(sum(strcmp(varargin,'num_samples')))
        ind = find(strcmp(varargin,'num_samples')) + 1;
        num_samples = varargin{ind};
    else
        num_samples = 5e5;
    end

    patch_experts = struct;
    patch_experts.types = {'reg'};
    patch_experts.correlations = [];
    patch_experts.rms_errors = [];
    patch_experts.patch_experts = cell(numel(patch_experts.types), 1);

    % first do the frontal view
    [visiIndex, centres, patch_experts, imgs_used, norm_options] = ...
        AppendTraining(trainingLoc, frontalView, scaling, sigma, [], [], patch_experts, ratio_neg, num_samples, varargin{:});

    fprintf('Frontal done\n');

    % now do the profile views
    for i=1:numel(profile_views)
        [visiIndex, centres, patch_experts, imgs_used_profile] = ...
            AppendTraining(trainingLoc, profile_views(i), scaling, sigma, visiIndex, centres, patch_experts, ratio_neg, num_samples, varargin{:});
        fprintf('Profile %d done\n', i);

        imgs_used = cat(1, imgs_used, imgs_used_profile);

    end

    % save time by not retraining the mirrored (left/right) views, just
    % flipping the patch experts instead
    for i=1:numel(profile_views)
        [visiIndex, centres, patch_experts] = ...
            AppendMirror(visiIndex, centres, patch_experts, numel(profile_views) - i + 2, varargin{:});
        fprintf('Profile %d done\n', i + numel(profile_views));
    end

    % output the training
    locationTxtCol = sprintf('trained/ccnf_patches_%s_%s.txt', num2str(scaling), version);
    locationMlabCol = sprintf('trained/ccnf_patches_%s_%s.mat', num2str(scaling), version);

    Write_patch_experts_ccnf(locationTxtCol, locationMlabCol, scaling, centres, visiIndex, patch_experts, norm_options, [7,9,11,15]);

    % save the images used
    location_imgs_used = sprintf('trained/imgs_used_%s.mat', version);
    save(location_imgs_used, 'imgs_used');

end

function [visi_index, centres, patches_m, imgs_used, norm_options] = AppendTraining(training_data_loc, view, scale, sigma, visibilities_init, centres_init, patches_m_init, ratio_neg, num_samples, varargin)

    patches_m = patches_m_init;

    [correlations, rms_errors, patch_experts, visi_index, centres, imgs_used, norm_options] = Train_CCNF_patches(training_data_loc, view, scale, sigma, ratio_neg, num_samples, varargin{:});

    if(numel(patches_m_init.correlations) > 0)
        patches_m.correlations = cat(1, patches_m_init.correlations, correlations);
        patches_m.rms_errors = cat(1, patches_m_init.rms_errors, rms_errors);
        patches_m.patch_experts = cat(1, patches_m_init.patch_experts, patch_experts);
    else
        patches_m.correlations = correlations;
        patches_m.rms_errors = rms_errors;
        patches_m.patch_experts = patch_experts;
    end

    % also append the visibility indices and centres, as these need to be
    % output with the patch experts when they are written out
    if(numel(visibilities_init) > 0)
        visi_index = cat(1, visibilities_init, visi_index);
        centres = cat(1, centres_init, centres);
    end

end

function [visiIndex, centres, patches_m] = AppendMirror(visiIndexInit, centresInit, patches_m, index, varargin)

    if(numel(visiIndexInit) > 0)

        corr_T = patches_m.correlations(index,:);

        if(numel(corr_T) == 66)
            % this specifies the mirrored points: say, point 1 in the left
            % profile becomes point 17 in the right profile when mirrored;
            % non-mirrored points don't need to be specified, they just get
            % flipped while their index stays the same
            mirrorInds = [1,17;2,16;3,15;4,14;5,13;6,12;7,11;8,10;18,27;19,26;20,25;21,24;22,23;...
                32,36;33,35;37,46;38,45;39,44;40,43;41,48;42,47;49,55;50,54;51,53;60,56;59,57;...
                61,63;66,64];
        elseif(numel(corr_T) == 68)
            mirrorInds = [1,17;2,16;3,15;4,14;5,13;6,12;7,11;8,10;18,27;19,26;20,25;21,24;22,23;...
                32,36;33,35;37,46;38,45;39,44;40,43;41,48;42,47;49,55;50,54;51,53;60,56;59,57;...
                61,65;62,64;68,66];
        end

        corr_T = swap(corr_T, mirrorInds(:,1), mirrorInds(:,2));
        patches_m.correlations = cat(1, patches_m.correlations, corr_T);

        AccT = patches_m.rms_errors(index,:);
        AccT = swap(AccT, mirrorInds(:,1), mirrorInds(:,2));
        patches_m.rms_errors = cat(1, patches_m.rms_errors, AccT);

        visiIndexT = visiIndexInit(index,:);
        visiIndexT = swap(visiIndexT, mirrorInds(:,1), mirrorInds(:,2));
        visiIndex = cat(1, visiIndexInit, visiIndexT);

        % mirroring the orientation involves flipping yaw and roll (we
        % assume only views with a single rotation will be present, i.e.
        % only pitch, or only yaw, or only roll)
        centresMirror = [centresInit(index,1), -centresInit(index,2), -centresInit(index,3)];
        centres = cat(1, centresInit, centresMirror);

        patchExpertMirror = patches_m.patch_experts(index,:);
        patchExpertMirrorT1 = patchExpertMirror(1,mirrorInds(:,1),:);
        patchExpertMirrorT2 = patchExpertMirror(1,mirrorInds(:,2),:);
        patchExpertMirror(1,mirrorInds(:,2),:) = patchExpertMirrorT1;
        patchExpertMirror(1,mirrorInds(:,1),:) = patchExpertMirrorT2;

        % To flip a patch expert, flip each neuron's 11x11 weight matrix horizontally
        for p=1:size(patchExpertMirror,2)
            if(visiIndexT(p))
                num_hl = size(patchExpertMirror{p}.thetas, 1);
                num_mod = size(patchExpertMirror{p}.thetas, 3);
                for m=1:num_mod
                    for hl=1:num_hl
                        w = reshape(patchExpertMirror{p}.thetas(hl, 2:end, m),11,11);
                        w = fliplr(w);
                        w = reshape(w, 121,1);
                        patchExpertMirror{p}.thetas(hl, 2:end, m) = w;
                    end
                end
            end
        end

        patches_m.patch_experts = cat(1, patches_m.patch_experts, patchExpertMirror);

    end
end

function arr = swap(arr, ind1, ind2)
    val1 = arr(ind1);
    val2 = arr(ind2);
    arr(ind1) = val2;
    arr(ind2) = val1;
end
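
% A quick sketch of what swap does (hypothetical values, for illustration):
%   v = 1:5;
%   v = swap(v, [1 2], [5 4]);   % v is now [5 4 3 2 1]
% Both index vectors are applied element-wise, which is how the mirror
% index pairs above exchange the left/right landmark entries in one call.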
@@ -0,0 +1,139 @@
function Write_patch_experts_ccnf(location_txt, location_mlab, trainingScale, centers, visiIndex, patch_experts, normalisationOptions, w_sizes)

    save(location_mlab, 'patch_experts', 'trainingScale', 'centers', 'visiIndex', 'normalisationOptions');

    patches_file = fopen(location_txt, 'w');

    [n_views, n_landmarks, ~] = size(patch_experts.correlations);

    % write out the scaling factor, as this is what will be used when
    % fitting on the window
    % fprintf(patches_file, '# scaling factor of training\r\n%f\r\n', trainingScale);
    fwrite(patches_file, trainingScale, 'float64');

    % fprintf(patches_file, '# number of views\r\n%d\r\n', n_views);
    fwrite(patches_file, n_views, 'int');

    % Write out the information about the views and their centers here
    % fprintf(patches_file, '# centers of the views\r\n');
    for i=1:n_views
        % each center is a 3x1 double (type 6) matrix
        writeMatrixBin(patches_file, centers(i,:)', 6);
    end

    % fprintf(patches_file, '# visibility indices per view\r\n');
    for i=1:n_views
        % each visibility index is an n_landmarks x 1 int (type 4) matrix
        writeMatrixBin(patches_file, visiIndex(i,:)', 4);
    end

    % fprintf(patches_file, '# Sigma component matrices being used in these patches\r\n');

    % fprintf(patches_file, '# Number of window sizes\r\n');
    % fprintf(patches_file, '%d\r\n', numel(w_sizes));
    fwrite(patches_file, numel(w_sizes), 'int');

    for w = 1:numel(w_sizes)
        % fprintf(patches_file, '# Size of window\r\n');
        % fprintf(patches_file, '%d\r\n', w_sizes(w));
        fwrite(patches_file, w_sizes(w), 'int');
        similarities = {};
        response_side_length = w_sizes(w);

        for st=1:size(patch_experts.patch_experts{1,1}.similarity_types, 1)
            type_sim = patch_experts.patch_experts{1,1}.similarity_types{st};
            neighFn = @(x) similarity_neighbor_grid(x, response_side_length, type_sim);
            similarities = cat(1, similarities, {neighFn});
        end

        sparsities = {};

        for st=1:size(patch_experts.patch_experts{1,1}.sparsity_types, 1)
            spFn = @(x) sparsity_grid(x, response_side_length, patch_experts.patch_experts{1,1}.sparsity_types(st,1), patch_experts.patch_experts{1,1}.sparsity_types(st,2));
            sparsities = cat(1, sparsities, {spFn});
        end

        region_length = response_side_length^2;

        % Adding the sparsities here if needed (the 122 rows assume an
        % 11x11 support area plus the bias term, hard-coded)
        [ ~, PrecalcQ2s, ~, ~ ] = CalculateSimilarities( 1, zeros(122,region_length), similarities, sparsities, zeros(region_length,1), true);

        PrecalcQ2s = PrecalcQ2s{1};
        % fprintf(patches_file, '# Number of Sigma components\r\n');
        fwrite(patches_file, numel(PrecalcQ2s), 'int');
        for q2=1:numel(PrecalcQ2s)
            writeMatrixBin(patches_file, PrecalcQ2s{q2}, 5);
        end
    end

    % fprintf(patches_file, '# Patches themselves (1 line patches of a vertex)\r\n');

    for i=1:n_views
        for j=1:n_landmarks

            % Write out that we're writing a CCNF patch expert with an 11x11 support region
            fwrite(patches_file, 5, 'int');
            fwrite(patches_file, 11, 'int');
            fwrite(patches_file, 11, 'int');

            if(~visiIndex(i,j))
                % Write out that there won't be any neurons for this
                % landmark
                fwrite(patches_file, 0, 'int');
                fwrite(patches_file, 0, 'int');
            else
                num_neurons = size(patch_experts.patch_experts{i,j}.thetas, 1);

                % CCNF patch(5), width, height, num_neurons, Patch(2), neuron_type,
                % normalisation, bias, alpha, rows, cols, type
                num_modalities = size(patch_experts.patch_experts{i,j}.thetas, 3);

                fwrite(patches_file, num_neurons, 'int');

                for n=1:num_neurons
                    for m=1:num_modalities

                        if(strcmp(patch_experts.types{m}, 'reg'))
                            type = 0;
                        elseif(strcmp(patch_experts.types{m}, 'grad'))
                            type = 1;
                        else
                            fprintf('Unsupported patch type\n');
                            type = 0;
                        end

                        % normalise the weight vector w
                        w = patch_experts.patch_experts{i,j}.thetas(n, 2:end, m);
                        norm_w = norm(w);
                        w = w/norm_w;
                        bias = patch_experts.patch_experts{i,j}.thetas(n, 1, m);
                        alpha = patch_experts.patch_experts{i,j}.alphas((m-1)*num_neurons+n);

                        % also add patch confidence based on correlation scores
                        fwrite(patches_file, 2, 'int');
                        fwrite(patches_file, type, 'int');
                        fwrite(patches_file, norm_w, 'float64');
                        fwrite(patches_file, bias, 'float64');
                        fwrite(patches_file, alpha, 'float64');

                        % the actual weight matrix
                        writeMatrixBin(patches_file, reshape(w, 11, 11), 5);
                    end
                end

                % Write out the betas
                for b=1:numel(patch_experts.patch_experts{i,j}.betas)
                    fwrite(patches_file, patch_experts.patch_experts{i,j}.betas(b), 'float64');
                end

                % finally write out the confidence
                fwrite(patches_file, patch_experts.correlations(i,j), 'float64');

            end
        end
    end

    fclose(patches_file);
@@ -0,0 +1,13 @@
clear;
load('trained/ccnf_patches_1_general.mat');

% now drop the first 17 points (the face outline)
visiIndex = visiIndex(:,18:end);

patch_experts.correlations = patch_experts.correlations(:,18:end);
patch_experts.rms_errors = patch_experts.rms_errors(:,18:end);
patch_experts.patch_experts = patch_experts.patch_experts(:,18:end);

Write_patch_experts_ccnf('trained/ccnf_patches_1.00_inner.txt',...
    'trained/ccnf_patches_1.00_inner.mat', trainingScale, centers,...
    visiIndex, patch_experts, normalisationOptions, [7,9,11,15]);
@@ -0,0 +1,56 @@
function [ display_array ] = generateDisplayData( X )
%GENERATEDISPLAYDATA Arranges the rows of X (each one an 11x11 patch)
% into a single padded 2-D array, for visualising patch experts
    example_width = 11;
    example_height = 11;

    % Compute rows, cols
    [m, n] = size(X);

    % Compute number of items to display
    display_rows = floor(sqrt(m));
    display_cols = ceil(m / display_rows);

    % Between images padding
    pad = 1;

    % Setup blank display
    display_array = double(zeros(pad + display_rows * (example_height + pad), ...
        pad + display_cols * (example_width + pad)));
    % Copy each example into a patch on the display array
    curr_ex = 1;
    for j = 1:display_rows
        for i = 1:display_cols
            if curr_ex > m
                break;
            end
            % Copy the patch

            % if(isa(X, 'uint8'))
            display_array(pad + (j - 1) * (example_height + pad) + (1:example_height), ...
                pad + (i - 1) * (example_width + pad) + (1:example_width)) = ...
                reshape(X(curr_ex, :), example_height, example_width);
            % else
            %     % Get the max value of the patch
            %     minVal = min(X(curr_ex, X(curr_ex,:)~=0)) - 10;
            %     if(numel(minVal) < 1)
            %         minVal = 0;
            %     end
            %     maxVal = double(max(X(curr_ex,:)-minVal))/255.0;
            %     if(numel(minVal) < 1 || maxVal == 0)
            %         maxVal = 1;
            %     end
            %     display_array(pad + (j - 1) * (example_height + pad) + (1:example_height), ...
            %         pad + (i - 1) * (example_width + pad) + (1:example_width)) = ...
            %         reshape((X(curr_ex, :)-minVal)/maxVal, example_height, example_width);
            % end
            curr_ex = curr_ex + 1;
        end
        if curr_ex > m
            break;
        end
    end

end
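
% Example use (a sketch; assumes a patch expert struct as produced by the
% training code above, with thetas holding the bias in column 1 and the
% 11x11 weights in columns 2:122):
%   W = patch_expert.thetas(:, 2:end, 1);   % one row per neuron
%   imagesc(generateDisplayData(W)); colormap gray; axis image;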
13
model_training/CCNF/patch_experts/ccnf_training/readme.txt
Normal file
@@ -0,0 +1,13 @@
Scripts for training continuous conditional neural field (CCNF, also known as Local Neural Field - LNF) patch experts.

This requires data preparation first (not included with the source code); to prepare the data see '../data_preparation/readme.txt'.

To train the patch experts run:
Script_Training_wild.m (using in-the-wild data)
Script_Training_multi_pie.m (using Multi-PIE data)
Script_Training_general.m (using the combined data)

To prepare the inner-face general patch expert run:
extract_inner.m

The trained patch experts are included in the './trained/' folder; the experts you train yourself might differ slightly depending on the version of Matlab used.
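
As a rough sketch, a single training run boils down to a call like the following (the argument values here are illustrative, not the exact settings of the scripts above):

    Train_all_patch_experts('../data_preparation/prepared_data/', 1, [2,3,4], 0.25, 1, 'wild', 'ratio_neg', 10, 'num_samples', 5e5);

This trains the frontal view first, then the profile views, mirrors the left/right profiles rather than retraining them, and writes the result to the './trained/' folder.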
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,16 @@
% for easier readability write the matrices out row by row
function writeMatrix(fileID, M, type)

    fprintf(fileID, '%d\r\n', size(M,1));
    fprintf(fileID, '%d\r\n', size(M,2));
    fprintf(fileID, '%d\r\n', type);

    for i=1:size(M,1)
        if(type == 4 || type == 0)
            fprintf(fileID, '%d ', M(i,:));
        else
            fprintf(fileID, '%.9f ', M(i,:));
        end
        fprintf(fileID, '\r\n');
    end
end
@@ -0,0 +1,37 @@
% write the matrices out in binary, row by row
function writeMatrixBin(fileID, M, type)

    % 4 bytes each for the header description
    fwrite(fileID, size(M,1), 'uint');
    fwrite(fileID, size(M,2), 'uint');
    fwrite(fileID, type, 'uint');

    % Convert the matrix to OpenCV format (row-major as opposed to
    % column-major)
    M = M';

    % type 0 - uint8, 1 - int8, 2 - uint16, 3 - int16, 4 - int, 5 -
    % float32, 6 - float64

    % Write out the matrix itself
    switch type
        case 0
            type = 'uint8';
        case 1
            type = 'int8';
        case 2
            type = 'uint16';
        case 3
            type = 'int16';
        case 4
            type = 'int';
        case 5
            type = 'float32';
        case 6
            type = 'float64';
        otherwise
            type = 'float32';
    end
    fwrite(fileID, M, type);
end
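
% Reading one matrix back (a sketch mirroring the layout written above:
% three uint32 header fields followed by the data in row-major order):
%   rows = fread(fileID, 1, 'uint');
%   cols = fread(fileID, 1, 'uint');
%   type = fread(fileID, 1, 'uint');              % maps to 'uint8'...'float64' as above
%   M = fread(fileID, [cols, rows], 'float32')';  % e.g. for type 5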
@@ -0,0 +1,35 @@
function Collect_all_patches(trainingLoc, frontalView, profile_views, scaling, sigma, version, varargin)
% Collect and save the training samples for the frontal view and all
% profile views at a given scale (no training is performed here).

    if(sum(strcmp(varargin,'ratio_neg')))
        ind = find(strcmp(varargin,'ratio_neg')) + 1;
        ratio_neg = varargin{ind};
    else
        ratio_neg = 20;
    end

    if(sum(strcmp(varargin,'num_samples')))
        ind = find(strcmp(varargin,'num_samples')) + 1;
        num_samples = varargin{ind};
    else
        num_samples = 5e5;
    end

    % first do the frontal view
    AppendTraining(trainingLoc, frontalView, scaling, sigma, ratio_neg, num_samples, 0, version, varargin{:});

    fprintf('Frontal done\n');

    % now do the profile views
    for i=1:numel(profile_views)
        AppendTraining(trainingLoc, profile_views(i), scaling, sigma, ratio_neg, num_samples, i, version, varargin{:});
        fprintf('Profile %d done\n', i);
    end

end

function AppendTraining(training_data_loc, view, scale, sigma, ratio_neg, num_samples, varargin)

    Collect_patches_view(training_data_loc, view, scale, sigma, ratio_neg, num_samples, varargin{:});

end
@@ -0,0 +1,71 @@
function Collect_patches_view(training_loc, view, scale, sigma, ratio_neg, num_samples, profile_id, version, varargin)
%% Collect the training samples for a single view and save them to disk

    normalisation_options = Parse_settings( sigma, ratio_neg, num_samples, varargin{:});

    if(sum(strcmp(varargin,'data_loc')))
        ind = find(strcmp(varargin,'data_loc')) + 1;
        data_loc = varargin{ind};
        data_loc = sprintf(['%s\\' data_loc '%s_%s.mat'], training_loc, num2str(scale), num2str(view));
    else
        data_loc = sprintf('%s\\wild_%s_%s.mat', training_loc, num2str(scale), num2str(view));
    end

    load(data_loc);
    examples = all_images;
    landmark_loc = landmark_locations;
    clear 'all_images'

    numPoints = size(landmark_loc,2);
    done = false(numPoints,1);
    for j=1:numPoints

        % can only do mirroring if there is no yaw
        if((numPoints == 68 || numPoints == 29) && centres(2) == 0)
            % Do not redo a mirrored feature (just flip it)
            if(numPoints == 68)
                mirrorInds = [1,17;2,16;3,15;4,14;5,13;6,12;7,11;8,10;18,27;19,26;20,25;21,24;22,23;...
                    32,36;33,35;37,46;38,45;39,44;40,43;41,48;42,47;49,55;50,54;51,53;60,56;59,57;...
                    61,65;62,64;68,66];
            else
                mirrorInds = [1,2; 3,4; 5,7; 6,8; 9,10; 11,12; 13,15; 14,16; 17,18; 19,20; 23,24];
            end

            mirror_idx = j;
            if(any(mirrorInds(:,1)==j))
                mirror_idx = mirrorInds(mirrorInds(:,1)==j,2);
            elseif(any(mirrorInds(:,2)==j))
                mirror_idx = mirrorInds(mirrorInds(:,2)==j,1);
            end
            if(mirror_idx ~= j && done(mirror_idx))
                continue;
            end
        end

        imgs_used = {};
        if(visiIndex(j))

            tic;
            % instead of loading the patches compute them here:
            num_samples = normalisation_options.numSamples;

            [samples, labels, imgs_used_n] = ExtractTrainingSamples(examples, landmark_loc, actual_imgs_used, sigma, num_samples, j, normalisation_options);
            imgs_used = union(imgs_used, imgs_used_n);
            % add the bias term
            samples = [ones(1,size(samples,1)); samples'];
            if(centres(2) == 0)
                save(sprintf('E:/menpo_data/%s_data%.2f_frontal_%d.mat', version, scale, j), 'samples', 'labels', '-v7.3');
            else
                save(sprintf('E:/menpo_data/%s_data%.2f_profile%d_%d.mat', version, scale, profile_id, j), 'samples', 'labels', '-v7.3');
            end
            fprintf('Landmark %d done\n', j);
            clear samples
            clear labels
            done(j) = true;
        end
    end

end
@@ -0,0 +1,133 @@
function [samples, labels, imgs_used] = ExtractTrainingSamples(examples, landmarkLoc, img_names, sigma, numSamples, landmark, normalisation_options)

    %%
    % for an area of interest of 19x19 and a patch support region of 11x11,
    % we would get 9x9=81 samples (9 is the single_input_size, 11 is the
    % patch_expert_support_size, 19x19 is the normalisation region, and 9
    % would be the normalisation_side_size)
    evaluation_size = normalisation_options.normalisationRegion;
    patch_expert_support_size = normalisation_options.patchSize;

    normalisation_side_size = (evaluation_size - 1)/2;
    single_input_size = evaluation_size - patch_expert_support_size + 1;

    % Determine the ratio of images to be sampled (most likely not all of them will be)
    samples_per_img = (numSamples / (size(examples,1) * (1 + normalisation_options.rate_negative))) / (single_input_size(1)^2);

    num_samples = int32(samples_per_img * (1 + normalisation_options.rate_negative) * size(examples,1) * (single_input_size(1)^2));

    %% Initialise the samples and labels
    samples = zeros(num_samples, patch_expert_support_size(1) * patch_expert_support_size(2));
    labels = zeros(num_samples, 1);

    %% Initialise the unnormed versions of the images

    % This is done in order to assert our use of the algorithms for
    % calculating the responses: for training we might use regular ML
    % procedures, whereas for fitting (normalised) cross-correlation will
    % be used, so keep some unnormed samples around
    % samples_unnormed = zeros(int32(num_samples/300), evaluation_size(1)^2);

    img_size = [size(examples,2), size(examples,3)];

    % Extract only images of differently shaped faces, to get more diverse
    % training samples
    to_keep = FindDistantLandmarks(landmarkLoc, landmark, round(samples_per_img*size(examples,1)));

    inds_all = 1:size(examples,1);
    samples_to_use = inds_all(to_keep);

    % Keep track of how many samples have been computed already
    samples_filled = 1;
    samples_unnormed_filled = 1;

    %% parse the image names for reporting purposes
    imgs_used = img_names(samples_to_use);
    for i=1:numel(imgs_used)
        [~,name,ext] = fileparts(imgs_used{i});
        imgs_used{i} = [name, ext];
    end
    for i=samples_to_use

        % Do rate_negative negatives and a single positive
        for p=1:normalisation_options.rate_negative+1

            % create a gaussian response centred on the landmark
            corrPoint = landmarkLoc(i,landmark,:);

            % Ignore occluded points
            if(corrPoint(1) == 0)
                break;
            end

            startX = 1 - corrPoint(1);
            startY = 1 - corrPoint(2);

            patchWidth = img_size(2);
            patchHeight = img_size(1);

            [X, Y] = meshgrid(startX:patchWidth + startX-1, startY:patchHeight + startY-1);

            response = exp(-0.5*(X.^2+Y.^2)/(sigma^2));

            % Choose a positive or a negative sample
            if(p==normalisation_options.rate_negative+1)
                sample_centre = squeeze(corrPoint) + round(1*randn(2,1));
            else
                sample_centre = squeeze(corrPoint) + round(10*randn(2,1));
            end

            sample_centre = round(sample_centre);

            sample_centre(sample_centre <= normalisation_side_size(1)) = normalisation_side_size(1) + 1;
            sample_centre(sample_centre > img_size(1)-normalisation_side_size(1)) = img_size(1) - normalisation_side_size(1) - 1;

            patches = squeeze(examples(i, sample_centre(2) - normalisation_side_size:sample_centre(2) + normalisation_side_size, sample_centre(1) - normalisation_side_size:sample_centre(1) + normalisation_side_size));
            side = (single_input_size - 1)/2;
            responses = response(sample_centre(2) - side(2):sample_centre(2) + side(2), sample_centre(1) - side(1):sample_centre(1) + side(1));

            % if(samples_unnormed_filled <= size(samples_unnormed,1))
            %     % even if the correct size is not initialised Matlab will
            %     % sort that out (would only happen once anyway)
            %     samples_unnormed(samples_unnormed_filled,:) = patches(:);
            %     samples_unnormed_filled = samples_unnormed_filled + 1;
            % end

            % if we want to normalise each patch individually do it here

            patch = im2col(patches, patch_expert_support_size, 'sliding')';
            response = im2col(responses, [1,1], 'sliding');

            labels(samples_filled:samples_filled+size(patch,1)-1,:) = response;

            samples(samples_filled:samples_filled+size(patch,1)-1,:) = patch;
            samples_filled = samples_filled + size(patch,1);

        end
    end

    % Only keep the filled samples
    samples = samples(1:samples_filled-1,:);
    labels = labels(1:samples_filled-1,:);

    if(normalisation_options.useNormalisedCrossCorr == 1)

        mean_curr = mean(samples, 2);
        patch_normed = samples - repmat(mean_curr,1, patch_expert_support_size(1)*patch_expert_support_size(2));

        % Normalise the patches using the L2 norm
        scaling = sqrt(sum(patch_normed.^2,2));
        scaling(scaling == 0) = 1;

        patch_normed = patch_normed ./ repmat(scaling, 1, patch_expert_support_size(1)*patch_expert_support_size(2));

        samples = patch_normed;
        clear 'patch_normed';
    end

    % if((samples_filled-1)/(single_input_size(1)*single_input_size(2)) < size(samples_unnormed,1))
    %     samples_unnormed = samples_unnormed(1:(samples_filled-1)/(single_input_size(1)*single_input_size(2)),:);
    % end

end
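
% Sanity check of the sampling arithmetic (a sketch with the default
% sizes): a 19x19 normalisation region and an 11x11 support patch give
%   single_input_size = 19 - 11 + 1 = 9
% so im2col(patches, [11 11], 'sliding') yields 9*9 = 81 columns, i.e.
% 81 training rows per sampled region, matched by the 81 labels taken
% from the 9x9 centre of the Gaussian response.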
@@ -0,0 +1,41 @@
function [to_keep] = FindDistantLandmarks(landmarkLoc, landmark_num, num_to_keep)

    % First align all of the shapes
    a = landmarkLoc(:,:,1);
    b = landmarkLoc(:,:,2);

    offset_x = mean(a,2);
    offset_y = mean(b,2);

    landmark_loc_off = cat(3, bsxfun(@plus, a, -offset_x), bsxfun(@plus, b, -offset_y));
    fixed_x = landmark_loc_off(:,:,1);
    fixed_y = landmark_loc_off(:,:,2);

    % Extract the relevant landmark
    fixed_x_l = fixed_x(:,landmark_num);
    fixed_y_l = fixed_y(:,landmark_num);

    obs = cat(2, fixed_x_l, fixed_y_l);

    % Discard images whose landmark configurations are very close to each
    % other, so that only the more diverse images are kept
    D = squareform(pdist(obs));

    to_keep = true(size(landmarkLoc,1),1);

    for i = 1:(size(landmarkLoc,1) - num_to_keep)

        diversity_score = mean(D,2);

        a = min(diversity_score);

        lowest = find(diversity_score == a);
        lowest = lowest(1);

        to_keep(lowest) = 0;

        % discarded items no longer contribute to the scores of others,
        % and get a large score themselves so they are not picked again
        D(:,~to_keep) = 0;
        D(~to_keep,:) = 200;
    end

end
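
% Illustrative call (hypothetical data): keep the 50 most mutually distant
% configurations of landmark 31 out of 200 annotated images.
%   landmarks = rand(200, 68, 2) * 100;
%   keep = FindDistantLandmarks(landmarks, 31, 50);
%   sum(keep)   % == 50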
@@ -0,0 +1,120 @@
function [ normalisation_options ] = Parse_settings( sigma, ratio_neg, num_samples, varargin)
%PARSE_SETTINGS Parse the optional name/value arguments and build the
% normalisation and training options structure

    % creating the parameters to use when training colour (intensity) patches
    normalisation_options = struct;

    % this is what is currently expected (although we could potentially
    % have bigger or smaller patches, this should not be bigger than the
    % patch available in examples and negExamples)
    normalisation_options.patchSize = [11 11];

    % The size of the region that is taken for training around an
    % aligned or misaligned landmark
    if(sum(strcmp(varargin,'normalisation_size')))
        ind = find(strcmp(varargin,'normalisation_size')) + 1;
        normalisation_options.normalisationRegion = [varargin{ind}, varargin{ind}];
    else
        normalisation_options.normalisationRegion = [21 21];
    end

    % This specifies the split of the data:
    normalisation_options.ccnf_ratio = 0.9; % proportion of data used for training the CCNFs,
    % the rest is used for testing and provides the evaluation scores

    if(any(strcmp(varargin, 'patch_types')))
        ind = find(strcmp(varargin,'patch_types')) + 1;
        normalisation_options.patch_type = varargin{ind};
    else
        normalisation_options.patch_type = {'reg'};
    end

    if(any(strcmp(varargin, 'sparsity_types')))
        ind = find(strcmp(varargin,'sparsity_types')) + 1;
        if(~isempty( varargin{ind}))
            normalisation_options.sparsity = 1;
            normalisation_options.sparsity_types = varargin{ind};
        else
            normalisation_options.sparsity = 0;
            normalisation_options.sparsity_types = [];
        end
    else
        normalisation_options.sparsity = 0;
        normalisation_options.sparsity_types = [];
    end

    if(any(strcmp(varargin, 'lambda_a')))
        ind = find(strcmp(varargin,'lambda_a')) + 1;
        normalisation_options.lambda_a = varargin{ind};
    end

    if(any(strcmp(varargin, 'lambda_b')))
        ind = find(strcmp(varargin,'lambda_b')) + 1;
        normalisation_options.lambda_b = varargin{ind};
    end

    if(any(strcmp(varargin, 'lambda_th')))
        ind = find(strcmp(varargin,'lambda_th')) + 1;
        normalisation_options.lambda_th = varargin{ind};
    end

    if(any(strcmp(varargin, 'num_layers')))
        ind = find(strcmp(varargin,'num_layers')) + 1;
        normalisation_options.num_layers = varargin{ind};
    end

    if(any(strcmp(varargin, 'num_bins')))
        ind = find(strcmp(varargin,'num_bins')) + 1;
        normalisation_options.num_hog_bins = varargin{ind};
    else
        normalisation_options.num_hog_bins = 9;
    end

    normalisation_options.numSamples = num_samples;

    normalisation_options.useZeroMeanPerPatch = 1;
    normalisation_options.useNormalisedCrossCorr = 1;
    normalisation_options.zscore = 0;

    % Should invalid pixels be taken into account when normalising (yes in
    % the case of depth and no in the case of colour)
    normalisation_options.ignoreInvalidInMeanStd = 0; % we don't care about invalid pixels at this time (black is valid here) TODO background simulation?
    normalisation_options.setIllegalToPost = 0;

    if(sum(strcmp(varargin,'use_bu')))
        ind = find(strcmp(varargin,'use_bu')) + 1;
        normalisation_options.bu = varargin{ind};
    else
        normalisation_options.bu = 1;
    end

    if(sum(strcmp(varargin,'use_mpie')))
        ind = find(strcmp(varargin,'use_mpie')) + 1;
        normalisation_options.mpie = varargin{ind};
    else
        normalisation_options.mpie = 1;
    end

    if(sum(strcmp(varargin,'use_wild')))
        ind = find(strcmp(varargin,'use_wild')) + 1;
        normalisation_options.wild = varargin{ind};
    else
        normalisation_options.wild = 0;
    end

    normalisation_options.sigma = sigma;

    normalisation_options.rate_negative = ratio_neg;

    % the similarities need to be tested separately (1, 2, 3 and 4) and
    % together: all, vs horizontal/vertical and diagonals, and none of course
    if(any(strcmp(varargin, 'similarity_types')))
        ind = find(strcmp(varargin,'similarity_types')) + 1;
        normalisation_options.similarity_types = varargin{ind};
    else
        normalisation_options.similarity_types = [];
    end

end
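
% Example invocation (a sketch; option names as parsed above, values
% purely illustrative):
%   opts = Parse_settings(1, 10, 5e5, 'normalisation_size', 19);
%   opts.normalisationRegion   % [19 19]
%   opts.rate_negative         % 10
%   opts.numSamples            % 500000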
@@ -0,0 +1,40 @@
clear
% define the root of the prepared database
root = '../data_preparation/prepared_data/';

sigma = 1;
num_samples = 4.5e6; % Making sure all data is used

% which scales we're doing
scales = [0.25,0.35,0.5,1.0];
frontalView = 1;

profileViewInds = [2,3,4];

version = 'menpo_train';
ratio_neg = 10;
norm = 1;

data_loc = 'menpo_train_';
rng(0);

for s=scales

    Collect_all_patches(root, frontalView, profileViewInds,...
        s, sigma, version, 'ratio_neg', ratio_neg,...
        'num_samples', num_samples, 'data_loc', data_loc,...
        'normalisation_size', 19);
end

version = 'menpo_valid';
data_loc = 'menpo_valid_';
rng(0);

for s=scales

    Collect_all_patches(root, frontalView, profileViewInds,...
        s, sigma, version, 'ratio_neg', ratio_neg,...
        'num_samples', num_samples, 'data_loc', data_loc,...
        'normalisation_size', 19);
end
@@ -0,0 +1 @@
Data collection for training CE-CLM on Menpo data.
@@ -0,0 +1,11 @@
The code here prepares the training images and labels into a format that can be easily used to train SVR and CCNF regressors.

Just run "scripts/Prepare_data_wild_all.m" for the data needed to train patch experts for in-the-wild experiments (you have to have the relevant datasets, but they are all available online at http://ibug.doc.ic.ac.uk/resources/facial-point-annotations/).

Run "scripts/Prepare_data_Multi_PIE_all.m" (you have to have the Multi-PIE dataset and labels).

Run "scripts/Prepare_data_general_all.m" (you have to have both of the datasets, and you have to run "scripts/Prepare_data_wild_all.m" and "scripts/Prepare_data_Multi_PIE_all.m" first).

Run "scripts/Prepare_data_menpo_all.m" (you have to have the Menpo challenge training data - https://ibug.doc.ic.ac.uk/resources/2nd-facial-landmark-tracking-competition-menpo-ben/).

The PDM model used is trained on 2D landmark labels using Non-Rigid Structure from Motion (code can be found at http://www.cl.cam.ac.uk/~tb346/res/ccnf/pdm_generation.zip).
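
For reference, the collection scripts later load these prepared files with plain MATLAB load calls, so each per-scale, per-view .mat file is expected to contain at least the following variables (names taken from the loading code; the shapes are a sketch of how the code indexes them):

    all_images            - images stacked as num_images x height x width
    landmark_locations    - num_images x num_landmarks x 2 landmark positions
    actual_imgs_used      - the source image names, kept for bookkeeping
    visiIndex             - per-landmark visibility flags for the view
    centres               - the view orientation, with yaw as the second component

If any of these are missing, the patch-expert training scripts will fail at the corresponding load.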
@@ -0,0 +1,9 @@
function [RotFull] = AddOrthRow(RotSmall)

    % The missing third row of the rotation matrix Rx * Ry * Rz is the
    % cross product of the first two rows, which can be worked out from
    % the 2x3 version (plugging in values is slightly tedious but direct)
    RotFull = zeros(3,3);
    RotFull(1:2, :) = RotSmall;
    RotFull(3,1) = RotSmall(1, 2) * RotSmall(2, 3) - RotSmall(1, 3) * RotSmall(2, 2);
    RotFull(3,2) = RotSmall(1, 3) * RotSmall(2, 1) - RotSmall(1, 1) * RotSmall(2, 3);
    RotFull(3,3) = RotSmall(1, 1) * RotSmall(2, 2) - RotSmall(1, 2) * RotSmall(2, 1);
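
% Quick consistency check (a sketch; relies on Euler2Rot from this folder):
%   R = Euler2Rot([0.1, -0.2, 0.3]);
%   norm(AddOrthRow(R(1:2,:)) - R)   % ~1e-16 for a proper rotation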
@@ -0,0 +1,22 @@
function [ R, T ] = AlignShapesKabsch ( alignFrom, alignTo )
%ALIGNSHAPESKABSCH Find the rigid rotation R and translation T that best
% align the point set alignFrom to alignTo (Kabsch algorithm)

    dims = size(alignFrom, 2);

    alignFromMean = alignFrom - repmat(mean(alignFrom), size(alignFrom,1),1);
    alignToMean = alignTo - repmat(mean(alignTo), size(alignTo,1),1);

    [U, ~, V] = svd( alignFromMean' * alignToMean);

    % make sure there is no reflection
    d = sign(det(V*U'));
    corr = eye(dims);
    corr(end,end) = d;

    R = V*corr*U';

    T = mean(alignTo) - (R * mean(alignFrom)')';
    T = T';
end
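
% Consistency sketch on synthetic data (recovering a known rotation):
%   R_true = Euler2Rot([0.2, 0.1, -0.3]);
%   from = randn(68, 3);
%   to = (R_true * from')';
%   [R, T] = AlignShapesKabsch(from, to);
%   norm(R - R_true)   % close to zero, and T is near zero since no
%                      % translation was applied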
@@ -0,0 +1,30 @@
function [ A, T, error, alignedShape ] = AlignShapesWithScale( alignFrom, alignTo )
%ALIGNSHAPESWITHSCALE Align alignFrom to alignTo with a similarity
% transform (scale, rotation and translation)

    numPoints = size(alignFrom,1);

    meanFrom = mean(alignFrom);
    meanTo = mean(alignTo);

    alignFromMeanNormed = bsxfun(@minus, alignFrom, meanFrom);
    alignToMeanNormed = bsxfun(@minus, alignTo, meanTo);

    % work out the scales now
    sFrom = sqrt(sum(alignFromMeanNormed(:).^2)/numPoints);
    sTo = sqrt(sum(alignToMeanNormed(:).^2)/numPoints);

    s = sTo / sFrom;

    alignFromMeanNormed = alignFromMeanNormed/sFrom;
    alignToMeanNormed = alignToMeanNormed/sTo;

    [R, t] = AlignShapesKabsch(alignFromMeanNormed, alignToMeanNormed);

    A = s * R;
    aligned = (A * alignFrom')';
    T = mean(alignTo - aligned);
    alignedShape = bsxfun(@plus, aligned, T);
    error = mean(sum((alignedShape - alignTo).^2,2));

end
@@ -0,0 +1,11 @@
function [Rot] = AxisAngle2Rot(axisAngle)

    % Rodrigues' rotation formula: the axis-angle vector encodes the
    % rotation axis (its direction) and the angle (its magnitude)
    theta = norm(axisAngle, 2);

    nx = axisAngle / theta;

    nx = [ 0     -nx(3)  nx(2);
           nx(3)  0     -nx(1);
          -nx(2)  nx(1)  0 ];

    Rot = eye(3) + sin(theta) * nx + (1-cos(theta))*nx^2;
@@ -0,0 +1,11 @@
function [Rot] = Euler2Rot(euler)

    rx = euler(1);
    ry = euler(2);
    rz = euler(3);

    Rx = [1 0 0; 0 cos(rx) -sin(rx); 0 sin(rx) cos(rx)];
    Ry = [cos(ry) 0 sin(ry); 0 1 0; -sin(ry) 0 cos(ry)];
    Rz = [cos(rz) -sin(rz) 0; sin(rz) cos(rz) 0; 0 0 1];

    Rot = Rx * Ry * Rz;
@@ -0,0 +1,7 @@
function [shape3D] = GetShape3D(M, V, p)

    shape3D = M + V * p;

    shape3D = reshape(shape3D, numel(shape3D) / 3, 3);

end
@@ -0,0 +1,20 @@
function [shape2D] = GetShapeOrtho(M, V, p, global_params)

    % M - mean shape vector
    % V - eigenvectors
    % p - parameters of the non-rigid shape
    % global_params - [scale; pitch; yaw; roll; tx; ty]

    R = Euler2Rot(global_params(2:4));
    T = [global_params(5:6); 0];
    a = global_params(1);

    shape3D = GetShape3D(M, V, p);

    shape2D = bsxfun(@plus, a * R*shape3D', T);
    shape2D = shape2D';
end
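
% Projection sketch: with unit scale, zero rotation and zero translation
% the output is just the 3-D shape itself (callers keep the x,y columns):
%   p = zeros(size(V,2), 1);
%   pts = GetShapeOrtho(M, V, p, [1; 0; 0; 0; 0; 0]);   % equals GetShape3D(M, V, p)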
@@ -0,0 +1,103 @@
function [normX, normY, meanShape, Transform] = ProcrustesAnalysis(x, y, options)

    % Translate all elements to the origin and scale them to unit size
    normX = zeros(size(x));
    normY = zeros(size(y));

    for i = 1:size(x,1)

        offsetX = mean(x(i,:));
        offsetY = mean(y(i,:));

        Transform.offsetX(i) = offsetX;
        Transform.offsetY(i) = offsetY;

        normX(i,:) = x(i,:) - offsetX;
        normY(i,:) = y(i,:) - offsetY;

        % Get the Frobenius norm, to scale the shapes to unit size
        scale = norm([normX(i,:) normY(i,:)], 'fro');

        Transform.scale(i) = scale;

        normX(i,:) = normX(i,:)/scale;
        normY(i,:) = normY(i,:)/scale;

    end

    % Rotate elements until all of them have the same orientation

    % the initial estimate of rotation is the first element;
    % if the relative change drops below this threshold stop
    % (shouldn't take more than 2 steps)
    change = 0.1;

    meanShape = [ normX(1,:); normY(1,:) ]';

    Transform.Rotation = zeros(size(x,1),1);

    for i = 1:30

        % align all of the shapes to the mean shape

        % remember all orientations to get the mean one
        orientations = zeros(size(normX,1),1);

        for j = 1:size(x,1)

            % do an SVD of mean' * X
            currentShape = [ normX(j,:); normY(j,:) ]';
            [U, ~, V] = svd( meanShape' * currentShape);
            rot = V*U';

            if(asin(rot(2,1)) > 0)
                orientations(j) = real(acos(rot(1,1)));
            else
                orientations(j) = real(-acos(rot(1,1)));
            end

            Transform.Rotation(j) = Transform.Rotation(j) + orientations(j);

            currentShape = currentShape * rot;

            normX(j,:) = currentShape(:,1)';
            normY(j,:) = currentShape(:,2)';

        end

        % recalculate the mean shape
        oldMean = meanShape;
        meanShape = [mean(normX); mean(normY)]';

        % rotate the mean shape to the mean rotation
        meanOrientation = mean(orientations);

        % Do this only the first time
        if(i==1)

            rotM = [ cos(-meanOrientation) -sin(-meanOrientation); sin(-meanOrientation) cos(-meanOrientation) ];
            meanShape = meanShape * rotM;
        end
        % scale the mean shape to unit size
        meanScale = norm(meanShape, 'fro');
        meanShape = meanShape*(1/meanScale);

        % Frobenius norm of the change in the mean shape
        diff = norm(oldMean - meanShape, 'fro');

        if(diff/norm(oldMean,'fro') < change)
            break;
        end

    end

    % transform to tangent space to preserve linearities

    % get the scaling factors for each shape
    if(options.TangentSpaceTransform)
        scaling = [ normX normY ] * [ meanShape(:,1)' meanShape(:,2)']';
        for i=1:size(x,1)
            normX(i,:) = normX(i,:) * (1 / scaling(i));
            normY(i,:) = normY(i,:) * (1 / scaling(i));
        end
    end
@@ -0,0 +1,139 @@
function [ normX, normY, normZ, meanShape, Transform ] = ProcrustesAnalysis3D( x, y, z, tangentSpace, meanShape )
%PROCRUSTESANALYSIS3D Generalised Procrustes analysis of 3-D shapes,
% optionally aligning to a provided mean shape

    meanProvided = false;

    if(nargin > 4)
        meanProvided = true;
    end

    % Translate all elements to the origin
    normX = zeros(size(x));
    normY = zeros(size(y));
    normZ = zeros(size(z));

    for i = 1:size(x,1)

        offsetX = mean(x(i,:));
        offsetY = mean(y(i,:));
        offsetZ = mean(z(i,:));

        Transform.offsetX(i) = offsetX;
        Transform.offsetY(i) = offsetY;
        Transform.offsetZ(i) = offsetZ;

        normX(i,:) = x(i,:) - offsetX;
        normY(i,:) = y(i,:) - offsetY;
        normZ(i,:) = z(i,:) - offsetZ;

    end

    % Rotate elements until all of them have the same orientation

    % the initial estimate of rotation is the first element;
    % if the relative change drops below this threshold stop
    % (shouldn't take more than 2 steps)
    change = 0.1;

    if(~meanProvided)
        meanShape = [ mean(normX); mean(normY); mean(normZ) ]';
    end
    % scale all the shapes to the mean shape

    % Get the Frobenius norm, to scale the shapes to the mean size (we
    % still want to retain mm)
    meanScale = norm(meanShape, 'fro');

    for i = 1:size(x,1)

        scale = norm([normX(i,:) normY(i,:) normZ(i,:)], 'fro')/meanScale;

        normX(i,:) = normX(i,:)/scale;
        normY(i,:) = normY(i,:)/scale;
        normZ(i,:) = normZ(i,:)/scale;

    end

    Transform.RotationX = zeros(size(x,1),1);
    Transform.RotationY = zeros(size(x,1),1);
    Transform.RotationZ = zeros(size(x,1),1);

    for i = 1:30

        % align all of the shapes to the mean shape

        % remember all orientations to get the mean one (in euler angle form: pitch, yaw, roll)
        orientationsX = zeros(size(normX,1),1);
        orientationsY = zeros(size(normX,1),1);
        orientationsZ = zeros(size(normX,1),1);

        for j = 1:size(x,1)

            currentShape = [normX(j,:); normY(j,:); normZ(j,:)]';
            % we want to align the current shape to the mean one
            [ R, T ] = AlignShapesKabsch(currentShape, meanShape);

            eulers = Rot2Euler(R);

            orientationsX(j) = eulers(1);
            orientationsY(j) = eulers(2);
            orientationsZ(j) = eulers(3);

            Transform.RotationX(j) = eulers(1);
            Transform.RotationY(j) = eulers(2);
            Transform.RotationZ(j) = eulers(3);

            currentShape = R * currentShape';

            normX(j,:) = currentShape(1,:);
            normY(j,:) = currentShape(2,:);
            normZ(j,:) = currentShape(3,:);

        end

        % recalculate the mean shape
        % if(~meanProvided)
        oldMean = meanShape;
        meanShape = [mean(normX); mean(normY); mean(normZ)]';
        meanScale = norm(meanShape, 'fro');
        % end

        for j = 1:size(x,1)

            scale = norm([normX(j,:) normY(j,:) normZ(j,:)], 'fro')/meanScale;

            normX(j,:) = normX(j,:)/scale;
            normY(j,:) = normY(j,:)/scale;
            normZ(j,:) = normZ(j,:)/scale;

        end

        if(i==1 && ~meanProvided)

            % rotate the mean shape to the mean rotation
            meanOrientationX = mean(orientationsX);
            meanOrientationY = mean(orientationsY);
            meanOrientationZ = mean(orientationsZ);

            R = Euler2Rot([meanOrientationX, meanOrientationY, meanOrientationZ]);
            meanShape = (R * meanShape')';
        end

        % Frobenius norm of the change in the mean shape
        diff = norm(oldMean - meanShape, 'fro');

        if(diff/norm(oldMean,'fro') < change)
            break;
        end

    end

    % transform to tangent space to preserve linearities

    % get the scaling factors for each shape
    if(tangentSpace)
        [ normX, normY, normZ] = TangentSpaceTransform(normX, normY, normZ, meanShape);
    end

end
@@ -0,0 +1,11 @@
function [ axisAngle ] = Rot2AxisAngle( Rot )
%ROT2AXISANGLE Convert a rotation matrix to an axis-angle vector (the
% unit rotation axis scaled by the rotation angle)

    theta = acos((trace(Rot) - 1) / 2);

    vec = 1.0/(2*sin(theta));
    vec = vec * [Rot(3,2) - Rot(2,3), Rot(1,3) - Rot(3,1), Rot(2,1) - Rot(1,2)];
    axisAngle = vec * theta;
end
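
% Round-trip sketch with AxisAngle2Rot (valid away from theta = 0 or pi,
% where the 1/(2*sin(theta)) factor degenerates):
%   aa = [0.1; -0.2; 0.25];
%   norm(Rot2AxisAngle(AxisAngle2Rot(aa)) - aa')   % ~1e-16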
@@ -0,0 +1,12 @@
function [euler] = Rot2Euler(R)

    q0 = sqrt( 1 + R(1,1) + R(2,2) + R(3,3) ) / 2;
    q1 = (R(3,2) - R(2,3)) / (4*q0);
    q2 = (R(1,3) - R(3,1)) / (4*q0);
    q3 = (R(2,1) - R(1,2)) / (4*q0);

    yaw = asin(2*(q0*q2 + q1*q3));
    pitch = atan2(2*(q0*q1-q2*q3), q0*q0-q1*q1-q2*q2+q3*q3);
    roll = atan2(2*(q0*q3-q1*q2), q0*q0+q1*q1-q2*q2-q3*q3);

    euler = [pitch, yaw, roll];
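
% Consistency sketch with Euler2Rot (the quaternion extraction above
% matches the Rx*Ry*Rz convention, within the asin range of yaw):
%   e = [0.1, -0.3, 0.2];                 % [pitch, yaw, roll]
%   norm(Rot2Euler(Euler2Rot(e)) - e)     % ~1e-16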
@@ -0,0 +1,17 @@
function [ transformedX, transformedY, transformedZ ] = TangentSpaceTransform( x, y, z, meanShape )
%TANGENTSPACETRANSFORM Project the shapes into the tangent space of the
% mean shape by rescaling each one by its projection onto the mean

    scaling = [ x y z] * [ meanShape(:,1)' meanShape(:,2)' meanShape(:,3)']';
    for i=1:size(x,1)
        x(i,:) = x(i,:) * (1 / scaling(i));
        y(i,:) = y(i,:) * (1 / scaling(i));
        z(i,:) = z(i,:) * (1 / scaling(i));
    end

    transformedX = x * mean(scaling);
    transformedY = y * mean(scaling);
    transformedZ = z * mean(scaling);

end
@@ -0,0 +1,345 @@
function [ a, R, T, T3D, params, error, shapeOrtho ] = fit_PDM_ortho_proj_to_2D( M, E, V, shape2D, f, cx, cy)
|
||||||
|
%FITPDMTO2DSHAPE Summary of this function goes here
|
||||||
|
% Detailed explanation goes here
|
||||||
|
|
||||||
|
params = zeros(size(E));
|
||||||
|
|
||||||
|
hidden = false;
|
||||||
|
|
||||||
|
% if some of the points are unavailable modify M, V, and shape2D (can
|
||||||
|
% later infer the actual shape from this)
|
||||||
|
if(sum(shape2D(:)==0) > 0)
|
||||||
|
|
||||||
|
hidden = true;
|
||||||
|
% which indices to remove
|
||||||
|
inds_to_rem = shape2D(:,1) == 0 | shape2D(:,2) == 0;
|
||||||
|
|
||||||
|
shape2D = shape2D(~inds_to_rem,:);
|
||||||
|
|
||||||
|
inds_to_rem = repmat(inds_to_rem, 3, 1);
|
||||||
|
|
||||||
|
M_old = M;
|
||||||
|
V_old = V;
|
||||||
|
|
||||||
|
M = M(~inds_to_rem);
|
||||||
|
V = V(~inds_to_rem,:);
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
num_points = numel(M) / 3;
|
||||||
|
|
||||||
|
m = reshape(M, num_points, 3)';
|
||||||
|
width_model = max(m(1,:)) - min(m(1,:));
|
||||||
|
height_model = max(m(2,:)) - min(m(2,:));
|
||||||
|
|
||||||
|
bounding_box = [min(shape2D(:,1)), min(shape2D(:,2)),...
|
||||||
|
max(shape2D(:,1)), max(shape2D(:,2))];
|
||||||
|
|
||||||
|
a = (((bounding_box(3) - bounding_box(1)) / width_model) + ((bounding_box(4) - bounding_box(2))/ height_model)) / 2;
|
||||||
|
|
||||||
|
tx = (bounding_box(3) + bounding_box(1))/2;
|
||||||
|
ty = (bounding_box(4) + bounding_box(2))/2;
|
||||||
|
|
||||||
|
% correct it so that the bounding box is just around the minimum
|
||||||
|
% and maximum point in the initialised face
|
||||||
|
tx = tx - a*(min(m(1,:)) + max(m(1,:)))/2;
|
||||||
|
ty = ty - a*(min(m(2,:)) + max(m(2,:)))/2;
|
||||||
|
|
||||||
|
R = eye(3);
|
||||||
|
T = [tx; ty];
|
||||||
|
|
||||||
|
currShape = getShapeOrtho(M, V, params, R, T, a);
|
||||||
|
|
||||||
|
currError = getRMSerror(currShape, shape2D);
|
||||||
|
|
||||||
|
reg_rigid = zeros(6,1);
|
||||||
|
regFactor = 20;
|
||||||
|
regularisations = [reg_rigid; regFactor ./ E]; % the above version, however, does not perform as well
|
||||||
|
regularisations = diag(regularisations)*diag(regularisations);
|
||||||
|
|
||||||
|
red_in_a_row = 0;
|
||||||
|
|
||||||
|
for i=1:1000
|
||||||
|
|
||||||
|
shape3D = M + V * params;
|
||||||
|
shape3D = reshape(shape3D, numel(shape3D) / 3, 3);
|
||||||
|
|
||||||
|
% Now find the current residual error
|
||||||
|
currShape = a * R(1:2,:)*shape3D' + repmat(T, 1, numel(M)/3);
|
||||||
|
currShape = currShape';
|
||||||
|
|
||||||
|
error_res = shape2D - currShape;
|
||||||
|
|
||||||
|
eul = Rot2Euler(R);
|
||||||
|
|
||||||
|
p_global = [a; eul'; T];
|
||||||
|
|
||||||
|
% get the Jacobians
|
||||||
|
J = CalcJacobian(M, V, params, p_global);
|
||||||
|
|
||||||
|
% RLMS style update
|
||||||
|
p_delta = (J'*J + regularisations) \ (J'*error_res(:) - regularisations*[p_global;params]);
|
||||||
|
|
||||||
|
[params, p_global] = CalcReferenceUpdate(p_delta, params, p_global);
|
||||||
|
|
||||||
|
a = p_global(1);
|
||||||
|
R = Euler2Rot(p_global(2:4));
|
||||||
|
T = p_global(5:6);
|
||||||
|
|
||||||
|
shape3D = M + V * params;
|
||||||
|
shape3D = reshape(shape3D, numel(shape3D) / 3, 3);
|
||||||
|
currShape = a * R(1:2,:)*shape3D' + repmat(T, 1, numel(M)/3);
|
||||||
|
currShape = currShape';
|
||||||
|
|
||||||
|
error = getRMSerror(currShape, shape2D);
|
||||||
|
|
||||||
|
if(0.999 * currError < error)
|
||||||
|
red_in_a_row = red_in_a_row + 1;
|
||||||
|
if(red_in_a_row == 5)
|
||||||
|
break;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
currError = error;
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
if(hidden)
|
||||||
|
shapeOrtho = getShapeOrtho(M_old, V_old, params, R, T, a);
|
||||||
|
else
|
||||||
|
shapeOrtho = currShape;
|
||||||
|
end
|
||||||
|
if(nargin == 7)
|
||||||
|
|
||||||
|
Zavg = f / a;
|
||||||
|
Xavg = (T(1) - cx) / a;
|
||||||
|
Yavg = (T(2) - cy) / a;
|
||||||
|
|
||||||
|
T3D = [Xavg;Yavg;Zavg];
|
||||||
|
else
|
||||||
|
T3D = [0;0;0];
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
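An illustrative call with a toy model (all dimensions and camera parameters below are assumptions for the sketch, not values shipped with the commit). A PDM of n points has a 3n x 1 mean M, a 3n x k basis V, and k x 1 eigenvalues E; rows of zeros in shape2D mark occluded points:

n = 68; k = 5;
M = randn(3*n, 1);
V = orth(randn(3*n, k));    % orthonormal basis for the toy model
E = ones(k, 1);
m = reshape(M, n, 3);
shape2D = m(:, 1:2);        % a trivially consistent 2D target
[a, R, T, T3D, p, err2D, shapeOrtho] = ...
    fit_PDM_ortho_proj_to_2D(M, E, V, shape2D, 500, 320, 240);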
function [shape2D] = getShapeOrtho(M, V, p, R, T, a)
% M - mean shape vector
% V - eigenvectors
% p - parameters of non-rigid shape
% R - rotation matrix
% T - translation vector (tx, ty)
% a - scaling factor
    shape3D = getShape3D(M, V, p);
    shape2D = a * R(1:2,:)*shape3D' + repmat(T, 1, numel(M)/3);
    shape2D = shape2D';
end

function [shape2D] = getShapeOrthoFull(M, V, p, R, T, a)
% Same as getShapeOrtho, but also keeps the rotated z coordinate
% M - mean shape vector
% V - eigenvectors
% p - parameters of non-rigid shape
% R - rotation matrix
% T - translation vector (tx, ty)
% a - scaling factor
    T = [T; 0];
    shape3D = getShape3D(M, V, p);
    shape2D = a * R*shape3D' + repmat(T, 1, numel(M)/3);
    shape2D = shape2D';
end

function [shape3D] = getShape3D(M, V, params)
    shape3D = M + V * params;
    shape3D = reshape(shape3D, numel(shape3D) / 3, 3);
end

function [error] = getRMSerror(shape2Dv1, shape2Dv2)
    error = sqrt(mean(reshape(shape2Dv1 - shape2Dv2, numel(shape2Dv1), 1).^2));
end
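All of these helpers revolve around the weak-perspective (scaled orthographic) projection x2D = a * R(1:2,:) * X3D' + T; a standalone sketch of that relation:

X = randn(5, 3);                   % five hypothetical 3D points
a = 2.0; R = eye(3); T = [10; 20];
x2D = (a * R(1:2,:) * X' + repmat(T, 1, 5))';   % 5 x 2 projected points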
% This calculates the combined rigid and non-rigid Jacobian
function J = CalcJacobian(M, V, p, p_global)

    n = size(M, 1)/3;

    non_rigid_modes = size(V,2);

    J = zeros(n*2, 6 + non_rigid_modes);

    % the layout is
    % ---------- Rigid part -------------------|----Non rigid part--------|
    % dx_1/ds, dx_1/dr1, ... dx_1/dtx, dx_1/dty dx_1/dp_1 ... dx_1/dp_m
    % dx_2/ds, dx_2/dr1, ... dx_2/dtx, dx_2/dty dx_2/dp_1 ... dx_2/dp_m
    % ...
    % dx_n/ds, dx_n/dr1, ... dx_n/dtx, dx_n/dty dx_n/dp_1 ... dx_n/dp_m
    % dy_1/ds, dy_1/dr1, ... dy_1/dtx, dy_1/dty dy_1/dp_1 ... dy_1/dp_m
    % ...
    % dy_n/ds, dy_n/dr1, ... dy_n/dtx, dy_n/dty dy_n/dp_1 ... dy_n/dp_m

    % getting the rigid part
    J(:,1:6) = CalcRigidJacobian(M, V, p, p_global);

    % constructing the non-rigid part
    R = Euler2Rot(p_global(2:4));
    s = p_global(1);

    % 'rotate' and 'scale' the principal components

    % First reshape to 3D
    V_X = V(1:n,:);
    V_Y = V(n+1:2*n,:);
    V_Z = V(2*n+1:end,:);

    J_x_non_rigid = s*(R(1,1)*V_X + R(1,2)*V_Y + R(1,3)*V_Z);
    J_y_non_rigid = s*(R(2,1)*V_X + R(2,2)*V_Y + R(2,3)*V_Z);

    J(1:n, 7:end) = J_x_non_rigid;
    J(n+1:end, 7:end) = J_y_non_rigid;

end
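The Jacobian layout above can be sanity-checked numerically. The sketch below is illustrative (CalcJacobian is a local function, so it has to be run from inside this file or after copying the helpers out); it compares the analytic Jacobian against central differences. The rotation columns are derivatives with respect to an incremental axis-angle w composed as R * expm([w]x), matching the update in CalcReferenceUpdate below.

n = 10; k = 3;
M = randn(3*n, 1); V = orth(randn(3*n, k));
p = 0.1 * randn(k, 1);
p_global = [1.5; 0.1; -0.2; 0.3; 5; -7];   % [s; pitch; yaw; roll; tx; ty]

J = CalcJacobian(M, V, p, p_global);

skew = @(w) [0, -w(3), w(2); w(3), 0, -w(1); -w(2), w(1), 0];
% stacked projection [x_1..x_n, y_1..y_n]', as used by the RLMS update
proj = @(s, R, t, pnr) reshape((s * R(1:2,:) * ...
        reshape(M + V*pnr, n, 3)' + repmat(t, 1, n))', [], 1);

R0 = Euler2Rot(p_global(2:4));
f = @(d) proj(p_global(1) + d(1), R0 * expm(skew(d(2:4))), ...
              p_global(5:6) + d(5:6), p + d(7:end));

eps_fd = 1e-6;
J_fd = zeros(2*n, 6 + k);
for j = 1:(6 + k)
    d = zeros(6 + k, 1); d(j) = eps_fd;
    J_fd(:, j) = (f(d) - f(-d)) / (2 * eps_fd);
end
max(abs(J(:) - J_fd(:)))   % should be small (on the order of 1e-8)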
function J = CalcRigidJacobian(M, V, p, p_global)

    n = size(M, 1)/3;

    % Get the current 3D shape (not affected by global transform, as this
    % is how the Jacobian was derived; for the derivation please see
    % ../derivations/orthoJacobian)
    shape3D = getShape3D(M, V, p);

    % Get the rotation matrix corresponding to current global orientation
    R = Euler2Rot(p_global(2:4));
    s = p_global(1);

    % Rigid Jacobian is laid out as follows
    % dx_1/ds, dx_1/dr1, dx_1/dr2, dx_1/dr3, dx_1/dtx, dx_1/dty
    % dx_2/ds, dx_2/dr1, dx_2/dr2, dx_2/dr3, dx_2/dtx, dx_2/dty
    % ...
    % dx_n/ds, dx_n/dr1, dx_n/dr2, dx_n/dr3, dx_n/dtx, dx_n/dty
    % dy_1/ds, dy_1/dr1, dy_1/dr2, dy_1/dr3, dy_1/dtx, dy_1/dty
    % ...
    % dy_n/ds, dy_n/dr1, dy_n/dr2, dy_n/dr3, dy_n/dtx, dy_n/dty

    J = zeros(n*2, 6);

    % dx/ds = X * r11 + Y * r12 + Z * r13
    % dx/dr1 = s*(r13 * Y - r12 * Z)
    % dx/dr2 = -s*(r13 * X - r11 * Z)
    % dx/dr3 = s*(r12 * X - r11 * Y)
    % dx/dtx = 1
    % dx/dty = 0

    % dy/ds = X * r21 + Y * r22 + Z * r23
    % dy/dr1 = s * (r23 * Y - r22 * Z)
    % dy/dr2 = -s * (r23 * X - r21 * Z)
    % dy/dr3 = s * (r22 * X - r21 * Y)
    % dy/dtx = 0
    % dy/dty = 1

    % set the Jacobian for x's

    % with respect to scaling factor
    J(1:n,1) = shape3D * R(1,:)';

    % with respect to angular rotation around x, y, and z axes

    % Change of x with respect to change in axis angle rotation
    dxdR = [ 0,       R(1,3), -R(1,2);
            -R(1,3),  0,       R(1,1);
             R(1,2), -R(1,1),  0];

    J(1:n,2:4) = s*(dxdR * shape3D')';

    % with respect to translation
    J(1:n,5) = 1;
    J(1:n,6) = 0;

    % set the Jacobian for y's

    % with respect to scaling factor
    J(n+1:end,1) = shape3D * R(2,:)';

    % with respect to angular rotation around x, y, and z axes

    % Change of y with respect to change in axis angle rotation
    dydR = [ 0,       R(2,3), -R(2,2);
            -R(2,3),  0,       R(2,1);
             R(2,2), -R(2,1),  0];

    J(n+1:end,2:4) = s*(dydR * shape3D')';

    % with respect to translation
    J(n+1:end,5) = 0;
    J(n+1:end,6) = 1;

end
% This updates the parameters based on the updates from the RLMS
function [non_rigid, rigid] = CalcReferenceUpdate(params_delta, current_non_rigid, current_global)

    rigid = zeros(6, 1);
    % Scaling and translation parameters can simply be updated additively
    rigid(1) = current_global(1) + params_delta(1);
    rigid(5) = current_global(5) + params_delta(5);
    rigid(6) = current_global(6) + params_delta(6);

    % for rotation, however, we want to make sure that the rotation matrix
    % approximation we have
    % R' = [1, -wz, wy
    %       wz, 1, -wx
    %      -wy, wx, 1]
    % is a legal rotation matrix, and then we combine it with the current
    % rotation (through matrix multiplication) to acquire the new rotation

    R = Euler2Rot(current_global(2:4));

    wx = params_delta(2);
    wy = params_delta(3);
    wz = params_delta(4);

    R_delta = [1, -wz, wy;
               wz, 1, -wx;
              -wy, wx, 1];

    % Make sure R_delta is orthonormal
    R_delta = OrthonormaliseRotation(R_delta);

    % Combine rotations
    R_final = R * R_delta;

    % Extract euler angles
    euler = Rot2Euler(R_final);

    rigid(2:4) = euler;

    if(length(params_delta) > 6)
        % non-rigid parameters can just be added together
        non_rigid = params_delta(7:end) + current_non_rigid;
    else
        non_rigid = current_non_rigid;
    end

end
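A small illustration of the update rule with hypothetical values (again, local to this file): a first-order rotation increment is composed with the current rotation and re-extracted as Euler angles:

current_global = [1; 0.1; -0.2; 0.3; 5; -7];               % [s; pitch; yaw; roll; tx; ty]
params_delta   = [0.01; 0.002; -0.001; 0.003; 0.5; -0.25]; % [ds; wx; wy; wz; dtx; dty]
[~, new_global] = CalcReferenceUpdate(params_delta, [], current_global);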
function R_ortho = OrthonormaliseRotation(R)

    % U * V' is basically what we want, as it's guaranteed to be
    % orthonormal
    [U, ~, V] = svd(R);

    % We also want to make sure no reflection happened

    % get the orthogonal matrix from the initial rotation matrix
    X = U*V';

    % This makes sure that the handedness is preserved and no reflection happened
    % by making sure the determinant is 1 and not -1
    W = eye(3);
    W(3,3) = det(X);
    R_ortho = U*W*V';
end
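Standalone illustration of the SVD projection: perturb a rotation until it is no longer orthonormal, then project it back; the W matrix guards against a reflection (determinant -1). Assumes Euler2Rot is on the path.

R = Euler2Rot([0.1, -0.2, 0.3]);
R_noisy = R + 1e-2 * randn(3);
[U, ~, V] = svd(R_noisy);
W = eye(3); W(3,3) = det(U*V');
R_proj = U*W*V';
disp(norm(R_proj'*R_proj - eye(3)));   % close to machine precision
disp(det(R_proj));                     % +1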
@@ -0,0 +1,32 @@
function [ pts_new ] = iterate_piece_wise( pts_orig, new_num_points )
%ITERATE_PIECE_WISE Resample a piecewise-linear 2D curve to a new number of points
%   The new points are linearly interpolated so that they still lie on the
%   original line segments

    % Reinterpolate the new points, but make sure they are still on the
    % same lines
    num_orig = size(pts_orig,1);

    % Divide the number of original segments by the number of new segments
    step_size = (num_orig - 1) / (new_num_points-1);

    pts_new = zeros(new_num_points,2);

    % Clamp the beginning and end, as they will be the same
    pts_new(1,:) = pts_orig(1,:);
    pts_new(end,:) = pts_orig(end,:);

    for i=1:new_num_points-2

        low_point = floor(1 + i * step_size);
        high_point = ceil(1 + i * step_size);

        % linear interpolation weights between the two bracketing points
        coeff_1 = floor(1 + i * step_size) - i * step_size;
        coeff_2 = 1 - coeff_1;

        new_pt = coeff_1 * pts_orig(low_point,:) + coeff_2 * pts_orig(high_point,:);
        pts_new(i+1,:) = new_pt;

    end

end
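Usage sketch: resampling a 5-point polyline down to 3 points keeps the endpoints and interpolates the interior:

pts = [0 0; 1 0; 2 0; 3 0; 4 0];
pts_new = iterate_piece_wise(pts, 3);   % expected: [0 0; 2 0; 4 0]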
Some files were not shown because too many files have changed in this diff.