mirror of
https://gitcode.com/gh_mirrors/ope/OpenFace.git
synced 2026-05-15 11:47:50 +00:00
516 lines
19 KiB
C++
516 lines
19 KiB
C++
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#ifndef DLIB_MODULARITY_ClUSTERING__H__
|
|
#define DLIB_MODULARITY_ClUSTERING__H__
|
|
|
|
#include "modularity_clustering_abstract.h"
|
|
#include "../sparse_vector.h"
|
|
#include "../graph_utils/edge_list_graphs.h"
|
|
#include "../matrix.h"
|
|
#include "../rand.h"
|
|
|
|
namespace dlib
|
|
{
|
|
|
|
// -----------------------------------------------------------------------------------------
|
|
|
|
namespace impl
|
|
{
|
|
        inline double newman_cluster_split (
            dlib::rand& rnd,
            const std::vector<ordered_sample_pair>& edges,
            const matrix<double,0,1>& node_degrees, // k from the Newman paper
            const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper
            const double& edge_sum, // m from the Newman paper
            matrix<double,0,1>& labels,
            const double eps,
            const unsigned long max_iterations
        )
        /*!
            requires
                - node_degrees.size() == max_index_plus_one(edges)
                - Bdiag.size() == max_index_plus_one(edges)
                - edges must be sorted according to order_by_index()
            ensures
                - This routine splits a graph into two subgraphs using the Newman
                  clustering method.
                - returns the modularity obtained when the graph is split according
                  to the contents of #labels.
                - #labels.size() == node_degrees.size()
                - for all valid i: #labels(i) == -1 or +1
                - if (this function returns 0) then
                    - all the labels are equal, i.e. the graph is not split.
        !*/
        {
            // The modularity matrix from the Newman paper is B = A - k*k'/(2*m), where
            // A is the weighted adjacency matrix defined by edges.  B is never formed
            // explicitly; every product B*v below is computed as A*v - k*dot(k,v)/(2*m).
            // The split is the sign pattern of B's dominant eigenvector, found by power
            // iteration, followed by a greedy label-flipping refinement pass.

            // Scale epsilon so that it is relative to the expected value of an element of a
            // unit vector of length node_degrees.size().
            const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size());

            // Make a random unit vector and put in labels.
            labels.set_size(node_degrees.size());
            for (long i = 0; i < labels.size(); ++i)
                labels(i) = rnd.get_random_gaussian();
            labels /= length(labels);

            matrix<double,0,1> Bv, Bv_unit;

            // Do the power iteration for a while.
            double eig = -1;
            double offset = 0;
            while (eig < 0)
            {
                // any number larger than power_iter_eps
                double iteration_change = power_iter_eps*2+1;
                for (unsigned long i = 0; i < max_iterations && iteration_change > power_iter_eps; ++i)
                {
                    // Bv = B*labels, computed without materializing B.
                    sparse_matrix_vector_multiply(edges, labels, Bv);
                    Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;

                    // If a previous pass converged to a negative eigenvalue then iterate
                    // on the shifted matrix B - offset*I instead.  offset holds that
                    // negative estimate, so the shift pushes the whole spectrum up and
                    // lets the power iteration lock onto the algebraically largest
                    // eigenvalue of B.
                    if (offset != 0)
                    {
                        Bv -= offset*labels;
                    }

                    const double len = length(Bv);
                    if (len != 0)
                    {
                        Bv_unit = Bv/len;
                        // Converged when no element of the unit eigenvector estimate
                        // moved more than power_iter_eps.
                        iteration_change = max(abs(labels-Bv_unit));
                        labels.swap(Bv_unit);
                    }
                    else
                    {
                        // Had a bad time, pick another random vector and try it with the
                        // power iteration.
                        for (long i = 0; i < labels.size(); ++i)
                            labels(i) = rnd.get_random_gaussian();
                    }
                }

                // Rayleigh quotient estimate of the eigenvalue for the converged vector.
                eig = dot(Bv,labels);
                // we will repeat this loop if the largest eigenvalue is negative
                offset = eig;
            }

            // Quantize the eigenvector into a two-way split: the sign of each element
            // assigns its node to one side or the other.
            for (long i = 0; i < labels.size(); ++i)
            {
                if (labels(i) > 0)
                    labels(i) = 1;
                else
                    labels(i) = -1;
            }

            // compute B*labels, store result in Bv.
            sparse_matrix_vector_multiply(edges, labels, Bv);
            Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;

            // Do some label refinement.  In this step we swap labels if it
            // improves the modularity score.
            bool flipped_label = true;
            while(flipped_label)
            {
                flipped_label = false;
                unsigned long idx = 0;
                for (long i = 0; i < labels.size(); ++i)
                {
                    // val is the change in labels(i) if node i were flipped.
                    const double val = -2*labels(i);
                    // Change in labels'*B*labels caused by flipping node i, computed
                    // incrementally from the cached Bv instead of recomputing B*labels.
                    const double increase = 4*Bdiag(i) + 2*val*Bv(i);

                    // if there is an increase in modularity for swapping this label
                    if (increase > 0)
                    {
                        labels(i) *= -1;
                        // Update Bv for the adjacency (A) part of B.  This relies on
                        // edges being sorted by index1() so all of node i's out-edges
                        // sit contiguously starting at idx.
                        while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
                        {
                            const long j = edges[idx].index2();
                            Bv(j) += val*edges[idx].distance();
                            ++idx;
                        }

                        // Update Bv for the -k*k'/(2*m) part of B.
                        Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees;

                        flipped_label = true;
                    }
                    else
                    {
                        // Skip over node i's edges so idx stays in sync with i.
                        while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
                        {
                            ++idx;
                        }
                    }
                }
            }

            // modularity = labels'*B*labels/(4*m)
            const double modularity = dot(Bv, labels)/(4*edge_sum);

            return modularity;
        }
|
|
|
|
// -------------------------------------------------------------------------------------
|
|
|
|
        inline unsigned long newman_cluster_helper (
            dlib::rand& rnd,
            const std::vector<ordered_sample_pair>& edges,
            const matrix<double,0,1>& node_degrees, // k from the Newman paper
            const matrix<double,0,1>& Bdiag, // diag(B) from the Newman paper
            const double& edge_sum, // m from the Newman paper
            std::vector<unsigned long>& labels,
            double modularity_threshold,
            const double eps,
            const unsigned long max_iterations
        )
        /*!
            ensures
                - returns the number of clusters the data was split into
        !*/
        {
            // Recursive bisection: split the nodes in two with newman_cluster_split(),
            // then recurse on each half.  Recursion stops when the achieved modularity
            // does not beat modularity_threshold or when one side of the split is empty,
            // in which case all nodes are assigned to a single cluster.

            matrix<double,0,1> l;
            const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations);

            // We need to collapse the node index values down to contiguous values.  So
            // we use the following two vectors to contain the mappings from input index
            // values to their corresponding index values in each split.
            std::vector<unsigned long> left_idx_map(node_degrees.size());
            std::vector<unsigned long> right_idx_map(node_degrees.size());

            // figure out how many nodes went into each side of the split.
            unsigned long num_left_split = 0;
            unsigned long num_right_split = 0;
            for (long i = 0; i < l.size(); ++i)
            {
                if (l(i) > 0)
                {
                    left_idx_map[i] = num_left_split;
                    ++num_left_split;
                }
                else
                {
                    right_idx_map[i] = num_right_split;
                    ++num_right_split;
                }
            }

            // do a recursive split if it will improve the modularity.
            if (modularity > modularity_threshold && num_left_split > 0 && num_right_split > 0)
            {
                // split the node_degrees and Bdiag matrices into left and right split parts
                matrix<double,0,1> left_node_degrees(num_left_split);
                matrix<double,0,1> right_node_degrees(num_right_split);
                matrix<double,0,1> left_Bdiag(num_left_split);
                matrix<double,0,1> right_Bdiag(num_right_split);
                for (long i = 0; i < l.size(); ++i)
                {
                    if (l(i) > 0)
                    {
                        left_node_degrees(left_idx_map[i]) = node_degrees(i);
                        left_Bdiag(left_idx_map[i]) = Bdiag(i);
                    }
                    else
                    {
                        right_node_degrees(right_idx_map[i]) = node_degrees(i);
                        right_Bdiag(right_idx_map[i]) = Bdiag(i);
                    }
                }

                // put the edges from one side of the split into split_edges
                std::vector<ordered_sample_pair> split_edges;
                modularity_threshold = 0;
                for (unsigned long k = 0; k < edges.size(); ++k)
                {
                    const unsigned long i = edges[k].index1();
                    const unsigned long j = edges[k].index2();
                    const double d = edges[k].distance();
                    // keep only edges internal to the left split, remapped to the
                    // compacted left indices.
                    if (l(i) > 0 && l(j) > 0)
                    {
                        split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d));
                        modularity_threshold += d;
                    }
                }
                // Threshold for the recursion:
                // (internal edge weight - (total left degree)^2/(2*m)) / (4*m),
                // i.e. the score the left split already earns as a single cluster.  A
                // further split must exceed this to be accepted.
                modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum);
                modularity_threshold /= 4*edge_sum;

                unsigned long num_left_clusters;
                std::vector<unsigned long> left_labels;
                num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag,
                                                          edge_sum,left_labels,modularity_threshold,
                                                          eps, max_iterations);

                // now load the other side into split_edges and cluster it as well
                split_edges.clear();
                modularity_threshold = 0;
                for (unsigned long k = 0; k < edges.size(); ++k)
                {
                    const unsigned long i = edges[k].index1();
                    const unsigned long j = edges[k].index2();
                    const double d = edges[k].distance();
                    if (l(i) < 0 && l(j) < 0)
                    {
                        split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d));
                        modularity_threshold += d;
                    }
                }
                // same threshold computation as above, for the right split.
                modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum);
                modularity_threshold /= 4*edge_sum;

                unsigned long num_right_clusters;
                std::vector<unsigned long> right_labels;
                num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag,
                                                           edge_sum,right_labels,modularity_threshold,
                                                           eps, max_iterations);

                // Now merge the labels from the two splits.  Right-side cluster ids are
                // offset by num_left_clusters so the combined ids stay distinct.
                labels.resize(node_degrees.size());
                for (unsigned long i = 0; i < labels.size(); ++i)
                {
                    // if this node was in the left split
                    if (l(i) > 0)
                    {
                        labels[i] = left_labels[left_idx_map[i]];
                    }
                    else // if this node was in the right split
                    {
                        labels[i] = right_labels[right_idx_map[i]] + num_left_clusters;
                    }
                }

                return num_left_clusters + num_right_clusters;
            }
            else
            {
                // No worthwhile split: everything is one cluster.
                labels.assign(node_degrees.size(),0);
                return 1;
            }
        }
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
inline unsigned long newman_cluster (
|
|
const std::vector<ordered_sample_pair>& edges,
|
|
std::vector<unsigned long>& labels,
|
|
const double eps = 1e-4,
|
|
const unsigned long max_iterations = 2000
|
|
)
|
|
{
|
|
// make sure requires clause is not broken
|
|
DLIB_ASSERT(is_ordered_by_index(edges),
|
|
"\t unsigned long newman_cluster()"
|
|
<< "\n\t Invalid inputs were given to this function"
|
|
);
|
|
|
|
labels.clear();
|
|
if (edges.size() == 0)
|
|
return 0;
|
|
|
|
const unsigned long num_nodes = max_index_plus_one(edges);
|
|
|
|
// compute the node_degrees vector, edge_sum value, and diag(B).
|
|
matrix<double,0,1> node_degrees(num_nodes);
|
|
matrix<double,0,1> Bdiag(num_nodes);
|
|
Bdiag = 0;
|
|
double edge_sum = 0;
|
|
node_degrees = 0;
|
|
for (unsigned long i = 0; i < edges.size(); ++i)
|
|
{
|
|
node_degrees(edges[i].index1()) += edges[i].distance();
|
|
edge_sum += edges[i].distance();
|
|
if (edges[i].index1() == edges[i].index2())
|
|
Bdiag(edges[i].index1()) += edges[i].distance();
|
|
}
|
|
edge_sum /= 2;
|
|
Bdiag -= squared(node_degrees)/(2*edge_sum);
|
|
|
|
|
|
dlib::rand rnd;
|
|
return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations);
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
inline unsigned long newman_cluster (
|
|
const std::vector<sample_pair>& edges,
|
|
std::vector<unsigned long>& labels,
|
|
const double eps = 1e-4,
|
|
const unsigned long max_iterations = 2000
|
|
)
|
|
{
|
|
std::vector<ordered_sample_pair> oedges;
|
|
convert_unordered_to_ordered(edges, oedges);
|
|
std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);
|
|
|
|
return newman_cluster(oedges, labels, eps, max_iterations);
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
namespace impl
|
|
{
|
|
inline std::vector<unsigned long> remap_labels (
|
|
const std::vector<unsigned long>& labels,
|
|
unsigned long& num_labels
|
|
)
|
|
/*!
|
|
ensures
|
|
- This function takes labels and produces a mapping which maps elements of
|
|
labels into the most compact range in [0, max] as possible. In particular,
|
|
there won't be any unused integers in the mapped range.
|
|
- #num_labels == the number of distinct values in labels.
|
|
- returns a vector V such that:
|
|
- V.size() == labels.size()
|
|
- max(mat(V))+1 == num_labels.
|
|
- for all valid i,j:
|
|
- if (labels[i] == labels[j]) then
|
|
- V[i] == V[j]
|
|
- else
|
|
- V[i] != V[j]
|
|
!*/
|
|
{
|
|
std::map<unsigned long, unsigned long> temp;
|
|
for (unsigned long i = 0; i < labels.size(); ++i)
|
|
{
|
|
if (temp.count(labels[i]) == 0)
|
|
{
|
|
const unsigned long next = temp.size();
|
|
temp[labels[i]] = next;
|
|
}
|
|
}
|
|
|
|
num_labels = temp.size();
|
|
|
|
std::vector<unsigned long> result(labels.size());
|
|
for (unsigned long i = 0; i < labels.size(); ++i)
|
|
{
|
|
result[i] = temp[labels[i]];
|
|
}
|
|
return result;
|
|
}
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
inline double modularity (
|
|
const std::vector<sample_pair>& edges,
|
|
const std::vector<unsigned long>& labels
|
|
)
|
|
{
|
|
const unsigned long num_nodes = max_index_plus_one(edges);
|
|
// make sure requires clause is not broken
|
|
DLIB_ASSERT(labels.size() == num_nodes,
|
|
"\t double modularity()"
|
|
<< "\n\t Invalid inputs were given to this function"
|
|
);
|
|
|
|
unsigned long num_labels;
|
|
const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);
|
|
|
|
std::vector<double> cluster_sums(num_labels,0);
|
|
std::vector<double> k(num_nodes,0);
|
|
|
|
double Q = 0;
|
|
double m = 0;
|
|
for (unsigned long i = 0; i < edges.size(); ++i)
|
|
{
|
|
const unsigned long n1 = edges[i].index1();
|
|
const unsigned long n2 = edges[i].index2();
|
|
k[n1] += edges[i].distance();
|
|
if (n1 != n2)
|
|
k[n2] += edges[i].distance();
|
|
|
|
if (n1 != n2)
|
|
m += edges[i].distance();
|
|
else
|
|
m += edges[i].distance()/2;
|
|
|
|
if (labels_[n1] == labels_[n2])
|
|
{
|
|
if (n1 != n2)
|
|
Q += 2*edges[i].distance();
|
|
else
|
|
Q += edges[i].distance();
|
|
}
|
|
}
|
|
|
|
if (m == 0)
|
|
return 0;
|
|
|
|
for (unsigned long i = 0; i < labels_.size(); ++i)
|
|
{
|
|
cluster_sums[labels_[i]] += k[i];
|
|
}
|
|
|
|
for (unsigned long i = 0; i < labels_.size(); ++i)
|
|
{
|
|
Q -= k[i]*cluster_sums[labels_[i]]/(2*m);
|
|
}
|
|
|
|
return 1.0/(2*m)*Q;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
inline double modularity (
|
|
const std::vector<ordered_sample_pair>& edges,
|
|
const std::vector<unsigned long>& labels
|
|
)
|
|
{
|
|
const unsigned long num_nodes = max_index_plus_one(edges);
|
|
// make sure requires clause is not broken
|
|
DLIB_ASSERT(labels.size() == num_nodes,
|
|
"\t double modularity()"
|
|
<< "\n\t Invalid inputs were given to this function"
|
|
);
|
|
|
|
|
|
unsigned long num_labels;
|
|
const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);
|
|
|
|
std::vector<double> cluster_sums(num_labels,0);
|
|
std::vector<double> k(num_nodes,0);
|
|
|
|
double Q = 0;
|
|
double m = 0;
|
|
for (unsigned long i = 0; i < edges.size(); ++i)
|
|
{
|
|
const unsigned long n1 = edges[i].index1();
|
|
const unsigned long n2 = edges[i].index2();
|
|
k[n1] += edges[i].distance();
|
|
m += edges[i].distance();
|
|
if (labels_[n1] == labels_[n2])
|
|
{
|
|
Q += edges[i].distance();
|
|
}
|
|
}
|
|
|
|
if (m == 0)
|
|
return 0;
|
|
|
|
for (unsigned long i = 0; i < labels_.size(); ++i)
|
|
{
|
|
cluster_sums[labels_[i]] += k[i];
|
|
}
|
|
|
|
for (unsigned long i = 0; i < labels_.size(); ++i)
|
|
{
|
|
Q -= k[i]*cluster_sums[labels_[i]]/m;
|
|
}
|
|
|
|
return 1.0/m*Q;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
}
|
|
|
|
#endif // DLIB_MODULARITY_ClUSTERING__H__
|
|
|