From 1933850e30f94dcadf34bd5d51ee05dc8563da60 Mon Sep 17 00:00:00 2001 From: jamisonmeindl Date: Mon, 18 Dec 2023 16:22:50 +0000 Subject: [PATCH 1/3] added edge density for overlapping communities --- clusterers/clustering_stats.proto | 1 + clusterers/stats/stats_density.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/clusterers/clustering_stats.proto b/clusterers/clustering_stats.proto index 2557466..9ada4e0 100644 --- a/clusterers/clustering_stats.proto +++ b/clusterers/clustering_stats.proto @@ -44,4 +44,5 @@ message ClusteringStatistics { optional double f_score_param = 31; optional double weighted_edge_density_mean = 32; optional double weighted_triangle_density_mean = 33; + optional double weighted_edge_density_overlap_mean = 34; } \ No newline at end of file diff --git a/clusterers/stats/stats_density.h b/clusterers/stats/stats_density.h index edc92e5..4e6cbb0 100644 --- a/clusterers/stats/stats_density.h +++ b/clusterers/stats/stats_density.h @@ -71,6 +71,25 @@ inline absl::Status ComputeEdgeDensity(const GbbsGraph& graph, set_distribution_stats(result.size(), result_func, clustering_stats->mutable_edge_density()); clustering_stats->set_weighted_edge_density_mean(weighted_mean); + parlay::sequence cluster_sum = parlay::sequence(n, 0); + parlay::sequence cluster_count = parlay::sequence(n, 0); + parlay::parallel_for(0, clustering.size(), [&](size_t i){ + const auto& cluster = clustering[i]; + parlay::parallel_for(0, cluster.size(), [&](size_t j){ + cluster_sum[cluster[j]] += result_func(i); + cluster_count[cluster[j]] += 1; + }); + }); + + double weighted_mean_overlap = 0; + for (int i=0;iset_weighted_edge_density_overlap_mean(weighted_mean_overlap); + return absl::OkStatus(); } From 9d489ccf490fdc943ddfba56ee241b3762ca99db Mon Sep 17 00:00:00 2001 From: jamisonmeindl Date: Mon, 18 Dec 2023 17:31:41 +0000 Subject: [PATCH 2/3] cleaned up edge density calculation --- clusterers/stats/stats_density.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/clusterers/stats/stats_density.h b/clusterers/stats/stats_density.h index 4e6cbb0..1dcb5db 100644 --- a/clusterers/stats/stats_density.h +++ b/clusterers/stats/stats_density.h @@ -135,6 +135,25 @@ inline absl::Status ComputeTriangleDensity(const GbbsGraph& graph, } clustering_stats->set_weighted_triangle_density_mean(weighted_mean); + // parlay::sequence cluster_sum = parlay::sequence(n, 0); + // parlay::sequence cluster_count = parlay::sequence(n, 0); + // parlay::parallel_for(0, clustering.size(), [&](size_t i){ + // const auto& cluster = clustering[i]; + // parlay::parallel_for(0, cluster.size(), [&](size_t j){ + // cluster_sum[cluster[j]] += result_func(i); + // cluster_count[cluster[j]] += 1; + // }); + // }); + + // double weighted_mean_overlap = 0; + // // for (int i=0;iset_weighted_triangle_density_overlap_mean(weighted_mean_overlap); + return absl::OkStatus(); } From b7a357d81e61837b0ac73471998a09a46a32d15b Mon Sep 17 00:00:00 2001 From: jamisonmeindl Date: Sun, 11 Feb 2024 19:22:39 +0000 Subject: [PATCH 3/3] redid triangle density for overlap --- clusterers/clustering_stats.cc | 10 ++- clusterers/clustering_stats.proto | 3 + clusterers/stats/stats_density.h | 130 +++++++++++++++++++++++++----- 3 files changed, 123 insertions(+), 20 deletions(-) diff --git a/clusterers/clustering_stats.cc b/clusterers/clustering_stats.cc index 8eed452..9d251be 100644 --- a/clusterers/clustering_stats.cc +++ b/clusterers/clustering_stats.cc @@ -71,14 +71,20 @@ absl::StatusOr GetStats(const GbbsGraph& graph, ComputeEdgeDensity(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config); auto end_edge_density = std::chrono::steady_clock::now(); PrintTime(end_diameter, end_edge_density, "Compute EdgeDensity"); + ComputeEdgeDensityOverlap(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config); + auto end_edge_density_overlap = std::chrono::steady_clock::now(); + PrintTime(end_edge_density, end_edge_density_overlap, "Compute EdgeDensityOverlap"); ComputeTriangleDensity(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config); auto end_triangle_density = std::chrono::steady_clock::now(); - PrintTime(end_edge_density, end_triangle_density, "Compute Triangle Density"); + PrintTime(end_edge_density_overlap, end_triangle_density, "Compute Triangle Density"); + ComputeTriangleDensityOverlap(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config); + auto end_triangle_density_overlap = std::chrono::steady_clock::now(); + PrintTime(end_triangle_density, end_triangle_density_overlap, "Compute Triangle Density"); size_t n = graph.Graph()->n; ComputeARI(n, clustering, &clustering_stats, communities, clustering_stats_config); auto end_ari = std::chrono::steady_clock::now(); - PrintTime(end_triangle_density, end_ari, "Compute ARI"); + PrintTime(end_triangle_density_overlap, end_ari, "Compute ARI"); ComputeNMI(n, clustering, &clustering_stats, communities, clustering_stats_config); auto end_nmi = std::chrono::steady_clock::now(); PrintTime(end_ari, end_nmi, "Compute NMI"); diff --git a/clusterers/clustering_stats.proto b/clusterers/clustering_stats.proto index 9ada4e0..f835a80 100644 --- a/clusterers/clustering_stats.proto +++ b/clusterers/clustering_stats.proto @@ -14,6 +14,8 @@ message ClusteringStatsConfig { optional bool compute_precision_recall = 9; optional bool compute_nmi = 10; optional double f_score_param = 11; + optional bool compute_edge_density_overlap = 12; + optional bool compute_triangle_density_overlap = 13; } message DistributionStats { @@ -45,4 +47,5 @@ message ClusteringStatistics { optional double weighted_edge_density_mean = 32; optional double weighted_triangle_density_mean = 33; optional double weighted_edge_density_overlap_mean = 34; + optional double weighted_triangle_density_overlap_mean = 35; } \ No newline at end of file diff --git a/clusterers/stats/stats_density.h b/clusterers/stats/stats_density.h index 1dcb5db..a680295 100644 --- a/clusterers/stats/stats_density.h +++ b/clusterers/stats/stats_density.h @@ -71,6 +71,53 @@ inline absl::Status ComputeEdgeDensity(const GbbsGraph& graph, set_distribution_stats(result.size(), result_func, clustering_stats->mutable_edge_density()); clustering_stats->set_weighted_edge_density_mean(weighted_mean); + return absl::OkStatus(); +} + +// compute the edge density of each cluster +// edge density is the number of edges divided by the number of possible edges +inline absl::Status ComputeEdgeDensityOverlap(const GbbsGraph& graph, + const InMemoryClusterer::Clustering& clustering, ClusteringStatistics* clustering_stats, + const parlay::sequence& cluster_ids, const ClusteringStatsConfig& clustering_stats_config) { + const bool compute_edge_density_overlap = clustering_stats_config.compute_edge_density_overlap(); + if (!compute_edge_density_overlap) { + return absl::OkStatus(); + } + + parlay::sequence cluster_ids_overlap = parlay::sequence(graph.Graph()->n); + parlay::parallel_for(0, clustering.size(), [&](size_t i){ + const auto& cluster = clustering[i]; + parlay::parallel_for(0, cluster.size(), [&](size_t j){ + cluster_ids_overlap[cluster[j]] = i; + }); + }); + + std::size_t n = graph.Graph()->n; + auto result = std::vector(clustering.size()); + + if(clustering.size()==1){ + result[0] = (static_cast(graph.Graph()->m)) / (static_cast(n)*(n-1)); + }else{ + for(size_t i = 0; i < clustering.size(); i++) { + if (clustering[i].size() == 1){ + result[i] = 0; + } + else{ + const auto& cluster = clustering[i]; + parlay::parallel_for(0, cluster.size(), [&](size_t j){ + cluster_ids_overlap[cluster[j]] = i; + }); + size_t m_subgraph = get_subgraph_num_edges(graph, clustering[i], cluster_ids_overlap); + double m_total = clustering[i].size()*(clustering[i].size()-1); + // std::cout << "m_subgraph" << " " << m_subgraph << std::endl; + // std::cout << "m_total" << " " << m_total << std::endl; + result[i] = (static_cast(m_subgraph)) / (static_cast(m_total)); + } + } + } + auto result_func = [&](std::size_t i) { + return result[i]; + }; parlay::sequence cluster_sum = parlay::sequence(n, 0); parlay::sequence cluster_count = parlay::sequence(n, 0); parlay::parallel_for(0, clustering.size(), [&](size_t i){ @@ -116,6 +163,7 @@ inline absl::Status ComputeTriangleDensity(const GbbsGraph& graph, result[i] = 0; }else{ size_t num_tri = gbbs::Triangle_degree_ordering(G, f); + // size_t num_tri = 0; result[i] = (static_cast(num_tri)) / (static_cast(num_wedges)); } }); @@ -135,24 +183,70 @@ inline absl::Status ComputeTriangleDensity(const GbbsGraph& graph, } clustering_stats->set_weighted_triangle_density_mean(weighted_mean); - // parlay::sequence cluster_sum = parlay::sequence(n, 0); - // parlay::sequence cluster_count = parlay::sequence(n, 0); - // parlay::parallel_for(0, clustering.size(), [&](size_t i){ - // const auto& cluster = clustering[i]; - // parlay::parallel_for(0, cluster.size(), [&](size_t j){ - // cluster_sum[cluster[j]] += result_func(i); - // cluster_count[cluster[j]] += 1; - // }); - // }); - - // double weighted_mean_overlap = 0; - // // for (int i=0;iset_weighted_triangle_density_overlap_mean(weighted_mean_overlap); + return absl::OkStatus(); +} + +// compute the triangle density of each cluster with overlapping clusters +// triangle density is the number of triangles divided by the number of wedges +// if no wedge, density is 0 +inline absl::Status ComputeTriangleDensityOverlap(const GbbsGraph& graph, + const InMemoryClusterer::Clustering& clustering, ClusteringStatistics* clustering_stats, + const parlay::sequence& cluster_ids, const ClusteringStatsConfig& clustering_stats_config) { + const bool compute_triangle_density_overlap = clustering_stats_config.compute_triangle_density_overlap(); + if (!compute_triangle_density_overlap) { + return absl::OkStatus(); + } + + parlay::sequence cluster_ids_overlap = parlay::sequence(graph.Graph()->n); + parlay::parallel_for(0, clustering.size(), [&](size_t i){ + const auto& cluster = clustering[i]; + parlay::parallel_for(0, cluster.size(), [&](size_t j){ + cluster_ids_overlap[cluster[j]] = i; + }); + }); + + std::size_t n = graph.Graph()->n; + auto result = std::vector(clustering.size()); + auto f = [&] (gbbs::uintE u, gbbs::uintE v, gbbs::uintE w) { }; + + //even if clustering.size()==1, we need to get the subgraph because could not match 'symmetric_graph' against 'symmetric_ptr_graph' + for(size_t i = 0; i < clustering.size(); i++) { + const auto& cluster = clustering[i]; + parlay::parallel_for(0, cluster.size(), [&](size_t j){ + cluster_ids_overlap[cluster[j]] = i; + }); + auto G = get_subgraph(graph, clustering[i], cluster_ids_overlap); //have to use unweighted graph, otherwise result is wrong + size_t num_wedges = get_num_wedges(&G); + if(num_wedges == 0){ + result[i] = 0; + }else{ + size_t num_tri = gbbs::Triangle_degree_ordering(G, f); + result[i] = (static_cast(num_tri)) / (static_cast(num_wedges)); + } + } + // for(double l:result) std::cout << l << std::endl; + auto result_func = [&](std::size_t i) { + return result[i]; + }; + + parlay::sequence cluster_sum = parlay::sequence(n, 0); + parlay::sequence cluster_count = parlay::sequence(n, 0); + parlay::parallel_for(0, clustering.size(), [&](size_t i){ + const auto& cluster = clustering[i]; + parlay::parallel_for(0, cluster.size(), [&](size_t j){ + cluster_sum[cluster[j]] += result_func(i); + cluster_count[cluster[j]] += 1; + }); + }); + + double weighted_mean_overlap = 0; + for (int i=0;iset_weighted_triangle_density_overlap_mean(weighted_mean_overlap); return absl::OkStatus(); }