From a480e10700ec54b99eecc9590302bc9bd6c830f6 Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Tue, 25 Feb 2025 15:14:17 -0800 Subject: [PATCH 01/10] Allow passing init_centers into utils.find_optimal_k(). --- reddwarf/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/reddwarf/utils.py b/reddwarf/utils.py index 2c1c143..67b4649 100644 --- a/reddwarf/utils.py +++ b/reddwarf/utils.py @@ -304,6 +304,7 @@ def run_kmeans( def find_optimal_k( projected_data: pd.DataFrame, max_group_count: int = 5, + init_centers: Optional[List] = None, random_state: Optional[int] = None, debug: bool = False, ) -> Tuple[int, float, np.ndarray]: @@ -329,6 +330,7 @@ def find_optimal_k( cluster_labels, _ = run_kmeans( dataframe=projected_data, n_clusters=k_test, + init_centers=init_centers, random_state=random_state, ) this_silhouette_score = silhouette_score(projected_data, cluster_labels) From efb4231433b7f26ff164a131ed84a1c5148ded01 Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Tue, 25 Feb 2025 15:14:44 -0800 Subject: [PATCH 02/10] Return cluster_centers from utils.find_optimal_k(). --- reddwarf/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reddwarf/utils.py b/reddwarf/utils.py index 67b4649..00198f4 100644 --- a/reddwarf/utils.py +++ b/reddwarf/utils.py @@ -327,7 +327,7 @@ def find_optimal_k( best_silhouette_score = -np.inf for k_test in K_RANGE: - cluster_labels, _ = run_kmeans( + cluster_labels, cluster_centers = run_kmeans( dataframe=projected_data, n_clusters=k_test, init_centers=init_centers, @@ -340,9 +340,11 @@ def find_optimal_k( k_best = k_test best_silhouette_score = this_silhouette_score best_cluster_labels = cluster_labels + best_cluster_centers = cluster_centers optimal_k = k_best optimal_silhouette = best_silhouette_score optimal_cluster_labels = best_cluster_labels + optimal_cluster_centers = best_cluster_centers - return optimal_k, optimal_silhouette, optimal_cluster_labels + return optimal_k, optimal_silhouette, optimal_cluster_labels, optimal_cluster_centers From 103981c1b290f1ff96c30f84d30aaef9c4d4ca8c Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Tue, 25 Feb 2025 15:15:16 -0800 Subject: [PATCH 03/10] Added demdis type definitions. --- reddwarf/types/demdis.py | 72 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 reddwarf/types/demdis.py diff --git a/reddwarf/types/demdis.py b/reddwarf/types/demdis.py new file mode 100644 index 0000000..9a9f67c --- /dev/null +++ b/reddwarf/types/demdis.py @@ -0,0 +1,72 @@ +from typing import Annotated +from datetime import datetime + + +# Source: https://github.com/Demdis/Clustering-types/blob/main/types.py + +class VoteValueEnum(str): + AGREE = "agree" + DISAGREE = "disagree" + SKIP = "skip" + + +class VoteModel(): + id: int + conversation_id: int + statement_id: int + voted_by_participant_id: int + + value: VoteValueEnum + + +class ClusteringCenterModel(): + center_x: float + center_y: float + + +class StatementConversationMetric(): + statement_id: int + mean_agreement_percentage: float + consensus_points: int + polarization_measurement: float + + +class ClusteringParticipant(): + participant_id: int + cluster_center_name: str + x: float + y: float + + +class ClusteredStatement(): + statement_id: int + cluster_center_name: str + agreement_count: int + disagreement_count: int + skip_count: int + unseen_count: int + agreement_percentage: float + cluster_defining_pos_coefficient: float + cluster_defining_neg_coefficient: float + cluster_defining_skip_coefficient: float + + +class ClusteringCenter(): + name: str + center_x: float + center_y: float + participant_count: int + + participants: list[ClusteringParticipant] + statements: list[ClusteredStatement] + + +class ClusteringResult(): + participant_count: int + participants_clustered: int + vote_count: int + statement_count: int + last_vote_at: datetime + + statement_metrics: list[StatementConversationMetric] + centers: list[ClusteringCenter] From 6393083794d198bb0efb2020477795becc7d219d Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Tue, 25 Feb 2025 15:16:05 -0800 Subject: [PATCH 04/10] Added basic reproduction of DemDis run_clustering (still missing statement statistics). --- reddwarf/demdis.py | 124 +++++++++++++++++++++++++++++++++++++++++ reddwarf/types/base.py | 6 ++ 2 files changed, 130 insertions(+) create mode 100644 reddwarf/demdis.py create mode 100644 reddwarf/types/base.py diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py new file mode 100644 index 0000000..2d148af --- /dev/null +++ b/reddwarf/demdis.py @@ -0,0 +1,124 @@ +from typing import Annotated +from reddwarf.types.demdis import ClusteringResult, VoteModel, ClusteringCenterModel, ClusteringCenter, VoteValueEnum as DemDisVoteValueEnum +from reddwarf.types.base import VoteValueEnum as BaseVoteValueEnum +from reddwarf import utils + + +DEFAULT_MIN_USER_VOTE_THRESHOLD = 7 +DEFAULT_MAX_CLUSTERS = 5 +DEFAULT_CLUSTER_NAMES = ["A", "B", "C", "D", "E"] + +DEMDIS_VOTE_KEY_MAPPING = { + "voted_by_participant_id": "participant_id", + "value": "vote", +} + +DEMDIS_VOTE_VALUE_MAPPING = { + DemDisVoteValueEnum.AGREE: BaseVoteValueEnum.UP, + DemDisVoteValueEnum.SKIP: BaseVoteValueEnum.NEUTRAL, + DemDisVoteValueEnum.DISAGREE: BaseVoteValueEnum.DOWN, +} + +def remap_vote_values(votes, value_mapping, key_mapping = {}): + rekeyed_votes = [ + { + # Use key_mapping if available, otherwise keep key unchanged. + (key_mapping.get(k, k)): val + for k, val in vote.items() + } + for vote in votes + ] + + remapped_votes = [ + { + # Use vote value_mapping if available, otherwise keep value unchanged. + k: (value_mapping.get(val, val) if k == "vote" else val) + for k, val in vote.items() + } + for vote in rekeyed_votes + ] + + return remapped_votes + +# See: https://github.com/Demdis/Clustering-types/blob/main/types.py +def run_clustering( + *, + votes: list[VoteModel], + reference_cluster_centers: list[ClusteringCenterModel] | None, + statement_boost: tuple[Annotated[int, "statement id"], Annotated[float, "boost"]] | None = None, + specific_cluster_count: int | None = None, + do_remap = True, +) -> ClusteringResult: + if do_remap: + votes = remap_vote_values(votes, DEMDIS_VOTE_VALUE_MAPPING, DEMDIS_VOTE_KEY_MAPPING) + + raw_vote_matrix = utils.generate_raw_matrix(votes=votes) + all_statement_ids = raw_vote_matrix.columns + + filtered_vote_matrix = utils.filter_matrix( + vote_matrix=raw_vote_matrix, + min_user_vote_threshold=DEFAULT_MIN_USER_VOTE_THRESHOLD, + active_statement_ids=all_statement_ids, + ) + + projected_data, _, _ = utils.run_pca(vote_matrix=filtered_vote_matrix) + + projected_data = utils.scale_projected_data( + projected_data=projected_data, + vote_matrix=filtered_vote_matrix, + ) + + # TODO: Confirm init_centers works. + if specific_cluster_count: + cluster_labels, cluster_centers = utils.run_kmeans( + dataframe=projected_data, + n_clusters=specific_cluster_count, + init_centers=([[c["center_x"], c["center_y"]] for c in reference_cluster_centers] if reference_cluster_centers else None) + ) + else: + _, _, cluster_labels, cluster_centers = utils.find_optimal_k( + projected_data=projected_data, + max_group_count=DEFAULT_MAX_CLUSTERS, + init_centers=([[c["center_x"], c["center_y"]] for c in reference_cluster_centers] if reference_cluster_centers else None) + ) + + # Add cluster label column to dataframe. + projected_data = projected_data.assign(cluster_id=cluster_labels) + # Convert participant_id index into regular column, for ease of transformation. + projected_data = projected_data.reset_index() + + def build_centers(projected_data): + centers = [ + { + "name": DEFAULT_CLUSTER_NAMES[cluster_id], + "center_x": float(cluster_centers[cluster_id][0]), + "center_y": float(cluster_centers[cluster_id][1]), + "participant_count": len(group_df), + "participants": [ + { + "participant_id": row.participant_id, + "cluster_center_name": DEFAULT_CLUSTER_NAMES[cluster_id], + "x": row.x, + "y": row.y, + } + for row in group_df.itertuples(index=False) + ], + "statements": [], # TODO + } + for cluster_id, group_df in projected_data.groupby("cluster_id") + ] + + return centers + + result: ClusteringResult = { + "participant_count": len(raw_vote_matrix.index), + "participants_clustered": len(filtered_vote_matrix.index), + "vote_count": int(raw_vote_matrix.count().sum()), + "statement_count": len(raw_vote_matrix.columns), + "last_vote_at": None, # TODO + + "statement_metrics": [], + "centers": build_centers(projected_data), + } + + return result diff --git a/reddwarf/types/base.py b/reddwarf/types/base.py new file mode 100644 index 0000000..0ea39fd --- /dev/null +++ b/reddwarf/types/base.py @@ -0,0 +1,6 @@ +from enum import IntEnum + +class VoteValueEnum(IntEnum): + UP = 1 + NEUTRAL = 0 + DOWN = -1 From f3d49d1d133b668dcf0c1e2728ab0944d6fa038c Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Tue, 25 Feb 2025 15:16:19 -0800 Subject: [PATCH 05/10] Added simple debug test for DemDis. --- debug.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/debug.py b/debug.py index 1bd0827..3566017 100644 --- a/debug.py +++ b/debug.py @@ -29,7 +29,7 @@ }, } -if True: +if False: report_id = CONVOS["tech-politics-2018"]["report_id"] print(f"Loading data from report: https://pol.is/report/{report_id}") @@ -49,6 +49,26 @@ presenter.render_optimal_cluster_figure() # client.generate_figure(coord_dataframe=client.projected_data) +if True: + # test demdis method + from reddwarf.demdis import run_clustering + + report_id = CONVOS["tech-politics-2018"]["report_id"] + print(f"Loading data from report: https://pol.is/report/{report_id}") + + client = PolisClient() + client.load_data(report_id=report_id) + + results = run_clustering( + votes=client.data_loader.votes_data, + reference_cluster_centers=None, + # specific_cluster_count=2, + do_remap=False, + ) + + from pprint import pprint + pprint(results) + if False: # Show convo with duplicate votes. # Shareable demo: https://gist.github.com/patcon/9c1a39291cd75b23722a5379d7cfc3cc From 6e2b4090234534acd3dc65f26e9a761ee1d8a78f Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Tue, 25 Feb 2025 16:09:37 -0800 Subject: [PATCH 06/10] Rename do_remap to skip_remap. --- debug.py | 2 +- reddwarf/demdis.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/debug.py b/debug.py index 3566017..6efa6c9 100644 --- a/debug.py +++ b/debug.py @@ -63,7 +63,7 @@ votes=client.data_loader.votes_data, reference_cluster_centers=None, # specific_cluster_count=2, - do_remap=False, + skip_remap=True, ) from pprint import pprint diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py index 2d148af..5fc08f1 100644 --- a/reddwarf/demdis.py +++ b/reddwarf/demdis.py @@ -47,9 +47,9 @@ def run_clustering( reference_cluster_centers: list[ClusteringCenterModel] | None, statement_boost: tuple[Annotated[int, "statement id"], Annotated[float, "boost"]] | None = None, specific_cluster_count: int | None = None, - do_remap = True, + skip_remap = False, ) -> ClusteringResult: - if do_remap: + if not skip_remap: votes = remap_vote_values(votes, DEMDIS_VOTE_VALUE_MAPPING, DEMDIS_VOTE_KEY_MAPPING) raw_vote_matrix = utils.generate_raw_matrix(votes=votes) From 67b3b2b10465ca764e445fbccd5e03cd13d6c58d Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Tue, 25 Feb 2025 17:44:32 -0800 Subject: [PATCH 07/10] No need to reset_index on df. --- reddwarf/demdis.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py index 5fc08f1..d0296e6 100644 --- a/reddwarf/demdis.py +++ b/reddwarf/demdis.py @@ -84,8 +84,6 @@ def run_clustering( # Add cluster label column to dataframe. projected_data = projected_data.assign(cluster_id=cluster_labels) - # Convert participant_id index into regular column, for ease of transformation. - projected_data = projected_data.reset_index() def build_centers(projected_data): centers = [ @@ -96,12 +94,12 @@ def build_centers(projected_data): "participant_count": len(group_df), "participants": [ { - "participant_id": row.participant_id, + "participant_id": row.index, "cluster_center_name": DEFAULT_CLUSTER_NAMES[cluster_id], "x": row.x, "y": row.y, } - for row in group_df.itertuples(index=False) + for row in group_df.itertuples(index=True) ], "statements": [], # TODO } From cced506a848f57f4f57bd8d37f6e15f1ca2af084 Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Tue, 25 Feb 2025 17:45:15 -0800 Subject: [PATCH 08/10] Added a bunch more data for ClusteredStatements object. --- reddwarf/demdis.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py index d0296e6..326eba5 100644 --- a/reddwarf/demdis.py +++ b/reddwarf/demdis.py @@ -85,13 +85,16 @@ def run_clustering( # Add cluster label column to dataframe. projected_data = projected_data.assign(cluster_id=cluster_labels) - def build_centers(projected_data): + merged_df = projected_data.join(filtered_vote_matrix) + + + def build_centers(df): centers = [ { "name": DEFAULT_CLUSTER_NAMES[cluster_id], "center_x": float(cluster_centers[cluster_id][0]), "center_y": float(cluster_centers[cluster_id][1]), - "participant_count": len(group_df), + "participant_count": (group_participant_count := len(group_df)), "participants": [ { "participant_id": row.index, @@ -101,9 +104,23 @@ def build_centers(projected_data): } for row in group_df.itertuples(index=True) ], - "statements": [], # TODO + "statements": [ + { + "statement_id": statement_id, + "cluster_center_name": DEFAULT_CLUSTER_NAMES[cluster_id], + "agreement_count": (agreement_count := int((group_df[statement_id] == 1).sum())), + "disagreement_count": int((group_df[statement_id] == -1).sum()), + "skip_count": int((group_df[statement_id] == 0).sum()), + "unseen_count": int(group_df[statement_id].isna().sum()), + "agreement_percentage": (agreement_count / group_participant_count) * 100, + "cluster_defining_pos_coefficient": 0.0, # TODO + "cluster_defining_neg_coefficient": 0.0, # TODO + "cluster_defining_skip_coefficient": 0.0, # TODO + } + for statement_id in filtered_vote_matrix.columns + ], } - for cluster_id, group_df in projected_data.groupby("cluster_id") + for cluster_id, group_df in df.groupby("cluster_id") ] return centers @@ -116,7 +133,7 @@ def build_centers(projected_data): "last_vote_at": None, # TODO "statement_metrics": [], - "centers": build_centers(projected_data), + "centers": build_centers(merged_df), } return result From 6e85dd67a8020a244cc79a133396ee85bd3fb32b Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Tue, 25 Feb 2025 18:03:01 -0800 Subject: [PATCH 09/10] Added last_vote_at key to demdis response. --- reddwarf/demdis.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py index 326eba5..acef832 100644 --- a/reddwarf/demdis.py +++ b/reddwarf/demdis.py @@ -2,6 +2,7 @@ from reddwarf.types.demdis import ClusteringResult, VoteModel, ClusteringCenterModel, ClusteringCenter, VoteValueEnum as DemDisVoteValueEnum from reddwarf.types.base import VoteValueEnum as BaseVoteValueEnum from reddwarf import utils +from datetime import datetime DEFAULT_MIN_USER_VOTE_THRESHOLD = 7 @@ -52,6 +53,11 @@ def run_clustering( if not skip_remap: votes = remap_vote_values(votes, DEMDIS_VOTE_VALUE_MAPPING, DEMDIS_VOTE_KEY_MAPPING) + last_vote_timestamp = 0 + for vote in votes: + if vote["modified"] > last_vote_timestamp: + last_vote_timestamp = vote["modified"] + raw_vote_matrix = utils.generate_raw_matrix(votes=votes) all_statement_ids = raw_vote_matrix.columns @@ -130,7 +136,7 @@ def build_centers(df): "participants_clustered": len(filtered_vote_matrix.index), "vote_count": int(raw_vote_matrix.count().sum()), "statement_count": len(raw_vote_matrix.columns), - "last_vote_at": None, # TODO + "last_vote_at": datetime.fromtimestamp(last_vote_timestamp/1000), "statement_metrics": [], "centers": build_centers(merged_df), From 27924dc2b91449e957070b96860d394c6b661c7b Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Tue, 25 Feb 2025 18:09:12 -0800 Subject: [PATCH 10/10] Stubbed out statement_metrics. --- reddwarf/demdis.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py index acef832..ac2bee2 100644 --- a/reddwarf/demdis.py +++ b/reddwarf/demdis.py @@ -138,7 +138,16 @@ def build_centers(df): "statement_count": len(raw_vote_matrix.columns), "last_vote_at": datetime.fromtimestamp(last_vote_timestamp/1000), - "statement_metrics": [], + "statement_metrics": [ + { + "statement_id": statement_id, + "mean_agreement_percentage": 0.0, # TODO + "consensus_points": 0, # TODO + "polarization_measurement": 0.0, # TODO + + } + for statement_id in filtered_vote_matrix.columns + ], "centers": build_centers(merged_df), }