diff --git a/debug.py b/debug.py index 1bd0827..6efa6c9 100644 --- a/debug.py +++ b/debug.py @@ -29,7 +29,7 @@ }, } -if True: +if False: report_id = CONVOS["tech-politics-2018"]["report_id"] print(f"Loading data from report: https://pol.is/report/{report_id}") @@ -49,6 +49,26 @@ presenter.render_optimal_cluster_figure() # client.generate_figure(coord_dataframe=client.projected_data) +if True: + # test demdis method + from reddwarf.demdis import run_clustering + + report_id = CONVOS["tech-politics-2018"]["report_id"] + print(f"Loading data from report: https://pol.is/report/{report_id}") + + client = PolisClient() + client.load_data(report_id=report_id) + + results = run_clustering( + votes=client.data_loader.votes_data, + reference_cluster_centers=None, + # specific_cluster_count=2, + skip_remap=True, + ) + + from pprint import pprint + pprint(results) + if False: # Show convo with duplicate votes. # Shareable demo: https://gist.github.com/patcon/9c1a39291cd75b23722a5379d7cfc3cc diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py new file mode 100644 index 0000000..ac2bee2 --- /dev/null +++ b/reddwarf/demdis.py @@ -0,0 +1,154 @@ +from typing import Annotated +from reddwarf.types.demdis import ClusteringResult, VoteModel, ClusteringCenterModel, ClusteringCenter, VoteValueEnum as DemDisVoteValueEnum +from reddwarf.types.base import VoteValueEnum as BaseVoteValueEnum +from reddwarf import utils +from datetime import datetime + + +DEFAULT_MIN_USER_VOTE_THRESHOLD = 7 +DEFAULT_MAX_CLUSTERS = 5 +DEFAULT_CLUSTER_NAMES = ["A", "B", "C", "D", "E"] + +DEMDIS_VOTE_KEY_MAPPING = { + "voted_by_participant_id": "participant_id", + "value": "vote", +} + +DEMDIS_VOTE_VALUE_MAPPING = { + DemDisVoteValueEnum.AGREE: BaseVoteValueEnum.UP, + DemDisVoteValueEnum.SKIP: BaseVoteValueEnum.NEUTRAL, + DemDisVoteValueEnum.DISAGREE: BaseVoteValueEnum.DOWN, +} + +def remap_vote_values(votes, value_mapping, key_mapping = {}): + rekeyed_votes = [ + { + # Use key_mapping if available, otherwise keep key unchanged. + (key_mapping.get(k, k)): val + for k, val in vote.items() + } + for vote in votes + ] + + remapped_votes = [ + { + # Use vote value_mapping if available, otherwise keep value unchanged. + k: (value_mapping.get(val, val) if k == "vote" else val) + for k, val in vote.items() + } + for vote in rekeyed_votes + ] + + return remapped_votes + +# See: https://github.com/Demdis/Clustering-types/blob/main/types.py +def run_clustering( + *, + votes: list[VoteModel], + reference_cluster_centers: list[ClusteringCenterModel] | None, + statement_boost: tuple[Annotated[int, "statement id"], Annotated[float, "boost"]] | None = None, + specific_cluster_count: int | None = None, + skip_remap = False, +) -> ClusteringResult: + if not skip_remap: + votes = remap_vote_values(votes, DEMDIS_VOTE_VALUE_MAPPING, DEMDIS_VOTE_KEY_MAPPING) + + last_vote_timestamp = 0 + for vote in votes: + if vote["modified"] > last_vote_timestamp: + last_vote_timestamp = vote["modified"] + + raw_vote_matrix = utils.generate_raw_matrix(votes=votes) + all_statement_ids = raw_vote_matrix.columns + + filtered_vote_matrix = utils.filter_matrix( + vote_matrix=raw_vote_matrix, + min_user_vote_threshold=DEFAULT_MIN_USER_VOTE_THRESHOLD, + active_statement_ids=all_statement_ids, + ) + + projected_data, _, _ = utils.run_pca(vote_matrix=filtered_vote_matrix) + + projected_data = utils.scale_projected_data( + projected_data=projected_data, + vote_matrix=filtered_vote_matrix, + ) + + # TODO: Confirm init_centers works. + if specific_cluster_count: + cluster_labels, cluster_centers = utils.run_kmeans( + dataframe=projected_data, + n_clusters=specific_cluster_count, + init_centers=([[c["center_x"], c["center_y"]] for c in reference_cluster_centers] if reference_cluster_centers else None) + ) + else: + _, _, cluster_labels, cluster_centers = utils.find_optimal_k( + projected_data=projected_data, + max_group_count=DEFAULT_MAX_CLUSTERS, + init_centers=([[c["center_x"], c["center_y"]] for c in reference_cluster_centers] if reference_cluster_centers else None) + ) + + # Add cluster label column to dataframe. + projected_data = projected_data.assign(cluster_id=cluster_labels) + + merged_df = projected_data.join(filtered_vote_matrix) + + + def build_centers(df): + centers = [ + { + "name": DEFAULT_CLUSTER_NAMES[cluster_id], + "center_x": float(cluster_centers[cluster_id][0]), + "center_y": float(cluster_centers[cluster_id][1]), + "participant_count": (group_participant_count := len(group_df)), + "participants": [ + { + "participant_id": row.index, + "cluster_center_name": DEFAULT_CLUSTER_NAMES[cluster_id], + "x": row.x, + "y": row.y, + } + for row in group_df.itertuples(index=True) + ], + "statements": [ + { + "statement_id": statement_id, + "cluster_center_name": DEFAULT_CLUSTER_NAMES[cluster_id], + "agreement_count": (agreement_count := int((group_df[statement_id] == 1).sum())), + "disagreement_count": int((group_df[statement_id] == -1).sum()), + "skip_count": int((group_df[statement_id] == 0).sum()), + "unseen_count": int(group_df[statement_id].isna().sum()), + "agreement_percentage": (agreement_count / group_participant_count) * 100, + "cluster_defining_pos_coefficient": 0.0, # TODO + "cluster_defining_neg_coefficient": 0.0, # TODO + "cluster_defining_skip_coefficient": 0.0, # TODO + } + for statement_id in filtered_vote_matrix.columns + ], + } + for cluster_id, group_df in df.groupby("cluster_id") + ] + + return centers + + result: ClusteringResult = { + "participant_count": len(raw_vote_matrix.index), + "participants_clustered": len(filtered_vote_matrix.index), + "vote_count": int(raw_vote_matrix.count().sum()), + "statement_count": len(raw_vote_matrix.columns), + "last_vote_at": datetime.fromtimestamp(last_vote_timestamp/1000), + + "statement_metrics": [ + { + "statement_id": statement_id, + "mean_agreement_percentage": 0.0, # TODO + "consensus_points": 0, # TODO + "polarization_measurement": 0.0, # TODO + + } + for statement_id in filtered_vote_matrix.columns + ], + "centers": build_centers(merged_df), + } + + return result diff --git a/reddwarf/types/base.py b/reddwarf/types/base.py new file mode 100644 index 0000000..0ea39fd --- /dev/null +++ b/reddwarf/types/base.py @@ -0,0 +1,6 @@ +from enum import IntEnum + +class VoteValueEnum(IntEnum): + UP = 1 + NEUTRAL = 0 + DOWN = -1 diff --git a/reddwarf/types/demdis.py b/reddwarf/types/demdis.py new file mode 100644 index 0000000..9a9f67c --- /dev/null +++ b/reddwarf/types/demdis.py @@ -0,0 +1,72 @@ +from typing import Annotated +from datetime import datetime + + +# Source: https://github.com/Demdis/Clustering-types/blob/main/types.py + +class VoteValueEnum(str): + AGREE = "agree" + DISAGREE = "disagree" + SKIP = "skip" + + +class VoteModel(): + id: int + conversation_id: int + statement_id: int + voted_by_participant_id: int + + value: VoteValueEnum + + +class ClusteringCenterModel(): + center_x: float + center_y: float + + +class StatementConversationMetric(): + statement_id: int + mean_agreement_percentage: float + consensus_points: int + polarization_measurement: float + + +class ClusteringParticipant(): + participant_id: int + cluster_center_name: str + x: float + y: float + + +class ClusteredStatement(): + statement_id: int + cluster_center_name: str + agreement_count: int + disagreement_count: int + skip_count: int + unseen_count: int + agreement_percentage: float + cluster_defining_pos_coefficient: float + cluster_defining_neg_coefficient: float + cluster_defining_skip_coefficient: float + + +class ClusteringCenter(): + name: str + center_x: float + center_y: float + participant_count: int + + participants: list[ClusteringParticipant] + statements: list[ClusteredStatement] + + +class ClusteringResult(): + participant_count: int + participants_clustered: int + vote_count: int + statement_count: int + last_vote_at: datetime + + statement_metrics: list[StatementConversationMetric] + centers: list[ClusteringCenter] diff --git a/reddwarf/utils.py b/reddwarf/utils.py index 2c1c143..00198f4 100644 --- a/reddwarf/utils.py +++ b/reddwarf/utils.py @@ -304,6 +304,7 @@ def run_kmeans( def find_optimal_k( projected_data: pd.DataFrame, max_group_count: int = 5, + init_centers: Optional[List] = None, random_state: Optional[int] = None, debug: bool = False, ) -> Tuple[int, float, np.ndarray]: @@ -326,9 +327,10 @@ def find_optimal_k( best_silhouette_score = -np.inf for k_test in K_RANGE: - cluster_labels, _ = run_kmeans( + cluster_labels, cluster_centers = run_kmeans( dataframe=projected_data, n_clusters=k_test, + init_centers=init_centers, random_state=random_state, ) this_silhouette_score = silhouette_score(projected_data, cluster_labels) @@ -338,9 +340,11 @@ def find_optimal_k( k_best = k_test best_silhouette_score = this_silhouette_score best_cluster_labels = cluster_labels + best_cluster_centers = cluster_centers optimal_k = k_best optimal_silhouette = best_silhouette_score optimal_cluster_labels = best_cluster_labels + optimal_cluster_centers = best_cluster_centers - return optimal_k, optimal_silhouette, optimal_cluster_labels + return optimal_k, optimal_silhouette, optimal_cluster_labels, optimal_cluster_centers