From a480e10700ec54b99eecc9590302bc9bd6c830f6 Mon Sep 17 00:00:00 2001
From: Patrick Connolly <patrick.c.connolly@gmail.com>
Date: Tue, 25 Feb 2025 15:14:17 -0800
Subject: [PATCH 01/10] Allow passing init_centers into utils.find_optimal_k().

---
 reddwarf/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/reddwarf/utils.py b/reddwarf/utils.py
index 2c1c143..67b4649 100644
--- a/reddwarf/utils.py
+++ b/reddwarf/utils.py
@@ -304,6 +304,7 @@ def run_kmeans(
 def find_optimal_k(
         projected_data: pd.DataFrame,
         max_group_count: int = 5,
+        init_centers: Optional[List] = None,
         random_state: Optional[int] = None,
         debug: bool = False,
 ) -> Tuple[int, float, np.ndarray]:
@@ -329,6 +330,7 @@ def find_optimal_k(
         cluster_labels, _ = run_kmeans(
             dataframe=projected_data,
             n_clusters=k_test,
+            init_centers=init_centers,
             random_state=random_state,
         )
         this_silhouette_score = silhouette_score(projected_data, cluster_labels)

From efb4231433b7f26ff164a131ed84a1c5148ded01 Mon Sep 17 00:00:00 2001
From: Patrick Connolly <patrick.c.connolly@gmail.com>
Date: Tue, 25 Feb 2025 15:14:44 -0800
Subject: [PATCH 02/10] Return cluster_centers from utils.find_optimal_k().

---
 reddwarf/utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/reddwarf/utils.py b/reddwarf/utils.py
index 67b4649..00198f4 100644
--- a/reddwarf/utils.py
+++ b/reddwarf/utils.py
@@ -327,7 +327,7 @@ def find_optimal_k(
     best_silhouette_score = -np.inf
 
     for k_test in K_RANGE:
-        cluster_labels, _ = run_kmeans(
+        cluster_labels, cluster_centers = run_kmeans(
             dataframe=projected_data,
             n_clusters=k_test,
             init_centers=init_centers,
@@ -340,9 +340,11 @@ def find_optimal_k(
             k_best = k_test
             best_silhouette_score = this_silhouette_score
             best_cluster_labels = cluster_labels
+            best_cluster_centers = cluster_centers
 
     optimal_k = k_best
     optimal_silhouette = best_silhouette_score
     optimal_cluster_labels = best_cluster_labels
+    optimal_cluster_centers = best_cluster_centers
 
-    return optimal_k, optimal_silhouette, optimal_cluster_labels
+    return optimal_k, optimal_silhouette, optimal_cluster_labels, optimal_cluster_centers

From 103981c1b290f1ff96c30f84d30aaef9c4d4ca8c Mon Sep 17 00:00:00 2001
From: Patrick Connolly <patrick.c.connolly@gmail.com>
Date: Tue, 25 Feb 2025 15:15:16 -0800
Subject: [PATCH 03/10] Added demdis type definitions.

---
 reddwarf/types/demdis.py | 72 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 reddwarf/types/demdis.py

diff --git a/reddwarf/types/demdis.py b/reddwarf/types/demdis.py
new file mode 100644
index 0000000..9a9f67c
--- /dev/null
+++ b/reddwarf/types/demdis.py
@@ -0,0 +1,72 @@
+from typing import Annotated
+from datetime import datetime
+
+
+# Source: https://github.com/Demdis/Clustering-types/blob/main/types.py
+
+class VoteValueEnum(str):  # NOTE: subclasses str only, not enum.Enum; the members below are plain str constants
+    AGREE = "agree"
+    DISAGREE = "disagree"
+    SKIP = "skip"
+
+
+class VoteModel():  # annotation-only declaration: no __init__, defaults or base class are defined
+    id: int
+    conversation_id: int
+    statement_id: int
+    voted_by_participant_id: int
+
+    value: VoteValueEnum  # VoteValueEnum members are the strings "agree" / "disagree" / "skip"
+
+
+class ClusteringCenterModel():  # annotation-only declaration holding just the two center coordinates
+    center_x: float
+    center_y: float
+
+
+class StatementConversationMetric():  # annotation-only declaration of the metric fields kept per statement_id
+    statement_id: int
+    mean_agreement_percentage: float
+    consensus_points: int
+    polarization_measurement: float
+
+
+class ClusteringParticipant():  # annotation-only declaration: a participant id, its cluster label and x / y values
+    participant_id: int
+    cluster_center_name: str
+    x: float
+    y: float
+
+
+class ClusteredStatement():  # annotation-only declaration of vote counts and coefficients for one statement within one named cluster
+    statement_id: int
+    cluster_center_name: str
+    agreement_count: int
+    disagreement_count: int
+    skip_count: int
+    unseen_count: int
+    agreement_percentage: float  # presumably on a 0-100 scale; confirm against the producer of this record
+    cluster_defining_pos_coefficient: float
+    cluster_defining_neg_coefficient: float
+    cluster_defining_skip_coefficient: float
+
+
+class ClusteringCenter():  # annotation-only declaration: a named center plus its nested participant and statement records
+    name: str
+    center_x: float
+    center_y: float
+    participant_count: int
+
+    participants: list[ClusteringParticipant]
+    statements: list[ClusteredStatement]
+
+
+class ClusteringResult():  # annotation-only declaration of the top-level result: summary counts plus nested metrics and centers
+    participant_count: int
+    participants_clustered: int
+    vote_count: int
+    statement_count: int
+    last_vote_at: datetime
+
+    statement_metrics: list[StatementConversationMetric]
+    centers: list[ClusteringCenter]

From 6393083794d198bb0efb2020477795becc7d219d Mon Sep 17 00:00:00 2001
From: Patrick Connolly <patrick.c.connolly@gmail.com>
Date: Tue, 25 Feb 2025 15:16:05 -0800
Subject: [PATCH 04/10] Added basic reproduction of DemDis run_clustering
 (still missing statement statistics).

---
 reddwarf/demdis.py     | 124 +++++++++++++++++++++++++++++++++++++++++
 reddwarf/types/base.py |   6 ++
 2 files changed, 130 insertions(+)
 create mode 100644 reddwarf/demdis.py
 create mode 100644 reddwarf/types/base.py

diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py
new file mode 100644
index 0000000..2d148af
--- /dev/null
+++ b/reddwarf/demdis.py
@@ -0,0 +1,124 @@
+from typing import Annotated
+from reddwarf.types.demdis import ClusteringResult, VoteModel, ClusteringCenterModel, ClusteringCenter, VoteValueEnum as DemDisVoteValueEnum
+from reddwarf.types.base import VoteValueEnum as BaseVoteValueEnum
+from reddwarf import utils
+
+
+DEFAULT_MIN_USER_VOTE_THRESHOLD = 7
+DEFAULT_MAX_CLUSTERS = 5
+DEFAULT_CLUSTER_NAMES = ["A", "B", "C", "D", "E"]  # indexed by cluster id in run_clustering; only five names, so a sixth cluster would raise IndexError
+
+DEMDIS_VOTE_KEY_MAPPING = {  # DemDis vote key -> key name used once votes are handed to utils.generate_raw_matrix
+    "voted_by_participant_id": "participant_id",
+    "value": "vote",
+}
+
+DEMDIS_VOTE_VALUE_MAPPING = {  # DemDis string vote value -> BaseVoteValueEnum member
+    DemDisVoteValueEnum.AGREE:    BaseVoteValueEnum.UP,
+    DemDisVoteValueEnum.SKIP:     BaseVoteValueEnum.NEUTRAL,
+    DemDisVoteValueEnum.DISAGREE: BaseVoteValueEnum.DOWN,
+}
+
+def remap_vote_values(votes, value_mapping, key_mapping=None):
+    """Return new vote dicts with keys renamed and "vote" values translated.
+
+    votes: iterable of dict-like votes; the inputs are not modified.
+    value_mapping: lookup applied to the value stored under the (renamed)
+        "vote" key; values without an entry are kept unchanged.
+    key_mapping: optional lookup applied to every key; keys without an
+        entry are kept unchanged. None (the default) renames nothing.
+    """
+    # A None default avoids sharing one mutable dict between calls.
+    key_mapping = {} if key_mapping is None else key_mapping
+
+    remapped_votes = []
+    for vote in votes:
+        # Rename keys first so that value translation sees the final "vote" key.
+        rekeyed = {key_mapping.get(k, k): val for k, val in vote.items()}
+        remapped_votes.append(
+            {k: (value_mapping.get(val, val) if k == "vote" else val) for k, val in rekeyed.items()}
+        )
+    return remapped_votes
+
+# See: https://github.com/Demdis/Clustering-types/blob/main/types.py
+def run_clustering(
+    *,  # keyword-only arguments; statement_boost is accepted but never read in this body
+    votes: list[VoteModel],
+    reference_cluster_centers: list[ClusteringCenterModel] | None,
+    statement_boost: tuple[Annotated[int, "statement id"], Annotated[float, "boost"]] | None = None,
+    specific_cluster_count: int | None = None,
+    do_remap = True,
+) -> ClusteringResult:
+    if do_remap:
+        votes = remap_vote_values(votes, DEMDIS_VOTE_VALUE_MAPPING, DEMDIS_VOTE_KEY_MAPPING)
+
+    raw_vote_matrix = utils.generate_raw_matrix(votes=votes)
+    all_statement_ids = raw_vote_matrix.columns
+
+    filtered_vote_matrix = utils.filter_matrix(
+        vote_matrix=raw_vote_matrix,
+        min_user_vote_threshold=DEFAULT_MIN_USER_VOTE_THRESHOLD,
+        active_statement_ids=all_statement_ids,
+    )
+
+    projected_data, _, _ = utils.run_pca(vote_matrix=filtered_vote_matrix)  # only the first of the three return values is kept
+
+    projected_data = utils.scale_projected_data(
+        projected_data=projected_data,
+        vote_matrix=filtered_vote_matrix,
+    )
+
+    # TODO: Confirm init_centers works (the same reference centers go to both branches; find_optimal_k reuses them for every k it tests).
+    if specific_cluster_count:  # None (or 0) falls through to the find_optimal_k search below
+        cluster_labels, cluster_centers = utils.run_kmeans(
+            dataframe=projected_data,
+            n_clusters=specific_cluster_count,
+            init_centers=([[c["center_x"], c["center_y"]] for c in reference_cluster_centers] if reference_cluster_centers else None)
+        )
+    else:
+        _, _, cluster_labels, cluster_centers = utils.find_optimal_k(
+            projected_data=projected_data,
+            max_group_count=DEFAULT_MAX_CLUSTERS,
+            init_centers=([[c["center_x"], c["center_y"]] for c in reference_cluster_centers] if reference_cluster_centers else None)
+        )
+
+    # Add cluster label column to dataframe.
+    projected_data = projected_data.assign(cluster_id=cluster_labels)
+    # Convert participant_id index into regular column, for ease of transformation.
+    projected_data = projected_data.reset_index()
+
+    def build_centers(projected_data):
+        centers = [
+            {
+                "name": DEFAULT_CLUSTER_NAMES[cluster_id],
+                "center_x": float(cluster_centers[cluster_id][0]),
+                "center_y": float(cluster_centers[cluster_id][1]),
+                "participant_count": len(group_df),
+                "participants": [
+                    {
+                        "participant_id": row.participant_id,
+                        "cluster_center_name": DEFAULT_CLUSTER_NAMES[cluster_id],
+                        "x": row.x,
+                        "y": row.y,
+                    }
+                    for row in group_df.itertuples(index=False)
+                ],
+                "statements": [], # TODO
+            }
+            for cluster_id, group_df in projected_data.groupby("cluster_id")
+        ]
+
+        return centers
+
+    result: ClusteringResult = {
+        "participant_count": len(raw_vote_matrix.index),
+        "participants_clustered": len(filtered_vote_matrix.index),
+        "vote_count": int(raw_vote_matrix.count().sum()),
+        "statement_count": len(raw_vote_matrix.columns),
+        "last_vote_at": None, # TODO
+
+        "statement_metrics": [],
+        "centers": build_centers(projected_data),
+    }
+
+    return result
diff --git a/reddwarf/types/base.py b/reddwarf/types/base.py
new file mode 100644
index 0000000..0ea39fd
--- /dev/null
+++ b/reddwarf/types/base.py
@@ -0,0 +1,6 @@
+from enum import IntEnum
+
+class VoteValueEnum(IntEnum):  # integer vote encoding; reddwarf.demdis maps DemDis agree / skip / disagree onto UP / NEUTRAL / DOWN
+    UP = 1
+    NEUTRAL = 0
+    DOWN = -1

From f3d49d1d133b668dcf0c1e2728ab0944d6fa038c Mon Sep 17 00:00:00 2001
From: Patrick Connolly <patrick.c.connolly@gmail.com>
Date: Tue, 25 Feb 2025 15:16:19 -0800
Subject: [PATCH 05/10] Added simple debug test for DemDis.

---
 debug.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/debug.py b/debug.py
index 1bd0827..3566017 100644
--- a/debug.py
+++ b/debug.py
@@ -29,7 +29,7 @@
     },
 }
 
-if True:
+if False:
     report_id = CONVOS["tech-politics-2018"]["report_id"]
     print(f"Loading data from report: https://pol.is/report/{report_id}")
 
@@ -49,6 +49,26 @@
     presenter.render_optimal_cluster_figure()
     # client.generate_figure(coord_dataframe=client.projected_data)
 
+if True:
+    # test demdis method
+    from reddwarf.demdis import run_clustering
+
+    report_id = CONVOS["tech-politics-2018"]["report_id"]
+    print(f"Loading data from report: https://pol.is/report/{report_id}")
+
+    client = PolisClient()
+    client.load_data(report_id=report_id)
+
+    results = run_clustering(
+        votes=client.data_loader.votes_data,
+        reference_cluster_centers=None,
+        # specific_cluster_count=2,
+        do_remap=False,
+    )
+
+    from pprint import pprint
+    pprint(results)
+
 if False:
     # Show convo with duplicate votes.
     # Shareable demo: https://gist.github.com/patcon/9c1a39291cd75b23722a5379d7cfc3cc

From 6e2b4090234534acd3dc65f26e9a761ee1d8a78f Mon Sep 17 00:00:00 2001
From: Patrick Connolly <patrick.c.connolly@gmail.com>
Date: Tue, 25 Feb 2025 16:09:37 -0800
Subject: [PATCH 06/10] Rename do_remap to skip_remap.

---
 debug.py           | 2 +-
 reddwarf/demdis.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/debug.py b/debug.py
index 3566017..6efa6c9 100644
--- a/debug.py
+++ b/debug.py
@@ -63,7 +63,7 @@
         votes=client.data_loader.votes_data,
         reference_cluster_centers=None,
         # specific_cluster_count=2,
-        do_remap=False,
+        skip_remap=True,
     )
 
     from pprint import pprint
diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py
index 2d148af..5fc08f1 100644
--- a/reddwarf/demdis.py
+++ b/reddwarf/demdis.py
@@ -47,9 +47,9 @@ def run_clustering(
     reference_cluster_centers: list[ClusteringCenterModel] | None,
     statement_boost: tuple[Annotated[int, "statement id"], Annotated[float, "boost"]] | None = None,
     specific_cluster_count: int | None = None,
-    do_remap = True,
+    skip_remap = False,
 ) -> ClusteringResult:
-    if do_remap:
+    if not skip_remap:
         votes = remap_vote_values(votes, DEMDIS_VOTE_VALUE_MAPPING, DEMDIS_VOTE_KEY_MAPPING)
 
     raw_vote_matrix = utils.generate_raw_matrix(votes=votes)

From 67b3b2b10465ca764e445fbccd5e03cd13d6c58d Mon Sep 17 00:00:00 2001
From: Patrick Connolly <patrick.c.connolly@gmail.com>
Date: Tue, 25 Feb 2025 17:44:32 -0800
Subject: [PATCH 07/10] No need to reset_index on df.

---
 reddwarf/demdis.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py
index 5fc08f1..d0296e6 100644
--- a/reddwarf/demdis.py
+++ b/reddwarf/demdis.py
@@ -84,8 +84,6 @@ def run_clustering(
 
     # Add cluster label column to dataframe.
     projected_data = projected_data.assign(cluster_id=cluster_labels)
-    # Convert participant_id index into regular column, for ease of transformation.
-    projected_data = projected_data.reset_index()
 
     def build_centers(projected_data):
         centers = [
@@ -96,12 +94,12 @@ def build_centers(projected_data):
                 "participant_count": len(group_df),
                 "participants": [
                     {
-                        "participant_id": row.participant_id,
+                        "participant_id": row.Index,  # itertuples(index=True) names the index field "Index"; row.index is tuple.index
                         "cluster_center_name": DEFAULT_CLUSTER_NAMES[cluster_id],
                         "x": row.x,
                         "y": row.y,
                     }
-                    for row in group_df.itertuples(index=False)
+                    for row in group_df.itertuples(index=True)
                 ],
                 "statements": [], # TODO
             }

From cced506a848f57f4f57bd8d37f6e15f1ca2af084 Mon Sep 17 00:00:00 2001
From: Patrick Connolly <patrick.c.connolly@gmail.com>
Date: Tue, 25 Feb 2025 17:45:15 -0800
Subject: [PATCH 08/10] Added a bunch more data for ClusteredStatements object.

---
 reddwarf/demdis.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py
index d0296e6..326eba5 100644
--- a/reddwarf/demdis.py
+++ b/reddwarf/demdis.py
@@ -85,13 +85,16 @@ def run_clustering(
     # Add cluster label column to dataframe.
     projected_data = projected_data.assign(cluster_id=cluster_labels)
 
-    def build_centers(projected_data):
+    merged_df = projected_data.join(filtered_vote_matrix)
+
+
+    def build_centers(df):
         centers = [
             {
                 "name": DEFAULT_CLUSTER_NAMES[cluster_id],
                 "center_x": float(cluster_centers[cluster_id][0]),
                 "center_y": float(cluster_centers[cluster_id][1]),
-                "participant_count": len(group_df),
+                "participant_count": (group_participant_count := len(group_df)),
                 "participants": [
                     {
                         "participant_id": row.Index,  # itertuples(index=True) names the index field "Index"; row.index is tuple.index
@@ -101,9 +104,23 @@ def build_centers(projected_data):
                     }
                     for row in group_df.itertuples(index=True)
                 ],
-                "statements": [], # TODO
+                "statements": [
+                    {
+                        "statement_id": statement_id,
+                        "cluster_center_name": DEFAULT_CLUSTER_NAMES[cluster_id],
+                        "agreement_count": (agreement_count := int((group_df[statement_id] == 1).sum())),
+                        "disagreement_count": int((group_df[statement_id] == -1).sum()),
+                        "skip_count": int((group_df[statement_id] == 0).sum()),
+                        "unseen_count": int(group_df[statement_id].isna().sum()),
+                        "agreement_percentage": (agreement_count / group_participant_count) * 100,
+                        "cluster_defining_pos_coefficient": 0.0, # TODO
+                        "cluster_defining_neg_coefficient": 0.0, # TODO
+                        "cluster_defining_skip_coefficient": 0.0, # TODO
+                    }
+                    for statement_id in filtered_vote_matrix.columns
+                ],
             }
-            for cluster_id, group_df in projected_data.groupby("cluster_id")
+            for cluster_id, group_df in df.groupby("cluster_id")
         ]
 
         return centers
@@ -116,7 +133,7 @@ def build_centers(projected_data):
         "last_vote_at": None, # TODO
 
         "statement_metrics": [],
-        "centers": build_centers(projected_data),
+        "centers": build_centers(merged_df),
     }
 
     return result

From 6e85dd67a8020a244cc79a133396ee85bd3fb32b Mon Sep 17 00:00:00 2001
From: Patrick Connolly <patrick.c.connolly@gmail.com>
Date: Tue, 25 Feb 2025 18:03:01 -0800
Subject: [PATCH 09/10] Added last_vote_at key to demdis response.

---
 reddwarf/demdis.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py
index 326eba5..acef832 100644
--- a/reddwarf/demdis.py
+++ b/reddwarf/demdis.py
@@ -2,6 +2,7 @@
 from reddwarf.types.demdis import ClusteringResult, VoteModel, ClusteringCenterModel, ClusteringCenter, VoteValueEnum as DemDisVoteValueEnum
 from reddwarf.types.base import VoteValueEnum as BaseVoteValueEnum
 from reddwarf import utils
+from datetime import datetime
 
 
 DEFAULT_MIN_USER_VOTE_THRESHOLD = 7
@@ -52,6 +53,11 @@ def run_clustering(
     if not skip_remap:
         votes = remap_vote_values(votes, DEMDIS_VOTE_VALUE_MAPPING, DEMDIS_VOTE_KEY_MAPPING)
 
+    # "modified" is read as a millisecond epoch (it is divided by 1000 below).
+    # DemDis VoteModel declares no such field, so a missing key counts as 0
+    # instead of raising KeyError; the leading 0 also covers empty input.
+    last_vote_timestamp = max([0] + [vote.get("modified", 0) for vote in votes])
+
     raw_vote_matrix = utils.generate_raw_matrix(votes=votes)
     all_statement_ids = raw_vote_matrix.columns
 
@@ -130,7 +136,7 @@ def build_centers(df):
         "participants_clustered": len(filtered_vote_matrix.index),
         "vote_count": int(raw_vote_matrix.count().sum()),
         "statement_count": len(raw_vote_matrix.columns),
-        "last_vote_at": None, # TODO
+        "last_vote_at": datetime.fromtimestamp(last_vote_timestamp/1000),
 
         "statement_metrics": [],
         "centers": build_centers(merged_df),

From 27924dc2b91449e957070b96860d394c6b661c7b Mon Sep 17 00:00:00 2001
From: Patrick Connolly <patrick.c.connolly@gmail.com>
Date: Tue, 25 Feb 2025 18:09:12 -0800
Subject: [PATCH 10/10] Stubbed out statement_metrics.

---
 reddwarf/demdis.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/reddwarf/demdis.py b/reddwarf/demdis.py
index acef832..ac2bee2 100644
--- a/reddwarf/demdis.py
+++ b/reddwarf/demdis.py
@@ -138,7 +138,16 @@ def build_centers(df):
         "statement_count": len(raw_vote_matrix.columns),
         "last_vote_at": datetime.fromtimestamp(last_vote_timestamp/1000),
 
-        "statement_metrics": [],
+        "statement_metrics": [
+            {
+                "statement_id": statement_id,
+                "mean_agreement_percentage": 0.0, # TODO
+                "consensus_points": 0, # TODO
+                "polarization_measurement": 0.0, # TODO
+
+            }
+            for statement_id in filtered_vote_matrix.columns
+        ],
         "centers": build_centers(merged_df),
     }