ContextLab
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 1 deletion b/‎.gitignore‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 17 additions & 0 deletions b/‎README.md‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎code/compute_stats.py‎
Lines changed: 185 additions & 0 deletions b/‎code/compute_stats.py‎
Lines changed: 185 additions & 0 deletions
diff --git a/‎data/model_results.pkl‎
22.3 KB b/‎data/model_results.pkl‎
22.3 KB
diff --git a/‎models/austen_tokenizer=gpt2_seed=0/config.json‎
Lines changed: 2 additions & 2 deletions b/‎models/austen_tokenizer=gpt2_seed=0/config.json‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎models/austen_tokenizer=gpt2_seed=0/generation_config.json‎
Lines changed: 1 addition & 1 deletion b/‎models/austen_tokenizer=gpt2_seed=0/generation_config.json‎
Lines changed: 1 addition & 1 deletion
@@ -26,4 +26,6 @@ tests/data/*.pkl
 !tests/data/test_model_results.pkl
 
 # Temporary test files
-.test_credentials
+.test_credentials
+models/*/model.safetensors
+models/*/training_state.pt
@@ -110,6 +110,9 @@ pip install -e .
 # Train models from scratch (requires GPU)
 ./run_llm_stylometry.sh -t
 
+# Compute statistical analyses (Table 1 and key statistics)
+./run_stats.sh
+
 # Custom data and output paths
 ./run_llm_stylometry.sh -d path/to/model_results.pkl -o path/to/output
 
@@ -168,6 +171,20 @@ fig = generate_all_losses_figure(
 - **4**: Figure 4 - 3D MDS plot (3d_MDS_plot.pdf)
 - **5**: Figure 5 - Oz authorship analysis (oz_losses.pdf)
 
+### Statistical Analysis
+
+Generate key statistics from the paper:
+
+```bash
+# Compute statistical analyses
+./run_stats.sh
+```
+
+This produces:
+- **Twain p-threshold analysis**: Epoch where Twain model first achieves p < 0.001
+- **Average t-test**: One-sample t-test (against 0) of the per-seed average t-statistics at epoch 500
+- **Table 1**: Individual author model t-tests comparing self vs. other losses
+
 ## Training Models from Scratch
 
 **Note**: Training requires a CUDA-enabled GPU and takes significant time (~80 models total).
 
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+"""
+Compute statistics for LLM stylometry paper reproduction.
+"""
+
+import pickle
+import pandas as pd
+import numpy as np
+from scipy import stats
+from pathlib import Path
+from constants import AUTHORS
+
+def load_data():
+    """Load the model results data."""
+    with open('data/model_results.pkl', 'rb') as f:
+        return pickle.load(f)
+
+
+def find_twain_threshold_epoch(df, p_threshold=0.001):
+    """
+    Find the epoch where Twain model's p-value first drops below threshold.
+    This corresponds to t-threshold of 3.291 for p < 0.001.
+    """
+    # Filter for Twain models comparing Twain vs other authors
+    twain_df = df[df['train_author'] == 'twain'].copy()
+
+    # Get unique epochs sorted
+    epochs = sorted(twain_df['epochs_completed'].unique())
+
+    for epoch in epochs:
+        epoch_df = twain_df[twain_df['epochs_completed'] == epoch]
+
+        # Get self losses (Twain model on Twain text)
+        self_losses = epoch_df[epoch_df['loss_dataset'] == 'twain']['loss_value'].values
+
+        # Get other losses (Twain model on other authors' texts)
+        other_authors = [a for a in AUTHORS if a != 'twain']
+        other_losses = epoch_df[epoch_df['loss_dataset'].isin(other_authors)]['loss_value'].values
+
+        if len(self_losses) >= 10 and len(other_losses) >= 70:
+            # Perform t-test (other vs self)
+            t_stat, p_value = stats.ttest_ind(other_losses, self_losses, equal_var=False)
+
+            if p_value < p_threshold:
+                return epoch, t_stat, p_value
+
+    return None, None, None
+
+
+def compute_average_t_test(df, epoch=500):
+    """
+    Compute t-test comparing average t-values across seeds to 0.
+    For each seed, compute average t-statistic across all authors.
+    This reproduces the test on line 230 of the paper.
+    """
+    # For each seed, get the t-statistics for all authors
+    seed_avg_t_stats = []
+
+    for seed in range(10):
+        author_t_stats = []
+
+        for author in AUTHORS:
+            # Get all data for this author-seed combination
+            model_name = f"{author}_tokenizer=gpt2_seed={seed}"
+            model_df = df[df['model_name'] == model_name]
+
+            # Get data at the specified epoch (or closest if not exact)
+            epoch_data = model_df[model_df['epochs_completed'] <= epoch].groupby('loss_dataset').tail(1)
+
+            # Get self losses
+            self_losses = epoch_data[epoch_data['loss_dataset'] == author]['loss_value'].values
+
+            # Get other losses
+            other_authors = [a for a in AUTHORS if a != author]
+            other_losses = epoch_data[epoch_data['loss_dataset'].isin(other_authors)]['loss_value'].values
+
+            if len(self_losses) > 0 and len(other_losses) > 0:
+                # Use mean values if we only have one sample
+                if len(self_losses) == 1:
+                    # Compute t-statistic using difference of means and std of others
+                    mean_diff = np.mean(other_losses) - self_losses[0]
+                    std_other = np.std(other_losses)
+                    if std_other > 0:
+                        t_stat = mean_diff / (std_other / np.sqrt(len(other_losses)))
+                        author_t_stats.append(t_stat)
+                else:
+                    t_stat, _ = stats.ttest_ind(other_losses, self_losses, equal_var=False)
+                    if not np.isnan(t_stat):
+                        author_t_stats.append(t_stat)
+
+        # Average t-statistic across authors for this seed
+        if len(author_t_stats) == len(AUTHORS):
+            seed_avg_t_stats.append(np.mean(author_t_stats))
+
+    # Test if mean t-statistic is significantly different from 0
+    if len(seed_avg_t_stats) == 10:
+        t_stat, p_value = stats.ttest_1samp(seed_avg_t_stats, 0)
+        return t_stat, p_value, len(seed_avg_t_stats) - 1
+
+    return None, None, None
+
+
+def generate_author_comparison_table(df):
+    """
+    Generate table of t-tests comparing each author's model losses.
+    This reproduces Table 1 in the paper.
+    """
+    # Get final epoch data
+    final_df = df.groupby(['train_author', 'loss_dataset', 'seed']).tail(1)
+
+    # Use the same author order as in the figures
+    author_order = ['baum', 'thompson', 'austen', 'dickens', 'fitzgerald', 'melville', 'twain', 'wells']
+
+    results = []
+    for author in author_order:
+        author_df = final_df[final_df['train_author'] == author]
+
+        # Get self losses (model trained on author, tested on same author)
+        self_losses = author_df[author_df['loss_dataset'] == author]['loss_value'].values
+
+        # Get other losses (model trained on author, tested on other authors)
+        other_authors = [a for a in AUTHORS if a != author]
+        other_losses = author_df[author_df['loss_dataset'].isin(other_authors)]['loss_value'].values
+
+        if len(self_losses) >= 10 and len(other_losses) >= 70:
+            # Perform t-test (other vs self)
+            t_result = stats.ttest_ind(other_losses, self_losses, equal_var=False)
+
+            results.append({
+                'Model': author.capitalize(),
+                't-stat': f'{t_result.statistic:.2f}',
+                'df': f'{t_result.df:.2f}',
+                'p-value': f'{t_result.pvalue:.2e}'
+            })
+
+    return pd.DataFrame(results)
+
+
+def main():
+    """Main function to compute and display all statistics."""
+    print("=" * 60)
+    print("LLM Stylometry Statistical Analysis")
+    print("=" * 60)
+
+    # Load data
+    print("\nLoading data...")
+    df = load_data()
+
+    # 1. Find Twain threshold epoch
+    print("\n1. Twain Model P-Threshold Analysis")
+    print("-" * 40)
+    epoch, t_stat, p_value = find_twain_threshold_epoch(df)
+    if epoch is not None:
+        print(f"First epoch where p < 0.001: {epoch}")
+        print(f"t-statistic at epoch {epoch}: {t_stat:.3f}")
+        print(f"p-value at epoch {epoch}: {p_value:.3e}")
+    else:
+        print("Threshold not reached within training epochs")
+
+    # 2. Average t-test at final epoch
+    print("\n2. Average T-Test Across Authors (Epoch 500)")
+    print("-" * 40)
+    t_stat, p_value, df_val = compute_average_t_test(df, epoch=500)
+    if t_stat is not None:
+        print(f"t({df_val}) = {t_stat:.3f}, p = {p_value:.2e}")
+
+        # Format p-value in scientific notation
+        if p_value < 1e-10:
+            exponent = int(np.floor(np.log10(p_value)))
+            mantissa = p_value / (10 ** exponent)
+            print(f"(p-value in scientific notation: {mantissa:.1f} × 10^{exponent})")
+    else:
+        print("Insufficient data for t-test")
+
+    # 3. Author comparison table
+    print("\n3. Author Model Comparison Table (Table 1)")
+    print("-" * 40)
+    table = generate_author_comparison_table(df)
+    print("\n" + table.to_string(index=False))
+
+    print("\n" + "=" * 60)
+
+
+if __name__ == "__main__":
+    main()
@@ -5,6 +5,7 @@
   ],
   "attn_pdrop": 0.1,
   "bos_token_id": 50256,
+  "dtype": "float32",
   "embd_pdrop": 0.1,
   "eos_token_id": 50256,
   "initializer_range": 0.02,
@@ -24,8 +25,7 @@
   "summary_proj_to_labels": true,
   "summary_type": "cls_index",
   "summary_use_proj": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.45.2",
+  "transformers_version": "4.56.1",
   "use_cache": true,
   "vocab_size": 50257
 }
@@ -2,5 +2,5 @@
   "_from_model_config": true,
   "bos_token_id": 50256,
   "eos_token_id": 50256,
-  "transformers_version": "4.45.2"
+  "transformers_version": "4.56.1"
 }
Original file line number	Diff line number	Diff line change
`@@ -2,5 +2,5 @@`
`2`	`2`	`"_from_model_config": true,`
`3`	`3`	`"bos_token_id": 50256,`
`4`	`4`	`"eos_token_id": 50256,`
`5`		`- "transformers_version": "4.45.2"`
	`5`	`+ "transformers_version": "4.56.1"`
`6`	`6`	`}`