1+ #!/usr/bin/env python
2+ """
3+ Compute statistics for LLM stylometry paper reproduction.
4+ """
5+
6+ import pickle
7+ import pandas as pd
8+ import numpy as np
9+ from scipy import stats
10+ from pathlib import Path
11+ from constants import AUTHORS
12+
def load_data():
    """
    Read the pickled model results and return the unpickled object.

    The file is looked up at the relative path ``data/model_results.pkl``,
    so the result depends on the current working directory.

    NOTE: unpickling executes code embedded in the file; only use this on
    result files you produced yourself or otherwise trust.
    """
    with Path('data/model_results.pkl').open('rb') as handle:
        results = pickle.load(handle)
    return results
17+
18+
def find_twain_threshold_epoch(df, p_threshold=0.001):
    """
    Locate the earliest epoch at which the Twain models separate Twain's
    text from the other authors' text with p < ``p_threshold``.

    For every value of ``epochs_completed`` (in ascending order) the losses
    of Twain-trained models on Twain's text are compared with their losses
    on every other author's text using Welch's unequal-variance t-test.
    Epochs with fewer than 10 self losses or fewer than 70 other losses are
    skipped. The original author notes that p < 0.001 corresponds to a
    t-threshold of 3.291.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``train_author``, ``epochs_completed``,
        ``loss_dataset`` and ``loss_value``.
    p_threshold : float
        Significance level the p-value has to fall below.

    Returns
    -------
    tuple
        ``(epoch, t_stat, p_value)`` for the first qualifying epoch, or
        ``(None, None, None)`` when no epoch qualifies.
    """
    twain_rows = df.loc[df['train_author'] == 'twain'].copy()

    for epoch in sorted(twain_rows['epochs_completed'].unique()):
        at_epoch = twain_rows.loc[twain_rows['epochs_completed'] == epoch]
        evaluated_on = at_epoch['loss_dataset']

        # Twain model scored on Twain's own text.
        self_losses = at_epoch.loc[evaluated_on == 'twain', 'loss_value'].values

        # Twain model scored on every other known author's text.
        rivals = [name for name in AUTHORS if name != 'twain']
        other_losses = at_epoch.loc[evaluated_on.isin(rivals), 'loss_value'].values

        # Skip epochs that do not yet have the full set of measurements.
        if len(self_losses) < 10 or len(other_losses) < 70:
            continue

        # Welch's t-test, ordered (other, self) so a positive t means the
        # model's loss is higher on other authors than on Twain.
        t_stat, p_value = stats.ttest_ind(other_losses, self_losses, equal_var=False)
        if p_value < p_threshold:
            return epoch, t_stat, p_value

    return None, None, None
48+
49+
def _self_vs_other_t_stat(self_losses, other_losses):
    """Return the t-statistic of other-author losses vs. self losses, or None if undefined."""
    if len(self_losses) == 0 or len(other_losses) == 0:
        return None

    if len(self_losses) == 1:
        # A two-sample test is impossible with a single self loss, so compare
        # the mean of the other losses against that one value instead.
        # NOTE(review): np.std defaults to ddof=0 (population std). This is
        # kept unchanged so previously reported numbers stay the same;
        # confirm it matches the definition used in the paper.
        spread = np.std(other_losses)
        if spread > 0:
            mean_diff = np.mean(other_losses) - self_losses[0]
            return mean_diff / (spread / np.sqrt(len(other_losses)))
        return None

    t_stat, _ = stats.ttest_ind(other_losses, self_losses, equal_var=False)
    return None if np.isnan(t_stat) else t_stat


def compute_average_t_test(df, epoch=500, n_seeds=10):
    """
    Test whether the per-seed average t-statistic differs from 0.

    For each seed, a t-statistic (other-author losses vs. self losses) is
    computed for every author's model at ``epoch`` and the statistics are
    averaged across authors. The resulting per-seed averages are then
    compared to 0 with a one-sample t-test.
    This reproduces the test on line 230 of the paper.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``model_name``, ``epochs_completed``,
        ``loss_dataset`` and ``loss_value``. Models are looked up by the
        name ``"<author>_tokenizer=gpt2_seed=<seed>"``.
    epoch : int
        Epoch to evaluate at. For each evaluated dataset the latest row
        with ``epochs_completed <= epoch`` is used.
    n_seeds : int
        Number of seeds; seeds ``0 .. n_seeds - 1`` are required to be
        present. Defaults to 10, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(t_stat, p_value, degrees_of_freedom)``, or ``(None, None, None)``
        when any seed/author combination lacks usable data (or when
        ``n_seeds < 2``, where a one-sample t-test is undefined).
    """
    if n_seeds < 2:
        return None, None, None

    seed_avg_t_stats = []

    for seed in range(n_seeds):
        author_t_stats = []

        for author in AUTHORS:
            # The name must match exactly: no whitespace around the
            # interpolated fields, otherwise the filter selects nothing.
            model_name = f"{author}_tokenizer=gpt2_seed={seed}"
            model_df = df[df['model_name'] == model_name]

            # Latest measurement at or before `epoch` for each evaluated
            # dataset. Sort explicitly (stably) so the result does not
            # depend on the row order of the incoming frame.
            up_to_epoch = model_df[model_df['epochs_completed'] <= epoch]
            epoch_data = (
                up_to_epoch
                .sort_values('epochs_completed', kind='mergesort')
                .groupby('loss_dataset')
                .tail(1)
            )

            evaluated_on = epoch_data['loss_dataset']
            self_losses = epoch_data.loc[evaluated_on == author, 'loss_value'].values

            other_authors = [a for a in AUTHORS if a != author]
            other_losses = epoch_data.loc[evaluated_on.isin(other_authors), 'loss_value'].values

            t_stat = _self_vs_other_t_stat(self_losses, other_losses)
            if t_stat is None:
                # Every seed needs a statistic for every author; a single
                # gap already makes the overall test impossible.
                return None, None, None
            author_t_stats.append(t_stat)

        # Average t-statistic across authors for this seed.
        seed_avg_t_stats.append(np.mean(author_t_stats))

    # One-sample t-test of the per-seed averages against 0.
    t_stat, p_value = stats.ttest_1samp(seed_avg_t_stats, 0)
    return t_stat, p_value, len(seed_avg_t_stats) - 1
101+
102+
def generate_author_comparison_table(df):
    """
    Build the per-author t-test table (Table 1 in the paper).

    For each author, the last recorded loss of every
    (train_author, loss_dataset, seed) group is taken; the losses of that
    author's models on other authors' text are then compared with their
    losses on the author's own text using Welch's unequal-variance t-test.
    Authors with fewer than 10 self losses or fewer than 70 other losses
    are left out of the table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``train_author``, ``loss_dataset``,
        ``seed`` and ``loss_value``. The "last" row of each group is
        determined by the frame's existing row order.

    Returns
    -------
    pandas.DataFrame
        One row per author with the string-formatted columns ``Model``,
        ``t-stat``, ``df`` and ``p-value``.
    """
    # Last row of each (model author, evaluated author, seed) combination.
    last_rows = df.groupby(['train_author', 'loss_dataset', 'seed']).tail(1)

    rows = []
    # Same author order as in the figures.
    for author in ('baum', 'thompson', 'austen', 'dickens',
                   'fitzgerald', 'melville', 'twain', 'wells'):
        trained_on_author = last_rows.loc[last_rows['train_author'] == author]
        evaluated_on = trained_on_author['loss_dataset']

        # Model evaluated on its own author's text.
        self_losses = trained_on_author.loc[evaluated_on == author, 'loss_value'].values

        # Model evaluated on every other known author's text.
        rivals = [name for name in AUTHORS if name != author]
        other_losses = trained_on_author.loc[evaluated_on.isin(rivals), 'loss_value'].values

        if len(self_losses) < 10 or len(other_losses) < 70:
            continue

        # Ordered (other, self): a positive t means higher loss on others.
        welch = stats.ttest_ind(other_losses, self_losses, equal_var=False)
        rows.append({
            'Model': author.capitalize(),
            't-stat': f'{welch.statistic:.2f} ',
            'df': f'{welch.df:.2f} ',
            'p-value': f'{welch.pvalue:.2e} ',
        })

    return pd.DataFrame(rows)
137+
138+
def main():
    """
    Load the model results and print all three analyses to stdout:
    the Twain p-threshold epoch, the average t-test at epoch 500 and the
    per-author comparison table.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 40

    print(heavy_rule)
    print("LLM Stylometry Statistical Analysis")
    print(heavy_rule)

    print("\n Loading data...")
    df = load_data()

    # Section 1: first epoch at which the Twain models reach p < 0.001.
    print("\n 1. Twain Model P-Threshold Analysis")
    print(light_rule)
    epoch, t_stat, p_value = find_twain_threshold_epoch(df)
    if epoch is None:
        print("Threshold not reached within training epochs")
    else:
        print(f"First epoch where p < 0.001: {epoch} ")
        print(f"t-statistic at epoch {epoch} : {t_stat:.3f} ")
        print(f"p-value at epoch {epoch} : {p_value:.3e} ")

    # Section 2: one-sample t-test of the per-seed average t-statistics.
    print("\n 2. Average T-Test Across Authors (Epoch 500)")
    print(light_rule)
    t_stat, p_value, df_val = compute_average_t_test(df, epoch=500)
    if t_stat is None:
        print("Insufficient data for t-test")
    else:
        print(f"t({df_val} ) = {t_stat:.3f} , p = {p_value:.2e} ")

        # Very small p-values are additionally shown as mantissa x 10^exponent.
        if p_value < 1e-10:
            exponent = int(np.floor(np.log10(p_value)))
            mantissa = p_value / (10 ** exponent)
            print(f"(p-value in scientific notation: {mantissa:.1f} × 10^{exponent} )")

    # Section 3: per-author comparison table.
    print("\n 3. Author Model Comparison Table (Table 1)")
    print(light_rule)
    table = generate_author_comparison_table(df)
    print("\n " + table.to_string(index=False))

    print("\n " + heavy_rule)
183+
# Run the full analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments