-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathsimulation_saturation.py
More file actions
82 lines (66 loc) · 3.62 KB
/
simulation_saturation.py
File metadata and controls
82 lines (66 loc) · 3.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from utils import get_embedding, calculate_similarity, model
def compute_simulation_saturation(num_iterations):
file_path = 'Output/simulation_log.csv'
df = pd.read_csv(file_path)
filtered_df = df[df['Choice'] == "Posting new content"]
seen_contents = set() # Track the contents already published
repeated_counts = [] # Count the repeated contents for each iteration
num_repeated = 0 # Number of repeated contents
for iteration in range(1, num_iterations + 1):
group = filtered_df[filtered_df['Iteration'] == iteration] # Filter the data for the current iteration
for row in group.itertuples(): # Iterate over the rows of the filtered data
if pd.notna(row.Content): # Check if the 'Content' column is not empty
embedding = get_embedding(row.Content) # Get the embedding for the content
embedding_tuple = tuple(embedding.tolist()) # Convert the NumPy array to a tuple
is_in_seen_contents = False
if iteration == 1:
seen_contents.add(embedding_tuple)
else:
for seen_content in seen_contents:
if calculate_similarity(embedding, np.array(seen_content)) > 0.8:
is_in_seen_contents = True
num_repeated += 1
break
if not is_in_seen_contents:
seen_contents.add(embedding_tuple)
repeated_counts.append(num_repeated)
print(f"Iteration {iteration}: {num_repeated} repeated contents")
# Plot the repeated contents for each iteration
plt.plot(range(1, len(repeated_counts) + 1), repeated_counts, marker='o')
plt.title('Number of repeated contents for iteration')
plt.xlabel('Iteration')
plt.xticks(range(1, len(repeated_counts) + 1))
plt.ylabel('Repeated Contents')
plt.grid(True)
plt.show()
def compute_simulation_saturation_OLD():
file_path = 'Output/simulation_log.csv'
data = pd.read_csv(file_path)
data['Text'] = data['Reason'].fillna('') + ' ' + data['Content'].fillna('') # Concatenate the 'Reason' and 'Content' texts for each iteration
grouped_data = data.groupby('Iteration')['Text'].apply(lambda x: ' '.join(x)).reset_index() # Group the data by 'Iteration' and concatenate the texts within each group
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(grouped_data['Text'].tolist())
# Calculate cosine similarity between each iteration and all previous iterations
similarities = []
for i in range(1, len(embeddings)):
sim = cosine_similarity([embeddings[i]], embeddings[:i])
similarities.append(sim.max()) # Take the maximum similarity with previous iterations
# Plot the similarities
iterations = list(range(2, len(similarities) + 2)) # Generate a list of iteration numbers for the x-axis (starting from 2)
plt.plot(iterations, similarities, marker='o', label='Cosine Similarity')
plt.axhline(y=0.95, color='r', linestyle='--', label='Saturation Threshold')
plt.xlabel('Iterations')
plt.ylabel('Cosine Similarity')
plt.title('Similarity between Iterations and Previous Ones')
plt.legend()
plt.grid(True)
plt.show()
# Check if the last N iterations have a similarity greater than 0.95
N = 5
saturation = all(s > 0.95 for s in similarities[-N:])
print("Is the simulation saturating:", saturation)