-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathcold.py
More file actions
129 lines (98 loc) · 3.97 KB
/
cold.py
File metadata and controls
129 lines (98 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from typing import Dict, Optional, Tuple, Union
import pandas as pd
from .utils import resample_by_time
def get_cold(
    data: pd.DataFrame, reference_data: pd.DataFrame, col: str = "user_id"
) -> pd.DataFrame:
    """
    Flag every row of ``data`` whose ``col`` value is absent from ``reference_data``.

    Args:
        data: Target DataFrame to evaluate for cold entries.
        reference_data: Reference DataFrame whose ``col`` values count as known (warm).
        col: Column name to check for coldness (e.g., 'user_id' or 'item_id').

    Returns:
        A copy of ``data`` with an added boolean 'is_cold' column; the input
        frame itself is not modified.
    """
    # Distinct IDs seen in the reference frame are the warm set.
    known_ids = reference_data[col].unique()
    # A row is cold exactly when its ID is not among the warm ones.
    unseen_mask = ~data[col].isin(known_ids)
    # assign() returns a new frame, leaving the caller's `data` untouched.
    return data.assign(is_cold=unseen_mask)
def share_of_cold(
    data: pd.DataFrame, reference_data: pd.DataFrame, col: str = "user_id"
) -> Tuple[int, float, float]:
    """
    Calculate the count and shares of cold entities and cold interactions.

    An entity is cold when its ``col`` value does not occur in
    ``reference_data[col]`` (as decided by ``get_cold``).

    Args:
        data (pd.DataFrame): Target DataFrame to evaluate for cold entries.
        reference_data (pd.DataFrame): Reference DataFrame containing known entities.
        col (str): Column name to check for coldness (e.g., 'user_id' or 'item_id').

    Returns:
        Tuple[int, float, float]:
            - Number of unique cold entities.
            - Share of cold entities among the unique entities of ``data``;
              NaN when ``data`` contains no entities.
            - Share of cold interactions (rows of ``data`` flagged cold).
    """
    cold_df = get_cold(data, reference_data, col)
    is_cold = cold_df["is_cold"]

    # Number of unique cold entities
    num_cold = cold_df.loc[is_cold, col].nunique()
    # Number of unique entities overall (e.g., all users seen in `data`)
    num_total = cold_df[col].nunique()

    # nunique() yields plain ints, so dividing with no entities at all would
    # raise ZeroDivisionError; report the undefined share as NaN instead.
    share_by_count = num_cold / num_total if num_total else float("nan")

    # Share of cold interactions: mean of the boolean flag over all rows
    share_by_inter = is_cold.mean()

    return num_cold, share_by_count, share_by_inter
def cold_stats(data: pd.DataFrame, reference_data: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize cold-start statistics for users and items in one table.

    Args:
        data (pd.DataFrame): Target DataFrame to evaluate for cold entries.
        reference_data (pd.DataFrame): Reference DataFrame containing known entities.

    Returns:
        pd.DataFrame: One row per entity kind ('Cold Users', 'Cold Items') with
        the columns 'Number', 'Share (by count)' and 'Share (by interactions)',
        filled from ``share_of_cold`` on 'user_id' and 'item_id' respectively.
    """
    # (row label, ID column) pairs, evaluated in this order.
    row_specs = (("Cold Users", "user_id"), ("Cold Items", "item_id"))

    # Each share_of_cold() triple becomes one table row.
    rows = [
        list(share_of_cold(data, reference_data, id_col))
        for _, id_col in row_specs
    ]

    return pd.DataFrame(
        rows,
        index=[label for label, _ in row_specs],
        columns=["Number", "Share (by count)", "Share (by interactions)"],
    )
def cold_counts(
    data: pd.DataFrame,
    reference_data: pd.DataFrame,
    col: str = "user_id",
    granularity: Optional[str] = None,
) -> Dict[str, Union[pd.Series, float]]:
    """
    Compute total and cold interaction counts, optionally per time period.

    Args:
        data (pd.DataFrame): Target interactions DataFrame.
        reference_data (pd.DataFrame): Reference DataFrame containing known entities.
        col (str): Column name to check for coldness (e.g., 'user_id').
        granularity (Optional[str]): Time-based resampling granularity
            (e.g., 'D', 'W') from pandas. When falsy, the aggregates are
            computed over the whole of ``data``.

    Returns:
        Dict[str, Union[pd.Series, float]]: Dictionary with the keys
        'total_interactions', 'cold_interactions' and 'cold_share'
        (cold divided by total).
    """
    df = get_cold(data, reference_data, col)

    if granularity:
        # NOTE(review): resample_by_time lives in .utils and is not visible
        # here; presumably it groups rows by time period so the aggregates
        # below come out as per-period Series -- confirm against the helper.
        df = resample_by_time(df, granularity)

    # Count interactions through the 'is_cold' flag itself rather than a
    # hard-coded 'item_id' column: the flag is set on every row by get_cold,
    # so the total no longer needs an 'item_id' column to exist and cannot
    # fall below the cold count when 'item_id' has nulls.
    num_cold = df["is_cold"].sum()
    num_total = df["is_cold"].count()

    return {
        "total_interactions": num_total,
        "cold_interactions": num_cold,
        "cold_share": num_cold / num_total,
    }