-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtemporal.py
More file actions
113 lines (90 loc) · 3.83 KB
/
temporal.py
File metadata and controls
113 lines (90 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from typing import Optional
import pandas as pd
from ..preprocess.filters import filter_by_date
def deltas_between_subsets(
    input_data: pd.DataFrame, target_data: pd.DataFrame
) -> pd.DataFrame:
    """
    Calculates, for each user, the time difference between the last timestamp
    in the input_data and the first timestamp in the target_data.

    Missing timestamps are ignored on both sides: the earliest non-missing
    target timestamp is compared with the latest non-missing input timestamp.

    Args:
        input_data (DataFrame): Input interaction data with 'user_id' and 'timestamp' columns.
        target_data (DataFrame): Target interaction data with 'user_id' and 'timestamp' columns.

    Returns:
        DataFrame: A DataFrame with columns ['user_id', 'timestamp', 'delta'],
            one row per user present in target_data (ordered by 'user_id').
            'timestamp' is the user's first target interaction and 'delta' is
            the difference between it and the user's last input interaction.
            Users without any input interaction get a missing 'delta'.
    """
    # Vectorised min/max replace a full sort plus a per-group Python lambda and,
    # unlike positional `iloc[-1]` on sorted data, never pick a trailing NaT.
    first_target = (
        target_data.groupby("user_id")["timestamp"].min().reset_index()
    )
    # Series indexed by user_id, used below as a lookup table.
    last_input = input_data.groupby("user_id")["timestamp"].max()

    # `map` leaves a missing value for users that never appear in input_data.
    first_target["delta"] = first_target["timestamp"] - first_target[
        "user_id"
    ].map(last_input)
    return first_target
def inters_per_period(
    data: pd.DataFrame,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    granularity: str = "D",
) -> pd.DataFrame:
    """
    Counts interactions falling into each consecutive time interval
    (e.g., per day, per week).

    Args:
        data (DataFrame): DataFrame containing 'timestamp' and 'item_id'.
        start_date (str, optional): Start date for filtering (inclusive) in DD/MM/YYYY format.
        end_date (str, optional): End date for filtering (inclusive) in DD/MM/YYYY format.
        granularity (str): pandas resampling frequency (e.g., 'D', 'W').

    Returns:
        DataFrame: One row per time period, with the interaction count stored
            in the 'n_inters' column.
    """
    # NOTE(review): `resample` needs a datetime-like index, so `filter_by_date`
    # presumably returns the frame indexed by timestamp - confirm in its source.
    in_range = filter_by_date(data, start_date, end_date)

    # Non-null 'item_id' entries per resampling bucket.
    per_bucket = in_range.resample(granularity)["item_id"].count()
    return per_bucket.reset_index(name="n_inters")
def time_counts(
    data: pd.DataFrame,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    granularity: str = "h",
    normalize: bool = False,
) -> pd.DataFrame:
    """
    Aggregates interaction counts by a calendar component of the timestamp
    (hour of day, day of week, month, or year).

    Args:
        data (DataFrame): DataFrame containing 'timestamp' and 'item_id'.
        start_date (str, optional): Start date for filtering (inclusive) in DD/MM/YYYY format.
        end_date (str, optional): End date for filtering (inclusive) in DD/MM/YYYY format.
        granularity (str): Time unit for aggregation ('h', 'd', 'm', 'y'),
            matched case-insensitively.
        normalize (bool): If True, divide the counts by their total so that
            they sum to 1.

    Returns:
        DataFrame: One row per time-unit value, with a column named after the
            unit ('hour', 'day_of_week', 'month' or 'year') and an 'n_inters'
            column holding the (optionally normalized) interaction counts.

    Raises:
        KeyError: If granularity is not one of the supported units.
    """
    # Work on a copy so the caller's frame is never touched downstream.
    per_period = inters_per_period(
        data.copy(), start_date, end_date, granularity=granularity
    )

    # Each value is both the output column name and the attribute to read
    # from the `.dt` accessor, so a single lookup covers every unit.
    component_by_code = {
        "h": "hour",
        "d": "day_of_week",
        "m": "month",
        "y": "year",
    }
    component = component_by_code[granularity.lower()]
    per_period[component] = getattr(per_period["timestamp"].dt, component)

    # Collapse the per-period counts onto the extracted component.
    totals = per_period.groupby(component)["n_inters"].sum().reset_index()
    if not normalize:
        return totals

    totals["n_inters"] /= totals["n_inters"].sum()
    return totals