SplitLight/src/stats/temporal.py at main · monkey0head/SplitLight · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from typing import Optional

import pandas as pd

from ..preprocess.filters import filter_by_date


def deltas_between_subsets(
    input_data: pd.DataFrame, target_data: pd.DataFrame
) -> pd.DataFrame:
    """
    Compute, per user, the gap between the last input and first target interaction.

    For every user present in ``target_data`` the earliest target timestamp is
    taken and the latest timestamp of the same user in ``input_data`` is
    subtracted from it.

    Args:
        input_data (DataFrame): Input interaction data with 'user_id' and 'timestamp' columns.
        target_data (DataFrame): Target interaction data with 'user_id' and 'timestamp' columns.

    Returns:
        DataFrame: One row per user found in ``target_data`` (ordered by
                   'user_id'), with columns ['user_id', 'timestamp', 'delta'].
                   'timestamp' is the user's first target timestamp and 'delta'
                   is that value minus the user's last input timestamp. Users
                   that do not appear in ``input_data`` get a missing 'delta'.

    Note:
        Missing timestamps are ignored when picking the first/last interaction,
        so a single missing input timestamp no longer blanks out a user's delta.
    """
    # Vectorised group reductions: no full-frame sort and no per-group Python
    # callback are needed to obtain the earliest / latest timestamp per user.
    first_target = target_data.groupby("user_id")["timestamp"].min().reset_index()
    last_input = input_data.groupby("user_id")["timestamp"].max()

    # Align each target user with their last input timestamp; users absent
    # from ``input_data`` map to a missing value and thus a missing delta.
    first_target["delta"] = first_target["timestamp"] - first_target["user_id"].map(
        last_input
    )

    return first_target


def inters_per_period(
    data: pd.DataFrame,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    granularity: str = "D",
) -> pd.DataFrame:
    """
    Count interactions falling into each consecutive time bin.

    The data is first restricted with ``filter_by_date`` and then resampled at
    the requested frequency; the non-null 'item_id' entries of every bin are
    counted.

    Args:
        data (DataFrame): DataFrame containing 'timestamp' and 'item_id'.
        start_date (str, optional): Start date for filtering (inclusive) in DD/MM/YYYY format.
        end_date (str, optional): End date for filtering (inclusive) in DD/MM/YYYY format.
        granularity (str): pandas resampling frequency (e.g., 'D', 'W').

    Returns:
        DataFrame: One row per time bin, with the bin label column followed by
                   an 'n_inters' column holding the interaction count.
    """
    # NOTE(review): resample() is called without ``on=``, so filter_by_date
    # presumably returns a frame indexed by timestamp -- confirm in filters.py.
    in_range = filter_by_date(data, start_date, end_date)

    counts_per_bin = in_range.resample(granularity)["item_id"].count()

    # Turn the bin index back into a regular column and name the counts.
    return counts_per_bin.reset_index(name="n_inters")


def time_counts(
    data: pd.DataFrame,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    granularity: str = "h",
    normalize: bool = False,
) -> pd.DataFrame:
    """
    Aggregate interaction counts by a calendar component of the timestamp.

    Interactions are first counted per period via ``inters_per_period`` (using
    ``granularity`` as the resampling frequency), then the periods are folded
    onto the matching calendar component: hour, day of week, month, or year.

    Args:
        data (DataFrame): DataFrame containing 'timestamp' and 'item_id'.
        start_date (str, optional): Start date for filtering (inclusive) in DD/MM/YYYY format.
        end_date (str, optional): End date for filtering (inclusive) in DD/MM/YYYY format.
        granularity (str): Time unit for aggregation ('h', 'd', 'm', 'y');
            matched case-insensitively when choosing the component.
        normalize (bool): If True, divide the counts by their total so that
            they sum to one.

    Returns:
        DataFrame: Two columns -- the component ('hour', 'day_of_week',
                   'month' or 'year') and 'n_inters'.

    Raises:
        KeyError: If ``granularity`` (lower-cased) is not one of the supported
            units.
    """
    per_period = inters_per_period(
        data.copy(), start_date, end_date, granularity=granularity
    )

    # Each value is both the output column name and the name of the
    # ``Series.dt`` attribute that yields the component.
    component_by_unit = {
        "h": "hour",
        "d": "day_of_week",
        "m": "month",
        "y": "year",
    }
    component = component_by_unit[granularity.lower()]

    # Pull the calendar component out of each period label.
    per_period[component] = getattr(per_period["timestamp"].dt, component)

    # Collapse all periods that share the same component value.
    totals = per_period.groupby(component)["n_inters"].sum().reset_index()

    if normalize:
        totals["n_inters"] /= totals["n_inters"].sum()

    return totals