Skip to content

Commit 8556d76

Browse files
feat: PandasLabeledDataProvider Added
1 parent daf4341 commit 8556d76

4 files changed

Lines changed: 523 additions & 1 deletion

File tree

examples/tutorial_dataset.ipynb

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Dataset and PandasLabeledDataProvider tutorial\n",
8+
"\n",
9+
"This notebook demonstrates how to build annotated time series with `PandasLabeledDataProvider`, combine them into `Dataset`, and select bisegments for NoReset experiments."
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": null,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"import pandas as pd\n",
19+
"\n",
20+
"from pysatl_cpd.core.data_providers.dataset import (\n",
21+
" Annotation,\n",
22+
" Dataset,\n",
23+
" PandasLabeledDataProvider,\n",
24+
")\n"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": null,
30+
"metadata": {},
31+
"outputs": [],
32+
"source": [
33+
"ts_one = pd.DataFrame(\n",
34+
" {\n",
35+
" \"value\": [1.0, 1.2, 1.1, 4.0, 3.9, 8.0, 8.1],\n",
36+
" \"aux\": [10, 11, 12, 20, 21, 30, 31],\n",
37+
" \"segments\": [0, 0, 0, 1, 1, 2, 2],\n",
38+
" }\n",
39+
")\n",
40+
"\n",
41+
"segment_info_one = pd.DataFrame(\n",
42+
" {\n",
43+
" \"start\": [0, 3, 5],\n",
44+
" \"end\": [2, 4, 6],\n",
45+
" \"label\": [\"stable\", \"middle\", \"shifted\"],\n",
46+
" }\n",
47+
")\n",
48+
"\n",
49+
"provider_one = PandasLabeledDataProvider(\n",
50+
" dataset=ts_one,\n",
51+
" segment_info=segment_info_one,\n",
52+
" annotation=Annotation(path=\"ts_one.csv\", scenario=\"A\", version=\"v1\"),\n",
53+
" name=\"series_one\",\n",
54+
")\n",
55+
"\n",
56+
"ts_two = pd.DataFrame(\n",
57+
" {\n",
58+
" \"value\": [0.5, 0.4, 2.5, 2.7],\n",
59+
" \"aux\": [7, 8, 9, 10],\n",
60+
" \"segments\": [0, 0, 1, 1],\n",
61+
" }\n",
62+
")\n",
63+
"\n",
64+
"segment_info_two = pd.DataFrame(\n",
65+
" {\n",
66+
" \"start\": [0, 2],\n",
67+
" \"end\": [1, 3],\n",
68+
" \"label\": [\"baseline\", \"changed\"],\n",
69+
" }\n",
70+
")\n",
71+
"\n",
72+
"provider_two = PandasLabeledDataProvider(\n",
73+
" dataset=ts_two,\n",
74+
" segment_info=segment_info_two,\n",
75+
" annotation=Annotation(path=\"ts_two.csv\", scenario=\"B\", version=\"v1\"),\n",
76+
" name=\"series_two\",\n",
77+
")\n",
78+
"\n",
79+
"dataset = Dataset([provider_one, provider_two])\n",
80+
"dataset"
81+
]
82+
},
83+
{
84+
"cell_type": "code",
85+
"execution_count": null,
86+
"metadata": {},
87+
"outputs": [],
88+
"source": [
89+
"# 1) Change points are inferred from the `segments` column.\n",
90+
"provider_one.change_point\n"
91+
]
92+
},
93+
{
94+
"cell_type": "code",
95+
"execution_count": null,
96+
"metadata": {},
97+
"outputs": [],
98+
"source": [
99+
"# 2) Select a subset of features while keeping the internal segmentation.\n",
100+
"provider_one_value_only = provider_one.select_columns([\"value\"])\n",
101+
"list(provider_one_value_only)[:3]\n"
102+
]
103+
},
104+
{
105+
"cell_type": "code",
106+
"execution_count": null,
107+
"metadata": {},
108+
"outputs": [],
109+
"source": [
110+
"# 3) Filter full dataset by annotation.\n",
111+
"scenario_a = dataset.filter_by_annotation(lambda ann: ann.scenario == \"A\")\n",
112+
"len(scenario_a.timeserieses)\n"
113+
]
114+
},
115+
{
116+
"cell_type": "code",
117+
"execution_count": null,
118+
"metadata": {},
119+
"outputs": [],
120+
"source": [
121+
"# 4) Select bisegments for NoReset mode.\n",
122+
"# Keep only pairs where the next segment starts from index >= 3.\n",
123+
"bisegments = dataset.select_bisegments_by_filter(lambda pair: pair[1].start >= 3)\n",
124+
"len(bisegments), [b.name for b in bisegments]\n"
125+
]
126+
},
127+
{
128+
"cell_type": "code",
129+
"execution_count": null,
130+
"metadata": {},
131+
"outputs": [],
132+
"source": [
133+
"# 5) Inspect one resulting bisegment.\n",
134+
"example_bisegment = bisegments[0]\n",
135+
"example_bisegment.dataset, example_bisegment.segment_info\n"
136+
]
137+
}
138+
],
139+
"metadata": {
140+
"kernelspec": {
141+
"display_name": "Python 3",
142+
"language": "python",
143+
"name": "python3"
144+
},
145+
"language_info": {
146+
"name": "python",
147+
"version": "3.12"
148+
}
149+
},
150+
"nbformat": 4,
151+
"nbformat_minor": 5
152+
}

pysatl_cpd/analysis/labeled_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from collections.abc import Collection, Iterator, Sequence
1111

12-
from pysatl_cpd.core.data_providers import DataProvider
12+
from pysatl_cpd.core.data_providers.idata_provider import DataProvider
1313

1414

1515
class LabeledData[T](DataProvider[T]):

pysatl_cpd/core/data_providers/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
__license__ = "SPDX-License-Identifier: MIT"
1313

1414

15+
from pysatl_cpd.core.data_providers.dataset import Annotation, Dataset, PandasLabeledDataProvider, SegmentInfo
1516
from pysatl_cpd.core.data_providers.idata_provider import DataProvider
1617
from pysatl_cpd.core.data_providers.numpy_data_provider import (
1718
NDArrayMultivariateProvider,
@@ -20,6 +21,10 @@
2021

2122
__all__ = [
2223
"DataProvider",
24+
"Annotation",
25+
"SegmentInfo",
26+
"PandasLabeledDataProvider",
27+
"Dataset",
2328
"NDArrayMultivariateProvider",
2429
"NDArrayUnivariateProvider",
2530
]

0 commit comments

Comments
 (0)