Skip to content

Commit 9b45a85

Browse files
Added the new sinapsis-data-analysis package and fixed bugs in the dataset splitter and image saver templates.
1 parent f3db837 commit 9b45a85

23 files changed

Lines changed: 1396 additions & 11 deletions

File tree

packages/sinapsis_data_readers/src/sinapsis_data_readers/templates/datasets_readers/dataset_splitter.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def execute(self, container: DataContainer) -> DataContainer:
129129
x_data, y_data = self.extract_x_y_from_packet(packet)
130130

131131
custom_dataset = self.store_data_in_data_splitter(x_data, y_data)
132-
container.set_generic_data(container, custom_dataset)
132+
self._set_generic_data(container, custom_dataset)
133133
return container
134134

135135

@@ -212,16 +212,16 @@ class AttributesBaseModel(DatasetSplitterBase.AttributesBaseModel):
212212
generic_data_target_key: str = "target" # labels
213213
generic_data_feature_key: str = "data" # arrays
214214

215-
def extract_x_y_from_packet(self, packets: list[Packet] | dict) -> tuple[ArrayDataFrameType, StringDataFrameType]:
215+
def extract_x_y_from_packet(self, packets: list[Packet] | dict) -> tuple[StringDataFrameType, ArrayDataFrameType]:
216216
packet = cast(dict, packets)
217217
dataframe: pd.DataFrame | None = packet.get(self.attributes.generic_data_extract_key, None)
218-
target: pd.DataFrame
219-
feature: pd.DataFrame
220-
if dataframe:
221-
target = dataframe.get(self.attributes.generic_dataset_target_key)
222-
feature = dataframe.get(self.attributes.generic_dataset_feature_key)
218+
target: pd.DataFrame = pd.DataFrame()
219+
feature: pd.DataFrame = pd.DataFrame()
220+
if isinstance(dataframe, pd.DataFrame):
221+
target = dataframe.get(self.attributes.generic_data_target_key)
222+
feature = dataframe.get(self.attributes.generic_data_feature_key)
223223

224-
return target, feature
224+
return feature, target
225225

226226
@staticmethod
227227
def return_data_splitter_object(

packages/sinapsis_data_writers/src/sinapsis_data_writers/templates/image_writers/image_saver.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,16 +101,16 @@ def save_image(self, img_destination: Path, image_packet: ImagePacket) -> str:
101101
img_destination = img_destination.with_suffix(f".{self.attributes.extension}")
102102

103103
path_to_save = str(img_destination)
104-
if image_packet.content is not None and image_packet.content.size > 0: # Check if image is valid
105-
if image_packet.color_space != ImageColor.GRAY:
104+
if image_packet.content is not None and image_packet.content.size > 0:
105+
if image_packet.color_space is not None and image_packet.color_space != ImageColor.GRAY:
106106
image_packet = convert_color_space(image_packet, ImageColor.BGR)
107107
cv2.imwrite(str(img_destination.absolute()), image_packet.content)
108108
self.logger.debug(f"Saved image to: {img_destination.absolute()}")
109109
return path_to_save
110110
else:
111111
self.logger.warning(f"Attempted to save an invalid image: {img_destination}")
112112
return ""
113-
except (FileNotFoundError, PermissionError, OSError) as e:
113+
except OSError as e:
114114
self.logger.error(f"File system error while saving image to {img_destination}: {e}")
115115
return ""
116116

sinapsis_data_analysis/README.md

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
<h1 align="center">
2+
<br>
3+
<a href="https://sinapsis.tech/">
4+
<img
5+
src="https://github.com/Sinapsis-AI/brand-resources/blob/main/sinapsis_logo/4x/logo.png?raw=true"
6+
alt="" width="300">
7+
</a><br>
8+
Sinapsis Data Analysis
9+
<br>
10+
</h1>
11+
12+
<h4 align="center">Module for machine learning model training, analysis, and inference, using the Scikit-learn and XGBoost libraries.</h4>
13+
14+
<p align="center">
15+
<a href="#installation">🐍 Installation</a> •
16+
<a href="#features"> 🚀 Features</a> •
17+
<a href="#example"> 📚 Usage Example</a> •
18+
<a href="#documentation">📙 Documentation</a> •
19+
<a href="#license"> 🔍 License </a>
20+
</p>
21+
22+
**Sinapsis Data Analysis** provides a comprehensive set of tools for machine learning model training, evaluation, and inference using industry-standard libraries like scikit-learn and XGBoost.
23+
24+
<h2 id="installation"> 🐍 Installation </h2>
25+
26+
Install using your package manager of choice. We encourage the use of <code>uv</code>.
27+
28+
Example with <code>uv</code>:
29+
30+
```bash
31+
uv pip install sinapsis-data-analysis --extra-index-url https://pypi.sinapsis.tech
32+
```
33+
or with raw <code>pip</code>:
34+
```bash
35+
pip install sinapsis-data-analysis --extra-index-url https://pypi.sinapsis.tech
36+
```
37+
38+
39+
<h2 id="features">🚀 Features</h2>
40+
41+
<h3> Templates Supported</h3>
42+
43+
**Sinapsis Data Analysis** provides a variety of templates for machine learning workflows:
44+
45+
<details>
46+
<summary><strong><span style="font-size: 1.25em;">Scikit-Learn Models</span></strong></summary>
47+
48+
The following model types are supported:
49+
50+
- **Linear Models**: LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression, etc.
51+
- **Neighbors Models**: KNeighborsClassifier, KNeighborsRegressor, RadiusNeighborsClassifier, etc.
52+
- **Neural Network Models**: MLPClassifier, MLPRegressor, BernoulliRBM
53+
- **Tree Models**: DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, etc.
54+
55+
Each template uses the same base attributes:
56+
- **`generic_field_key` (str, required)**: Key of the generic field where datasets are stored
57+
- **`model_save_path` (str, required)**: Path where the trained model will be saved
58+
</details>
59+
60+
<details>
61+
<summary><strong><span style="font-size: 1.25em;">XGBoost Models</span></strong></summary>
62+
63+
XGBoost model templates include:
64+
- XGBClassifier
65+
- XGBRegressor
66+
- XGBRanker
67+
- XGBRFClassifier
68+
- XGBRFRegressor
69+
- Booster
70+
71+
Attributes are the same as those for Scikit-learn templates.
72+
</details>
73+
74+
<details>
75+
<summary><strong><span style="font-size: 1.25em;">Manifold Learning</span></strong></summary>
76+
77+
Templates for dimensionality reduction using scikit-learn's manifold learning techniques:
78+
79+
- **SKLearnManifold**: Base class for all manifold learning algorithms
80+
- **`generic_field_key` (str, required)**: Key of the generic field where the input data is stored
81+
82+
Specific algorithms include t-SNE, MDS, Isomap, LocallyLinearEmbedding, and more.
83+
</details>
84+
85+
<details>
86+
<summary><strong><span style="font-size: 1.25em;">Inference Templates</span></strong></summary>
87+
88+
Templates for using trained models to make predictions on new data:
89+
90+
- **SKLearnInference**: For inference with scikit-learn models
91+
- **XGBoostInference**: For inference with XGBoost models
92+
93+
To use these templates, set the **`model_path`** attribute to the path of the trained model.
94+
</details>
95+
96+
> [!TIP]
97+
> Use the CLI command ```sinapsis info --all-template-names``` to show a list of all the available Template names installed with Sinapsis Data Analysis.
98+
99+
> [!TIP]
100+
> Use CLI command ```sinapsis info --example-template-config TEMPLATE_NAME``` to produce an example Agent config for the Template specified in ***TEMPLATE_NAME***.
101+
102+
For example, for ***LinearRegression*** use ```sinapsis info --example-template-config LinearRegression``` to produce an example config.
103+
104+
<h2 id="example"> 📚 Usage Example </h2>
105+
Below is an example configuration for **Sinapsis Data Analysis** using LinearRegressionWrapper for regression.
106+
107+
<details>
108+
<summary><strong><span style="font-size: 1.25em;">Example config</span></strong></summary>
109+
110+
```yaml
111+
agent:
112+
name: sklearn_linear_models_agent
113+
description: agent to train a LinearRegression model from scikit-learn using the load_diabetes dataset
114+
115+
templates:
116+
- template_name: InputTemplate
117+
class_name: InputTemplate
118+
attributes: {}
119+
120+
- template_name: load_diabetesWrapper
121+
class_name: load_diabetesWrapper
122+
template_input: InputTemplate
123+
attributes:
124+
split_dataset: true
125+
train_size: 0.8
126+
load_diabetes:
127+
return_X_y: false
128+
as_frame: true
129+
130+
- template_name: LinearRegressionWrapper
131+
class_name: LinearRegressionWrapper
132+
template_input: load_diabetesWrapper
133+
attributes:
134+
generic_field_key: load_diabetesWrapper
135+
model_save_path: "artifacts/linear_regression.joblib"
136+
linearregression_init:
137+
fit_intercept: true
138+
copy_X: true
139+
n_jobs: null
140+
positive: false
141+
```
142+
</details>
143+
144+
To run the config, use the CLI:
145+
```bash
146+
sinapsis run name_of_config.yml
147+
```
148+
149+
<h2 id="documentation">📙 Documentation</h2>
150+
151+
Documentation for this and other sinapsis packages is available on the [sinapsis website](https://docs.sinapsis.tech/docs).
152+
153+
Tutorials for different projects within sinapsis are available at [sinapsis tutorials page](https://docs.sinapsis.tech/tutorials)
154+
155+
<h2 id="license">🔍 License</h2>
156+
157+
This project is licensed under the AGPLv3 license, which encourages open collaboration and sharing. For more details, please refer to the [LICENSE](LICENSE) file.
158+
159+
For commercial use, please refer to our [official Sinapsis website](https://sinapsis.tech) for information on obtaining a commercial license.
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
[project]
2+
name = "sinapsis-data-analysis"
3+
version = "0.1.0"
4+
description = "Templates to work with models for classification, regression and clustering with xgboost and sklearn."
5+
authors = [{ name = "SinapsisAI", email = "dev@sinapsis.tech" }]
6+
7+
readme = "README.md"
8+
license = { file = "LICENSE" }
9+
requires-python = ">=3.10"
10+
dependencies = [
11+
"scikit-learn>=1.6.1",
12+
"sinapsis>=0.1.1",
13+
"sinapsis-data-readers",
14+
"xgboost>=3.0.0",
15+
]
16+
17+
[project.optional-dependencies]
18+
19+
all = [
20+
]
21+
22+
23+
[tool.uv.sources]
24+
sinapsis-data-readers = { workspace = true }
25+
26+
[build-system]
27+
requires = ["setuptools"]
28+
build-backend = "setuptools.build_meta"
29+
30+
31+
[[tool.uv.index]]
32+
name = "sinapsis"
33+
url = "https://pypi.sinapsis.tech/"
34+
35+
36+
[project.urls]
37+
Homepage = "https://sinapsis.tech"
38+
Documentation = "https://docs.sinapsis.tech/docs"
39+
Tutorials = "https://docs.sinapsis.tech/tutorials"
40+
Repository = "https://github.com/Sinapsis-AI/sinapsis-data-tools.git"

sinapsis_data_analysis/src/sinapsis_data_analysis/__init__.py

Whitespace-only changes.
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
agent:
2+
name: sklearn_tree_models_agent
3+
description: agent to train a DecisionTreeClassifier using the load_wine dataset
4+
5+
templates:
6+
- template_name: InputTemplate
7+
class_name: InputTemplate
8+
attributes: {}
9+
10+
- template_name: load_wineWrapper
11+
class_name: load_wineWrapper
12+
template_input: InputTemplate
13+
attributes:
14+
split_dataset: true
15+
train_size: 0.8
16+
load_wine:
17+
return_X_y: false
18+
as_frame: true
19+
20+
- template_name: DecisionTreeClassifierWrapper
21+
class_name: DecisionTreeClassifierWrapper
22+
template_input: load_wineWrapper
23+
attributes:
24+
generic_field_key: load_wineWrapper
25+
model_save_path: "artifacts/decision_tree.joblib"
26+
decisiontreeclassifier_init:
27+
criterion: 'gini'
28+
splitter: 'best'
29+
max_depth: 5
30+
min_samples_split: 2
31+
min_samples_leaf: 1
32+
min_weight_fraction_leaf: 0.0
33+
max_features: null
34+
random_state: 42
35+
max_leaf_nodes: null
36+
min_impurity_decrease: 0.0
37+
class_weight: null
38+
ccp_alpha: 0.0
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
agent:
2+
name: sklearn_manifold_agent
3+
description: agent to train a TSNE from sklearn using the load_digits dataset
4+
5+
templates:
6+
- template_name: InputTemplate
7+
class_name: InputTemplate
8+
attributes: {}
9+
10+
- template_name: load_digitsWrapper
11+
class_name: load_digitsWrapper
12+
template_input: InputTemplate
13+
attributes:
14+
split_dataset: true
15+
train_size: 0.8
16+
load_digits:
17+
n_class: 10
18+
return_X_y: false
19+
as_frame: true
20+
21+
- template_name: TSNEWrapper
22+
class_name: TSNEWrapper
23+
template_input: load_digitsWrapper
24+
attributes:
25+
generic_field_key: load_digitsWrapper
26+
tsne_init:
27+
n_components: 2
28+
perplexity: 30.0
29+
early_exaggeration: 12.0
30+
learning_rate: 200.0
31+
n_iter: 1000
32+
n_iter_without_progress: 300
33+
min_grad_norm: 0.0000001
34+
metric: 'euclidean'
35+
init: 'random'
36+
random_state: 42
37+
method: 'barnes_hut'
38+
angle: 0.5
39+
n_jobs: null
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
agent:
2+
name: sklearn_nn_models_agent
3+
description: agent to train a MLPClassifier using the load_breast_cancer dataset
4+
5+
templates:
6+
- template_name: InputTemplate
7+
class_name: InputTemplate
8+
attributes: {}
9+
10+
- template_name: load_breast_cancerWrapper
11+
class_name: load_breast_cancerWrapper
12+
template_input: InputTemplate
13+
attributes:
14+
split_dataset: true
15+
train_size: 0.8
16+
load_breast_cancer:
17+
return_X_y: false
18+
as_frame: true
19+
20+
- template_name: MLPClassifierWrapper
21+
class_name: MLPClassifierWrapper
22+
template_input: load_breast_cancerWrapper
23+
attributes:
24+
generic_field_key: load_breast_cancerWrapper
25+
model_save_path: "artifacts/mlp_classifier.joblib"
26+
mlpclassifier_init:
27+
hidden_layer_sizes: [100, 50]
28+
activation: 'relu'
29+
solver: 'adam'
30+
alpha: 0.0001
31+
batch_size: 'auto'
32+
learning_rate: 'constant'
33+
learning_rate_init: 0.001
34+
max_iter: 200
35+
shuffle: true
36+
random_state: 42
37+
tol: 0.0001
38+
verbose: false
39+
warm_start: false
40+
momentum: 0.9
41+
nesterovs_momentum: true
42+
early_stopping: false
43+
validation_fraction: 0.1
44+
beta_1: 0.9
45+
beta_2: 0.999
46+
epsilon: 0.00000001
47+
n_iter_no_change: 10
48+
max_fun: 15000

0 commit comments

Comments
 (0)