0.2 version

dsmutin · dsmutin · commit c3d2de949446 · 2025-12-01T17:43:55.000+03:00
diff --git a/CITATION.cff b/CITATION.cff
@@ -1,7 +1,7 @@
 cff-version: 1.2.0
 message: "If you use this software, please cite it using the metadata from this file."
 title: "PROBEst"
-version: "0.1.4"
+version: "0.2.0"
 doi: "10.20944/preprints202511.2140.v1"
 date-released: "2025-11-01"
 authors:
diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ python setup.py install
 ## Dependencies
 ```bash
 conda install bioconda::primer3
-conda install bioconda::blast
+conda install bioconda::blast==2.16.0
 ```
 
 ### Validate installation
diff --git a/app/templates/index.html b/app/templates/index.html
@@ -14,7 +14,7 @@
                 <div class="logo-title">
                     <img src="{{ url_for('static', filename='ctlab_probest_white.png') }}" alt="PROBESt" style="width: 30vw;">
                     <div>
-                        <h1>PROBESt v0.1.4</h1>
+                        <h1>PROBESt v0.2.0</h1>
                         <p class="subtitle">ITMO probe generation and optimization tool</p>
                     </div>
                 </div>
diff --git a/pipeline.py b/pipeline.py
@@ -51,7 +51,7 @@ def merge_iter(iter: int):
     "/scripts/generator/"
 
 # 1. Initial set generation ----
-print("\n---- PROBESt v.0.1.4 ----\n")
+print("\n---- PROBESt v.0.2.0 ----\n")
 print("Arguments passed")
 
 # Create TMP
diff --git a/scripts/databases/generate_noisy_probes.py b/scripts/databases/generate_noisy_probes.py
@@ -183,10 +183,10 @@ def main():
     parser = argparse.ArgumentParser(description='Generate noisy probe data')
     parser.add_argument('--input', required=True, help='Input probeBase CSV file')
     parser.add_argument('--output', type=str, default='data/databases/open/probeBase_false.csv', help='Output noisy probeBase CSV file')
-    parser.add_argument('--mutation-number', type=int, default=5, help='Maximum number of mutations to apply to each probe')
-    parser.add_argument('--insertion-rate', type=float, default=0.01, help='Insertion mutation rate')
-    parser.add_argument('--deletion-rate', type=float, default=0.01, help='Deletion mutation rate')
-    parser.add_argument('--mutation-rate', type=float, default=0.1, help='SNP mutation rate')
+    parser.add_argument('--mutation-number', type=int, default=20, help='Maximum number of mutations to apply to each probe')
+    parser.add_argument('--insertion-rate', type=float, default=0.1, help='Insertion mutation rate')
+    parser.add_argument('--deletion-rate', type=float, default=0.1, help='Deletion mutation rate')
+    parser.add_argument('--mutation-rate', type=float, default=0.2, help='SNP mutation rate')
     parser.add_argument('--iterations', type=int, default=10, help='Number of iterations to generate noisy data')
     
     args = parser.parse_args()
diff --git a/scripts/generator/ML_filtration.py b/scripts/generator/ML_filtration.py
@@ -4,6 +4,36 @@
 from sklearn.model_selection import train_test_split
 from PROBESt.AI import LogisticRegressionModel, PerceptronModel, DeepNeuralNetworkModel
 from PROBESt.filtration import train_filtration_AI, validate_filtration_AI, apply_filtration_AI
+from PROBESt.AI import TorchClassifier
+from PROBESt.models_registry import (
+    GAILDiscriminator,
+    ResidualNet,
+    ShallowNet,
+    TabTransformer,
+    WideNet,
+)
+
+# Loss weight given to the positive class by every registered model.
+_POS_WEIGHT = 5
+
+
+def _wrap(architecture):
+    """Return a builder that turns a feature count into a TorchClassifier.
+
+    The builder instantiates ``architecture`` with that feature count and
+    wraps it with the shared positive-class weight.
+    """
+    return lambda n: TorchClassifier(architecture(n), weight_pos=_POS_WEIGHT)
+
+
+# Model name -> builder taking the number of input features.
+MODELS = {
+    "ShallowNet": _wrap(ShallowNet),
+    "WideNet": _wrap(WideNet),
+    "ResidualNet": _wrap(ResidualNet),
+    "GAIL": _wrap(GAILDiscriminator),
+    "TabTransformer": _wrap(TabTransformer),
+}
 
 def main():
     # Load data
diff --git a/setup.py b/setup.py
@@ -3,10 +3,10 @@
 
 setup(
     name='PROBESt',
-    version='0.1.4',
+    version='0.2.0',
     packages=find_packages(where='src'),
     package_dir={'': 'src'},
-    python_requires='>=3.10',
+    python_requires='>=3.12',
     author='CTLab',
     author_email='dvsmutin@itmo.ru',
     description='PROBESt: package for nucleotide probes generation',
@@ -17,7 +17,7 @@
         'Development Status :: 3 - Alpha',
         'Intended Audience :: Science/Research',
         'License :: OSI Approved :: MIT License',
-        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.12',
         'Topic :: Scientific/Engineering :: Bio-Informatics',
     ],
     entry_points={
diff --git a/src/PROBESt/AI.py b/src/PROBESt/AI.py
@@ -125,6 +125,89 @@ def __init__(self, input_size: int, dropout_rate: float = 0.3):
         
     def forward(self, x):
         return self.network(x)
+
+
+class TorchClassifier(BaseAIModel):
+    """Binary classifier built around an arbitrary ``nn.Module``.
+
+    The wrapped module must return raw logits (no final sigmoid): training
+    uses ``nn.BCEWithLogitsLoss`` and prediction applies ``torch.sigmoid``.
+    """
+
+    def __init__(self, model: nn.Module, learning_rate=0.001, weight_pos=1.0):
+        """Store the network and create its loss function and optimizer.
+
+        Args:
+            model: Network to train; its parameters are given to AdamW.
+            learning_rate: Learning rate for AdamW.
+            weight_pos: ``pos_weight`` of the loss, i.e. the multiplier on
+                the loss of positive examples.
+        """
+        super().__init__()
+        self.model = model
+        self.learning_rate = learning_rate
+
+        # float() keeps the weight a floating-point tensor when an int such
+        # as 5 is passed; torch.tensor([5]) would be int64.
+        pos_weight = torch.tensor([float(weight_pos)])
+        self.criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
+
+        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)
+
+    def train(self, X, y, epochs=100, batch_size=32):
+        """Fit the network with shuffled mini-batches.
+
+        Prints the loss summed over batches every 20th epoch.
+
+        Args:
+            X: Features; passed through ``self.preprocess_data`` first.
+            y: Labels exposing ``.values`` (presumably a pandas Series of
+                0/1 values; confirm against callers).
+            epochs: Number of passes over the data.
+            batch_size: Mini-batch size.
+        """
+        X_scaled = self.preprocess_data(X)
+        X_tensor = torch.FloatTensor(X_scaled)
+        y_tensor = torch.FloatTensor(y.values).reshape(-1, 1)
+
+        dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
+        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+        # predict/predict_proba switch the network to eval mode, so restore
+        # training mode (dropout, batch-norm updates) before fitting.
+        self.model.train()
+        for e in range(epochs):
+            total_loss = 0
+            for bx, by in loader:
+                self.optimizer.zero_grad()
+                logits = self.model(bx)
+                loss = self.criterion(logits, by)
+                loss.backward()
+                self.optimizer.step()
+                total_loss += loss.item()
+
+            if e % 20 == 0:
+                print(f"Epoch {e}: loss = {total_loss:.4f}")
+
+    def predict(self, X):
+        """Return 0/1 labels: 1 where the predicted probability exceeds 0.5."""
+        return (self.predict_proba(X) > 0.5).astype(int)
+
+    def predict_proba(self, X):
+        """Return the sigmoid of the network output for ``X`` as a NumPy array.
+
+        ``X`` is scaled with ``self.scaler.transform`` (presumably fitted by
+        ``preprocess_data`` during ``train``; confirm in ``BaseAIModel``).
+        """
+        X_scaled = self.scaler.transform(X)
+        X_tensor = torch.FloatTensor(X_scaled)
+        # Without eval(), dropout stays active and batch-norm normalizes with
+        # per-batch statistics, so predictions would depend on the batch.
+        self.model.eval()
+        with torch.no_grad():
+            preds = torch.sigmoid(self.model(X_tensor)).numpy()
+        return preds
+
 
 class DeepNeuralNetworkModel(BaseAIModel):
     def __init__(self, input_size: int, learning_rate: float = 0.001, dropout_rate: float = 0.3):
@@ -202,4 +247,4 @@ def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
         self.model.eval()
         with torch.no_grad():
             predictions = self.model(X_tensor)
-        return predictions.numpy() 
+        return predictions.numpy()
diff --git a/src/PROBESt/__init__.py b/src/PROBESt/__init__.py
@@ -7,4 +7,7 @@
 from . import merge
 from . import misc
 from . import args
-from . import bash_wrappers 
+from . import bash_wrappers 
+from . import models_registry
+from . import AI
+from . import filtration
diff --git a/src/PROBESt/models_registry.py b/src/PROBESt/models_registry.py
@@ -0,0 +1,124 @@
+"""Network architectures that ``TorchClassifier`` can wrap.
+
+Every model ends in a linear layer and returns raw logits (no sigmoid), as
+required by ``nn.BCEWithLogitsLoss``; apply ``torch.sigmoid`` to the output
+to obtain probabilities.
+"""
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+
+# ---------- Shallow small net ----------
+class ShallowNet(nn.Module):
+    """One hidden layer of 32 ReLU units followed by a single-logit output."""
+
+    def __init__(self, input_size):
+        super().__init__()
+        # No trailing nn.Sigmoid: TorchClassifier trains with
+        # BCEWithLogitsLoss and applies the sigmoid itself when predicting.
+        self.net = nn.Sequential(
+            nn.Linear(input_size, 32),
+            nn.ReLU(),
+            nn.Linear(32, 1),
+        )
+
+    def forward(self, x):
+        """Return one raw logit per input row."""
+        return self.net(x)
+
+
+# ---------- Wide fully-connected net ----------
+class WideNet(nn.Module):
+    """Two hidden layers of 256 ReLU units followed by a single-logit output."""
+
+    def __init__(self, input_size):
+        super().__init__()
+        # Logit output; see the module docstring for why there is no Sigmoid.
+        self.net = nn.Sequential(
+            nn.Linear(input_size, 256),
+            nn.ReLU(),
+            nn.Linear(256, 256),
+            nn.ReLU(),
+            nn.Linear(256, 1),
+        )
+
+    def forward(self, x):
+        """Return one raw logit per input row."""
+        return self.net(x)
+
+
+# ---------- Residual MLP ----------
+class ResidualBlock(nn.Module):
+    """Linear + BatchNorm1d layer with an identity skip connection."""
+
+    def __init__(self, width):
+        super().__init__()
+        self.fc = nn.Linear(width, width)
+        self.bn = nn.BatchNorm1d(width)
+
+    def forward(self, x):
+        """Return ``relu(bn(fc(x)) + x)``, which has the same shape as ``x``."""
+        return F.relu(self.bn(self.fc(x)) + x)
+
+
+class ResidualNet(nn.Module):
+    """Projection to 128 features, two residual blocks, single-logit output."""
+
+    def __init__(self, input_size):
+        super().__init__()
+        self.input_layer = nn.Linear(input_size, 128)
+        self.block1 = ResidualBlock(128)
+        self.block2 = ResidualBlock(128)
+        self.output = nn.Linear(128, 1)
+
+    def forward(self, x):
+        """Return one raw logit per input row."""
+        hidden = F.relu(self.input_layer(x))
+        hidden = self.block2(self.block1(hidden))
+        # Logit output; see the module docstring for why there is no sigmoid.
+        return self.output(hidden)
+
+
+# ---------- GAIL-style discriminator ----------
+class GAILDiscriminator(nn.Module):
+    """MLP with 256- and 128-unit LeakyReLU(0.2) layers and a single-logit output."""
+
+    def __init__(self, input_size):
+        super().__init__()
+        # Logit output; see the module docstring for why there is no Sigmoid.
+        self.net = nn.Sequential(
+            nn.Linear(input_size, 256),
+            nn.LeakyReLU(0.2),
+            nn.Linear(256, 128),
+            nn.LeakyReLU(0.2),
+            nn.Linear(128, 1),
+        )
+
+    def forward(self, x):
+        """Return one raw logit per input row."""
+        return self.net(x)
+
+
+# ---------- TabTransformer ----------
+class TabTransformer(nn.Module):
+    """Transformer encoder applied to a single 64-dimensional embedded token."""
+
+    def __init__(self, input_size, n_heads=4, depth=3):
+        super().__init__()
+        # d_model is fixed at 64, so n_heads must divide 64.
+        self.embedding = nn.Linear(input_size, 64)
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=64, nhead=n_heads, batch_first=True
+        )
+        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)
+        self.fc_out = nn.Linear(64, 1)
+
+    def forward(self, x):
+        """Return one raw logit per input row."""
+        # The whole feature vector becomes one token; for 2-D input of shape
+        # (batch, input_size) this gives (batch, 1, 64).
+        tokens = self.embedding(x).unsqueeze(1)
+        encoded = self.transformer(tokens)
+        # Logit output; see the module docstring for why there is no sigmoid.
+        return self.fc_out(encoded[:, 0])