diff --git a/hypogenic/algorithm/generation/__init__.py b/hypogenic/algorithm/generation/__init__.py
index 741988d..de1ef70 100644
--- a/hypogenic/algorithm/generation/__init__.py
+++ b/hypogenic/algorithm/generation/__init__.py
@@ -3,5 +3,5 @@
 generation_register = Register("generation")
 
 from .base import Generation
-from .default import DefaultGeneration
+from .default import DefaultGeneration, DefaultGenerationContinuous
 from . import utils
diff --git a/hypogenic/algorithm/generation/base.py b/hypogenic/algorithm/generation/base.py
index 0133762..998dd48 100644
--- a/hypogenic/algorithm/generation/base.py
+++ b/hypogenic/algorithm/generation/base.py
@@ -188,3 +188,92 @@ def make_hypotheses_bank(
             new_generated_hypotheses[hyp].set_example(ex)
 
         return new_generated_hypotheses
+
+'''
+    def make_hypotheses_bank_continuous(
+        self,
+        example_indices,
+        current_sample,
+        alpha,
+        hypotheses_list: List[str],
+        cache_seed=None,
+        max_concurrent=3,
+        **generate_kwargs
+    ):
+        """
+        not actually used, just in case
+        """
+        """
+        Based on hypotheses generated by the LM, create new hypotheses_bank.
+
+        Parameters:
+            example_indices: the indices of examples being used to generate hypotheses
+            current_sample: the current sample in data which the algorithm is on
+            num_hypotheses_generate: the number of hypotheses that we expect our repsonse to generate
+            hypotheses: a list of hypotheses generated by the LM
+            alpha: eploration constant in hypogenic reward funciton
+            cache_seed: If `None`, will not use cache, otherwise will use cache with corresponding seed number
+            max_concurrent: the maximum number of concurrent requests to make to the API
+
+        Returns:
+            new_generated_hypotheses: A dictionary containing all newly generated hypotheses. The keys are the hypotheses and the values are the Summary Information class
+        """
+        idx_hyp_pair = []
+        new_generated_hypotheses = {}
+
+        for hyp in hypotheses_list:
+            new_generated_hypotheses[hyp] = SummaryInformation(
+                hypothesis=hyp, acc=0, num_visits=0, reward=0, correct_examples=[]
+            )
+
+            for index in example_indices:
+                idx_hyp_pair.append((index, {hyp: new_generated_hypotheses[hyp]}))
+
+        # ----------------------------------------------------------------------
+        # We try to predict the ground truth labels
+        # ----------------------------------------------------------------------
+        preds, labels = self.inference_class.batched_predict(
+            self.train_data,
+            idx_hyp_pair,
+            cache_seed=cache_seed,
+            max_concurrent=max_concurrent,
+            **generate_kwargs
+        )
+        preds, labels = preds[::-1], labels[::-1]
+
+        # ----------------------------------------------------------------------
+        # Finding the accuracy and the correct examples for each hypothesis
+        # ----------------------------------------------------------------------
+        for hyp in hypotheses_list:
+            correct = 0
+            ex = []
+
+            loss = 0.0
+            for index in example_indices:
+                prediction, actual_label = preds.pop(-1), labels.pop(-1)
+                if isinstance(prediction, str):
+                    if prediction == "other":
+                        prediction = -1
+                    else:
+                        prediction = float(prediction)
+                if isinstance(actual_label, str):
+                    actual_label = float(actual_label)
+                loss += (prediction - actual_label) ** 2
+            loss = loss / len(example_indices)
+            acc = 1.0 / (1.0 + loss)
+
+            # print(f"acc = {acc} for hypothesis: {hyp}")
+
+            new_generated_hypotheses[hyp].set_accuracy(acc)
+            new_generated_hypotheses[hyp].set_num_visits(len(example_indices))
+
+            # hypogenic reward
+            reward = acc + alpha * math.sqrt(
+                math.log(current_sample) / len(example_indices)
+            )
+
+            new_generated_hypotheses[hyp].set_reward(reward)
+            new_generated_hypotheses[hyp].set_example(ex)
+
+        return new_generated_hypotheses
+'''
\ No newline at end of file
diff --git a/hypogenic/algorithm/generation/default.py b/hypogenic/algorithm/generation/default.py
index bc33a3c..6d47089 100644
--- a/hypogenic/algorithm/generation/default.py
+++ b/hypogenic/algorithm/generation/default.py
@@ -155,3 +155,247 @@ def batched_hypothesis_generation(
             max_concurrent=max_concurrent,
             **generate_kwargs,
         )
+
+@generation_register.register("default_continuous")
+class DefaultGenerationContinuous(Generation):
+    """
+    Add on extra functionality to the generation fucntion - we consider this
+    the "default" task.
+    """
+
+    def __init__(
+        self,
+        api,
+        # reward_a,
+        # reward_b,
+        prompt_class: BasePrompt,
+        inference_class: Inference,
+        task: BaseTask,
+    ):
+        """
+        Parameters:
+            api: the language model that you're using, which may or may not be local
+            prompt_class: let's us know how the prompt is going to look
+            inference_class: gives us a way to predict labels for accuracy sake
+            task: determines the goal to accomplish
+        """
+        super().__init__(api, prompt_class, inference_class, task)
+        # self.reward_a = reward_a
+        # self.reward_b = reward_b
+
+    # ------------------------------------------------------------------------ #
+    #                                                                          #
+    # ------------------------------------------------------------------------ #
+    # BATCH INITLALIZE HYPOTHESES                                              #
+    # ------------------------------------------------------------------------ #
+    #                                                                          #
+    # ------------------------------------------------------------------------ #
+    def batched_initialize_hypotheses(
+        self,
+        num_init,
+        init_batch_size,
+        init_hypotheses_per_batch,
+        cache_seed=None,
+        max_concurrent=3,
+        **generate_kwargs,
+    ):
+        """
+        Batches the process of making new hypotheses
+
+        Parameters:
+            num_init: the total amount of examples you want to use for initialize hypotheses
+            init_batch size: the number of examples that will be used to generate these hypotheses
+            init_hypotheses_per_batch: the amount of hypotheses that you want to generate per btach
+            cache_seed: If `None`, will not use cache, otherwise will use cache with corresponding seed number
+            max_concurrent: the maximum amount of concurrent calls to the API
+
+        Returns:
+            hypotheses_bank: A dictionary with keys as hypotheses and the values as the Summary Information class
+        """
+        # ----------------------------------------------------------------------
+        # Finding batch size and confirming that it works
+        # ----------------------------------------------------------------------
+        assert (
+            num_init % init_batch_size == 0
+        ), "Number of initial examples must be divisible by the batch size"
+        num_batches = num_init // init_batch_size
+        prompt_inputs = []
+
+        # ----------------------------------------------------------------------
+        # Making the batch of the responses that will be used in the batch generation
+        # ----------------------------------------------------------------------
+        for i in range(num_batches):
+            example_indices = list(
+                range(i * init_batch_size, (i + 1) * init_batch_size)
+            )
+            # TODO: need copy()?
+            example_bank = (
+                self.train_data.loc[list(example_indices)].copy().reset_index(drop=True)
+            )
+            prompt_inputs.append(
+                self.prompt_class.batched_generation(
+                    example_bank, init_hypotheses_per_batch
+                )
+            )
+        responses = self.api.batched_generate(
+            prompt_inputs,
+            cache_seed=cache_seed,
+            max_concurrent=max_concurrent,
+            **generate_kwargs,
+        )
+
+        hypotheses_list = list(
+            set(
+                [
+                    hyp
+                    for response in responses
+                    for hyp in extract_hypotheses(response, init_hypotheses_per_batch)
+                ]
+            )
+        )
+
+        return hypotheses_list
+    
+    def make_hypotheses_bank(
+        self,
+        example_indices,
+        current_sample,
+        alpha,
+        hypotheses_list: List[str],
+        cache_seed=None,
+        max_concurrent=3,
+        **generate_kwargs
+    ):
+        """
+        override make_hypotheses_bank for continuous value prediction
+        """
+        """
+        Based on hypotheses generated by the LM, create new hypotheses_bank.
+
+        Parameters:
+            example_indices: the indices of examples being used to generate hypotheses
+            current_sample: the current sample in data which the algorithm is on
+            num_hypotheses_generate: the number of hypotheses that we expect our repsonse to generate
+            hypotheses: a list of hypotheses generated by the LM
+            alpha: eploration constant in hypogenic reward funciton
+            cache_seed: If `None`, will not use cache, otherwise will use cache with corresponding seed number
+            max_concurrent: the maximum number of concurrent requests to make to the API
+
+        Returns:
+            new_generated_hypotheses: A dictionary containing all newly generated hypotheses. The keys are the hypotheses and the values are the Summary Information class
+        """
+        idx_hyp_pair = []
+        new_generated_hypotheses = {}
+
+        for hyp in hypotheses_list:
+            new_generated_hypotheses[hyp] = SummaryInformation(
+                hypothesis=hyp, acc=0, num_visits=0, reward=0, correct_examples=[]
+            )
+
+            for index in example_indices:
+                idx_hyp_pair.append((index, {hyp: new_generated_hypotheses[hyp]}))
+
+        # ----------------------------------------------------------------------
+        # We try to predict the ground truth labels
+        # ----------------------------------------------------------------------
+        preds, labels = self.inference_class.batched_predict(
+            self.train_data,
+            idx_hyp_pair,
+            cache_seed=cache_seed,
+            max_concurrent=max_concurrent,
+            **generate_kwargs
+        )
+        preds, labels = preds[::-1], labels[::-1]
+
+        # ----------------------------------------------------------------------
+        # Finding the accuracy and the correct examples for each hypothesis
+        # ----------------------------------------------------------------------
+
+        # ----------------------------------------------------------------------
+        # New reward = -MSE + exploration
+        # ----------------------------------------------------------------------
+
+        for hyp in hypotheses_list:
+            ex = []
+
+            acc = 0.0
+            for index in example_indices:
+                prediction, actual_label = preds.pop(-1), labels.pop(-1)
+                if isinstance(prediction, str):
+                    if prediction in ["other", "unknown"]:
+                        prediction = self.task.y_mu
+                    else:
+                        prediction = float(prediction)
+                if isinstance(actual_label, str):
+                    actual_label = float(actual_label)
+                # normalize pred and label
+                
+                prediction = (prediction - self.task.y_min) / (self.task.y_max - self.task.y_min)
+                actual_label = (actual_label - self.task.y_min) / (self.task.y_max - self.task.y_min)
+
+                acc += float(- (actual_label - prediction) ** 2)
+            acc = acc / len(example_indices)
+
+            print(f"acc = {acc} for hypothesis: {hyp}")
+
+            new_generated_hypotheses[hyp].set_accuracy(acc)
+            new_generated_hypotheses[hyp].set_num_visits(len(example_indices))
+
+            # hypogenic reward
+            reward = acc + alpha * math.sqrt(
+                math.log(current_sample) / len(example_indices)
+            )
+
+            new_generated_hypotheses[hyp].set_reward(reward)
+            new_generated_hypotheses[hyp].set_example(ex)
+
+        return new_generated_hypotheses
+
+
+    # ------------------------------------------------------------------------ #
+    #                                                                          #
+    # ------------------------------------------------------------------------ #
+    # BATCHED_HYPOTHESIS GENERATION                                            #
+    # ------------------------------------------------------------------------ #
+    #                                                                          #
+    # ------------------------------------------------------------------------ #
+    def batched_hypothesis_generation(
+        self,
+        example_ids,
+        current_sample,
+        num_hypotheses_generate: int,
+        alpha: float,
+        cache_seed=None,
+        max_concurrent=3,
+        **generate_kwargs,
+    ):
+        """
+        Generates new hypotheses for the given examples
+
+        Parameters:
+            example_ids: The ids of the examples for which hypotheses need to be generated
+            current_sample: the current sample in data which the algorithm is on
+            num_hypotheses_generate: the number of hypotheses that we expect our response to generate
+            alpha: eploration constant in hypogenic reward funciton
+            cache_seed: If `None`, will not use cache, otherwise will use cache with corresponding seed number
+            max_concurrent: The maximum number of concurrent requests
+
+        Returns:
+            hypotheses_bank: A dictionary with keys as hypotheses and the values as the Summary Information class
+        """
+        new_hypotheses = self.batched_hyp_list_generation(
+            example_ids,
+            num_hypotheses_generate,
+            cache_seed=cache_seed,
+            **generate_kwargs,
+        )
+
+        return self.make_hypotheses_bank(
+            example_ids,
+            current_sample,
+            alpha,
+            new_hypotheses,
+            cache_seed=cache_seed,
+            max_concurrent=max_concurrent,
+            **generate_kwargs,
+        )
diff --git a/hypogenic/algorithm/summary_information.py b/hypogenic/algorithm/summary_information.py
index 1b9e5bb..7f3ec38 100644
--- a/hypogenic/algorithm/summary_information.py
+++ b/hypogenic/algorithm/summary_information.py
@@ -60,12 +60,19 @@ def update_info_if_useful(self, current_example, alpha):
     def update_useful_examples(self, example, label):
         self.correct_examples.append((example, label))
 
-    # In the update function, if it got the ith smample wrong, adjust accuracy accordinly
+    # In the update function, if it got the ith sample wrong, adjust accuracy accordinly
     def update_info_if_not_useful(self, current_example, alpha):
         self.acc = (self.acc * self.num_visits) / (self.num_visits + 1)
         self.num_visits += 1
         self.update_reward(alpha, current_example)
 
+    def update_info_continuous(self, current_example, alpha, diff_value,):
+        self.acc = self.acc * self.num_visits
+        self.acc += float(- (diff_value ** 2))
+        self.num_visits += 1
+        self.acc = self.acc / self.num_visits
+        self.update_reward(alpha, current_example)
+
     def __reduce__(self):
         return (self.__class__, (-1, self.acc, self.reward, self.num_visits))
 
diff --git a/hypogenic/algorithm/update/__init__.py b/hypogenic/algorithm/update/__init__.py
index 0952ffa..45ed4b4 100644
--- a/hypogenic/algorithm/update/__init__.py
+++ b/hypogenic/algorithm/update/__init__.py
@@ -3,5 +3,5 @@
 update_register = Register("update")
 
 from .base import Update
-from .default import DefaultUpdate
+from .default import DefaultUpdate, DefaultUpdateContinuous
 from .sampling import SamplingUpdate
diff --git a/hypogenic/algorithm/update/default.py b/hypogenic/algorithm/update/default.py
index 7b11386..631614a 100644
--- a/hypogenic/algorithm/update/default.py
+++ b/hypogenic/algorithm/update/default.py
@@ -217,3 +217,220 @@ def update(
 
         # Our new bank
         return hypotheses_bank
+
+
+@update_register.register("default_continuous")
+class DefaultUpdateContinuous(Update):
+    """
+    DefaultUpdate uses ONE hypothesis to make a prediction on a new example.
+    """
+
+    def __init__(
+        self,
+        generation_class: Generation,
+        inference_class: Inference,
+        replace_class: Replace,
+        save_path: str,
+        file_name_template: str = "hypotheses_training_sample_${sample}_seed_${seed}_epoch_${epoch}.json",
+        sample_num_to_restart_from=-1,
+        num_init=25,
+        epoch_to_start_from=0,
+        num_wrong_scale=0.8,
+        k=-1,
+        alpha=5e-1,
+        update_batch_size=5,
+        num_hypotheses_to_update=5,
+        update_hypotheses_per_batch=5,
+        only_best_hypothesis=False,
+        save_every_n_examples=100,
+        wrong_value_threshold=0.5,
+        # reward_a=1,
+        # reward_b=0.0625,
+    ):
+        super().__init__(
+            generation_class,
+            inference_class,
+            replace_class,
+            save_path,
+            file_name_template,
+            sample_num_to_restart_from,
+            num_init,
+            epoch_to_start_from,
+            num_wrong_scale,
+            k,
+            alpha,
+            update_batch_size,
+            num_hypotheses_to_update,
+            update_hypotheses_per_batch,
+            only_best_hypothesis,
+            save_every_n_examples,
+        )
+        self.wrong_value_threshold = wrong_value_threshold
+        # self.reward_a = reward_a
+        # self.reward_b = reward_b
+
+    def update(
+        self,
+        hypotheses_bank: Dict[str, SummaryInformation],
+        current_epoch,
+        current_seed,
+        cache_seed=None,
+        max_concurrent=3,
+        **generate_kwargs,
+    ):
+        """
+        We update the hypothesis bank once we reach a certain amount of regret
+
+        Parameters:
+            hypotheses_bank: The hypothesis bank
+            current_epoch: The current epoch
+            current_seed: The current seed
+            cache_seed: If `None`, will not use cache, otherwise will use cache with corresponding seed number
+            max_concurrent: The maximum number of concurrent requests
+        """
+        logger = LoggerConfig.get_logger(logger_name)
+
+        # initialize variables
+        num_train_examples = len(self.train_data)
+        wrong_example_ids = set()
+
+        # ----------------------------------------------------------------------
+        # Figuring out starting samples
+        # ----------------------------------------------------------------------
+        # go through training examples
+        # When restarting from epoch > 0, no need to start at num_init
+        # When not restarting, then default sample_num_to_restart_from = -1. start with num_init.
+        # For multiple epochs restarts, there should always be a non-negative sample_num_to_restart_from
+        if self.sample_num_to_restart_from >= 0:
+            start_sample = self.sample_num_to_restart_from
+        else:
+            start_sample = self.num_init
+
+        # This is to check if we are running more epochs than the starting epoch, if so, start at sample 0
+        # basically, if we've completed the starting epoch, we want to start the next one
+        if current_epoch > self.epoch_to_start_from:
+            start_sample = 0
+
+        # ----------------------------------------------------------------------
+        # Creating the new hypotheses
+        # ----------------------------------------------------------------------
+        # from the start to the end
+        for i in range(start_sample, num_train_examples):
+            # the 'i' here is the sample we are testing each of the top hypotheses
+
+            current_sample = i + 1
+            logger.info(f"Training on example {i}")
+
+            # We need to get the best k for testing the strength of our hypothesis bank
+            top_k_hypotheses = sorted(
+                hypotheses_bank, key=lambda x: hypotheses_bank[x].reward, reverse=True
+            )[: self.k]
+
+            # We are at the regret that we need in order to generate a new hypothesis
+            if self.num_wrong_scale > 0:
+                num_wrong_to_add_bank = (
+                    len(top_k_hypotheses) * i / num_train_examples
+                ) * self.num_wrong_scale
+
+            # ------------------------------------------------------------------
+            # We need to see how good our hypothesis is, which we do by way of the inference class
+            # ------------------------------------------------------------------
+            num_wrong_hypotheses = 0
+            preds, labels = self.inference_class.batched_predict(
+                self.train_data,
+                [
+                    (i, {hypothesis: hypotheses_bank[hypothesis]})
+                    for hypothesis in top_k_hypotheses
+                ],
+                cache_seed=cache_seed,
+                max_concurrent=max_concurrent,
+                **generate_kwargs,
+            )
+
+            # Comparison of the label and prediction
+            for pred, label, hypothesis in zip(preds, labels, top_k_hypotheses):
+                if isinstance(pred, str):
+                    pred = float(pred)
+                if isinstance(label, str):
+                    label = float(label)
+                # normalize
+                pred = (pred - self.generation_class.task.y_mu) / (self.generation_class.task.y_max - self.generation_class.task.y_min)
+                label = (label - self.generation_class.task.y_mu) / (self.generation_class.task.y_max - self.generation_class.task.y_min)
+
+                if abs(pred - label) > self.wrong_value_threshold:
+                    num_wrong_hypotheses += 1
+                    hypotheses_bank[hypothesis].update_info_continuous(
+                        current_sample, self.alpha, abs(pred - label),
+                    )  # let the bank know it got one wrong
+                else:
+                    hypotheses_bank[hypothesis].update_info_continuous(
+                        current_sample, self.alpha, abs(pred - label),
+                    )  # let the bank know it got one right
+
+                    # keeping track of good examples as we do in generation
+                    hypotheses_bank[hypothesis].update_useful_examples(i, label)
+
+            # ------------------------------------------------------------------
+            # Generating a new hypothesis
+            # ------------------------------------------------------------------
+
+            # if we get enough wrong examples as determined by num_wrong_to_add_bank,
+            # we need to generate new hypotheses
+            if (
+                num_wrong_hypotheses >= num_wrong_to_add_bank
+                or len(top_k_hypotheses) == 0
+            ):
+
+                # We note it as a bad sample
+                wrong_example_ids.add(i)
+                if (
+                    len(wrong_example_ids)
+                    == self.update_batch_size * self.num_hypotheses_to_update
+                ):
+                    new_hyp_bank = {}
+
+                    # generate new hypotheses
+                    for j in range(self.num_hypotheses_to_update):
+                        # Go through poorly performing exmaples and generate hypotheses for them
+                        # TODO: batched?
+                        new_hypotheses = (
+                            self.generation_class.batched_hypothesis_generation(
+                                wrong_example_ids,
+                                current_sample,
+                                self.update_hypotheses_per_batch,
+                                self.alpha,
+                                cache_seed=cache_seed,
+                                max_concurrent=max_concurrent,
+                                **generate_kwargs,
+                            )
+                        )
+
+                        # If we only take the best performing hypothesis from the batch
+                        if self.only_best_hypothesis:
+                            best_hypothesis = max(
+                                new_hypotheses, key=lambda x: new_hypotheses[x].reward
+                            )
+                            new_hyp_bank.update(
+                                {best_hypothesis: new_hypotheses[best_hypothesis]}
+                            )
+                        else:
+                            new_hyp_bank.update(new_hypotheses)
+                    # reset wrong examples to be empty
+                    wrong_example_ids = set()
+
+                    # call replace class to update the bank
+                    hypotheses_bank = self.replace_class.replace(
+                        hypotheses_bank, new_hyp_bank
+                    )
+
+            # save hypotheses to json
+            if (i + 1) % self.save_every_n_examples == 0:
+                self.save_to_json(
+                    hypotheses_bank,
+                    sample=i + 1,
+                    seed=current_seed,
+                    epoch=current_epoch,
+                )
+
+        # Our new bank
+        return hypotheses_bank
\ No newline at end of file
diff --git a/hypogenic/tasks.py b/hypogenic/tasks.py
index 264600f..9bc0eda 100644
--- a/hypogenic/tasks.py
+++ b/hypogenic/tasks.py
@@ -22,6 +22,7 @@ def __init__(
         extract_label: Union[Callable[[str], str], None] = None,
         from_register: Union[Register, None] = None,
         use_ood: bool = False,
+        regression: bool = False,
     ):
         if from_register is None and extract_label is None:
             raise ValueError("Either from_register or extract_label should be provided")
@@ -47,7 +48,12 @@ def __init__(
             self.test_data_path = data["ood_data_path"]
             self.val_data_path = data["ood_data_path"]
 
-        # getting omrpt templates from yaml file
+        if regression:
+            self.y_min = float(data["y_min"])
+            self.y_max = float(data["y_max"])
+            self.y_mu = float(data["y_mu"])
+
+        # getting prompt templates from yaml file
         self.prompt_template = data["prompt_templates"]
 
         # task label
diff --git a/hypogenic/utils.py b/hypogenic/utils.py
index 5007373..8c4d202 100644
--- a/hypogenic/utils.py
+++ b/hypogenic/utils.py
@@ -54,6 +54,29 @@ def get_results(pred_list, label_list):
 
     return {"accuracy": accuracy, "f1": f1}
 
+def get_results_regression(pred_list, label_list):
+    """
+    Compute MSE for regression.
+    """
+    print('label_list: ', label_list)
+    print('pred_list: ', pred_list)
+
+    # Check if labels can be turned into floats
+    try:
+        label_list = [float(label) for label in label_list]
+    except:
+        raise ValueError("Labels cannot be turned into floats")
+    
+    # if a prediction cannot be turned into a float, then it is 'unknown' and then we use the mean of the predictions as the prediction
+    mean_pred = np.mean([float(pred) for pred in pred_list if pred != 'unknown'])
+    print('mean_pred: ', mean_pred)
+    pred_list = [mean_pred if pred == 'unknown' else float(pred) for pred in pred_list]
+    print('pred_list: ', pred_list)
+    
+    mse = np.mean((np.array(label_list) - np.array(pred_list))**2)
+
+    return {"mse": mse}
+
 def set_seed(seed):
     logger = LoggerConfig.get_logger(logger_name)
     logger.info(f"Setting seed to {seed}")
diff --git a/regression_pipeline.py b/regression_pipeline.py
new file mode 100644
index 0000000..cccd160
--- /dev/null
+++ b/regression_pipeline.py
@@ -0,0 +1,1236 @@
+import datetime
+import json
+import logging
+import os
+import argparse
+import pandas as pd
+
+from hypogenic.LLM_wrapper import (
+    GPTWrapper,
+    LLMWrapper,
+    LocalVllmWrapper,
+    llm_wrapper_register,
+)
+from hypogenic.extract_label import extract_label_register
+from hypogenic.utils import set_seed, get_results_regression
+from hypogenic.prompt import BasePrompt
+from hypogenic.tasks import BaseTask
+from hypogenic.logger_config import LoggerConfig
+from hypogenic.algorithm.summary_information import SummaryInformation
+
+from hypogenic.algorithm.update import (
+    DefaultUpdate,
+    DefaultUpdateContinuous,
+    update_register,
+)
+from hypogenic.algorithm.replace import DefaultReplace
+from hypogenic.algorithm.inference import (
+    DefaultInference,
+)
+from hypogenic.algorithm.generation import (
+    DefaultGeneration,
+    DefaultGenerationContinuous,
+    generation_register,
+)
+from hypogenic.algorithm.summary_information import SummaryInformation
+
+from hypothesis_agent.data_analysis_agent.generation import (
+    TestGeneration,
+    OnlyPaperGeneration,
+    ZeroShotGeneration,
+)
+from hypothesis_agent.data_analysis_agent.inference import MultiHypDefaultInference
+from hypothesis_agent.data_analysis_agent.update import TestUpdate
+from hypothesis_agent.literature_review_agent import LiteratureAgent
+from hypothesis_agent.literature_review_agent.literature_processor.extract_info import (
+    BaseExtractor,
+    WholeExtractor,
+)
+from hypothesis_agent.literature_review_agent.literature_processor.summarize import (
+    BaseSummarize,
+    LLMSummarize,
+)
+from hypothesis_agent.data_analysis_agent.utils import multiple_hypotheses_remove_repetition
+from hypothesis_agent.data_analysis_agent.prompt import TestPrompt
+
+
+from IO_prompting.prompt import IOPrompt
+from IO_prompting.update import IOUpdate
+from IO_prompting.generation import IOGeneration
+
+
+parser = argparse.ArgumentParser()
+
+# Required base arguments
+parser.add_argument("--model_type", type=str, required=True)
+parser.add_argument("--model_name", type=str, required=True)
+parser.add_argument("--task_name", type=str, required=True)
+# This is needed for local models
+parser.add_argument("--model_path", type=str)
+parser.add_argument("--do_train", action="store_true", default=False)
+
+# Method toggle arguments
+parser.add_argument("--run_zero_shot", action="store_true", help="Run zero-shot baseline")
+parser.add_argument("--run_few_shot", action="store_true", help="Run few-shot baseline")
+parser.add_argument("--run_zero_shot_gen", action="store_true", help="Run zero-shot generation")
+parser.add_argument("--run_only_paper", action="store_true", help="Run literature-only")
+parser.add_argument("--run_hyperwrite", action="store_true", help="Run HyperWrite")
+parser.add_argument("--run_notebooklm", action="store_true", help="Run NotebookLM")
+parser.add_argument("--run_hypogenic", action="store_true", help="Run original HypoGeniC")
+parser.add_argument("--run_hyporefine", action="store_true", help="Run HypoRefine")
+parser.add_argument("--run_union_hypo", action="store_true", help="Run Union HypoGeniC and Paper")
+parser.add_argument("--run_union_refine", action="store_true", help="Run Union HypoRefine and Paper")
+parser.add_argument("--run_cross_model", action="store_true", help="Run cross-model evaluation")
+parser.add_argument("--run_io_refine", action="store_true", help="Run IO iterative refinement")
+
+# All algorithm-related arguments
+parser.add_argument("--cross_model_postfix", type=str, default="hypogenic_and_paper")
+parser.add_argument("--multihyp", action="store_true", default=True)
+parser.add_argument("--use_val", action="store_true", default=False)
+parser.add_argument("--max_num_hypotheses", type=int, default=10)
+parser.add_argument("--num_init", type=int, default=10)
+parser.add_argument("--num_train", type=int, default=20)
+parser.add_argument("--num_test", type=int, default=20)
+parser.add_argument("--num_val", type=int, default=200)
+parser.add_argument("--k", type=int, default=10)
+parser.add_argument("--alpha", type=float, default=5e-1)
+parser.add_argument("--update_batch_size", type=int, default=10)
+parser.add_argument("--num_hypotheses_to_update", type=int, default=1)
+parser.add_argument("--update_hypotheses_per_batch", type=int, default=10)
+parser.add_argument("--save_every_10_examples", type=int, default=10)
+parser.add_argument("--init_batch_size", type=int, default=10)
+parser.add_argument("--init_hypotheses_per_batch", type=int, default=10)
+parser.add_argument("--cache_seed", type=int, default=None)
+parser.add_argument("--temperature", type=float, default=1e-5)
+parser.add_argument("--max_tokens", type=int, default=8000)
+parser.add_argument("--use_refine", action="store_true", default=False)
+parser.add_argument("--max_refine", type=int, default=6)
+parser.add_argument("--seed", type=int, default=42)
+parser.add_argument("--use_ood", action="store_true", default=False, help="Use out-of-distribution data for testing")
+parser.add_argument("--regression", action="store_true", default=False, help="Use regression task")
+
+args = parser.parse_args()
+
+
+cross_model_postfix = args.cross_model_postfix
+multihyp = args.multihyp
+use_val = args.use_val
+use_ood = args.use_ood
+regression = args.regression
+max_num_hypotheses = args.max_num_hypotheses
+num_init = args.num_init
+num_train = args.num_train
+num_test = args.num_test
+num_val = args.num_val
+k = args.k
+alpha = args.alpha
+update_batch_size = args.update_batch_size
+num_hypotheses_to_update = args.num_hypotheses_to_update
+update_hypotheses_per_batch = args.update_hypotheses_per_batch
+save_every_10_examples = args.save_every_10_examples
+init_batch_size = args.init_batch_size
+init_hypotheses_per_batch = args.init_hypotheses_per_batch
+cache_seed = args.cache_seed
+temperature = args.temperature
+max_tokens = args.max_tokens
+use_refine = args.use_refine
+max_refine = args.max_refine
+seed = args.seed
+
+
+def zero_shot_hyp(task_name, api, model_name):
+    output_folder = (
+        f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_zero_shot/"
+    )
+    os.makedirs(output_folder, exist_ok=True)
+    set_seed(seed)
+
+    task = BaseTask(
+        config_path=f"./data/{task_name}/config.yaml",
+        from_register=extract_label_register,
+        use_ood=use_ood,
+        regression=regression
+    )
+
+    train_data, _, _ = task.get_data(num_train, num_test, num_val, seed=seed)
+
+    prompt_class = TestPrompt(task)
+    inference_class = DefaultInference(api, prompt_class, train_data, task)
+
+    generation_class = ZeroShotGeneration(
+        api=api,
+        prompt_class=prompt_class,
+        inference_class=inference_class,
+        task=task,
+    )
+
+    update_class = TestUpdate(
+        generation_class=generation_class,
+        inference_class=inference_class,
+        replace_class=DefaultReplace(max_num_hypotheses),
+        save_path=output_folder,
+        num_init=num_init,
+        k=k,
+        alpha=alpha,
+        update_batch_size=update_batch_size,
+        update_hypotheses_per_batch=update_hypotheses_per_batch,
+        num_hypotheses_to_update=num_hypotheses_to_update,
+        save_every_n_examples=save_every_10_examples,
+    )
+
+    hypotheses_bank = generation_class.initialize_hypotheses_0_shot(
+        num_hypotheses_generate=init_hypotheses_per_batch,
+        cache_seed=cache_seed,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        max_concurrent=64,
+    )
+    hypotheses_bank = {hyp: SummaryInformation() for hyp in hypotheses_bank}
+    update_class.save_to_json(hypotheses_bank, sample=0, seed=seed, epoch=0)
+
+
+def only_paper(task_name, api, model_name):
+    output_folder = (
+        f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_only_paper/"
+    )
+    os.makedirs(output_folder, exist_ok=True)
+    set_seed(seed)
+
+    task = BaseTask(
+        config_path=f"./data/{task_name}/config.yaml",
+        from_register=extract_label_register,
+        use_ood=use_ood,
+        regression=regression
+    )
+
+    train_data, _, _ = task.get_data(num_train, num_test, num_val, seed=seed)
+
+    prompt_class = TestPrompt(task)
+    inference_class = DefaultInference(api, prompt_class, train_data, task)
+    summarize_class = LLMSummarize(
+        extractor=WholeExtractor(), api=api, prompt_class=prompt_class
+    )
+
+    literature_agent = LiteratureAgent(
+        api=api,
+        prompt_class=prompt_class,
+        summizer=summarize_class,
+    )
+    literature_agent.summarize_papers(
+        data_file=f"./literature/{task_name}/processed",
+        cache_seed=cache_seed,
+        max_tokens=max_tokens,
+        temperature=temperature,
+    )
+    literature_agent.save_paper_infos(os.path.join(output_folder, "paper_infos.json"))
+
+    generation_class = OnlyPaperGeneration(
+        api=api,
+        prompt_class=prompt_class,
+        inference_class=inference_class,
+        task=task,
+        literature_agent=literature_agent,
+    )
+
+    update_class = TestUpdate(
+        generation_class=generation_class,
+        inference_class=inference_class,
+        replace_class=DefaultReplace(max_num_hypotheses),
+        save_path=output_folder,
+        num_init=num_init,
+        k=k,
+        alpha=alpha,
+        update_batch_size=update_batch_size,
+        num_hypotheses_to_update=num_hypotheses_to_update,
+        update_hypotheses_per_batch=update_hypotheses_per_batch,
+        save_every_n_examples=save_every_10_examples,
+    )
+
+    hypotheses_bank = generation_class.initialize_hypotheses_only_paper(
+        num_hypotheses_generate=init_hypotheses_per_batch,
+        cache_seed=cache_seed,
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+    hypotheses_bank = {hyp: SummaryInformation() for hyp in hypotheses_bank}
+    update_class.save_to_json(hypotheses_bank, sample=0, seed=seed, epoch=0)
+
+
+def with_paper(task_name, api, model_name):
+    output_folder = (
+        f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_with_paper/"
+    )
+    os.makedirs(output_folder, exist_ok=True)
+    set_seed(seed)
+
+    task = BaseTask(
+        config_path=f"./data/{task_name}/config.yaml",
+        from_register=extract_label_register,
+        use_ood=use_ood,
+        regression=regression
+    )
+
+    train_data, _, _ = task.get_data(num_train, num_test, num_val, seed=seed)
+
+    prompt_class = TestPrompt(task)
+    inference_class = DefaultInference(api, prompt_class, train_data, task)
+    summarize_class = LLMSummarize(
+        extractor=WholeExtractor(), api=api, prompt_class=prompt_class
+    )
+
+    literature_agent = LiteratureAgent(
+        api=api,
+        prompt_class=prompt_class,
+        summizer=summarize_class,
+    )
+    literature_agent.summarize_papers(
+        data_file=f"./literature/{task_name}/processed",
+        cache_seed=cache_seed,
+        max_tokens=max_tokens,
+        temperature=temperature,
+    )
+    literature_agent.save_paper_infos(os.path.join(output_folder, "paper_infos.json"))
+
+    generation_class = TestGeneration(
+        api=api,
+        prompt_class=prompt_class,
+        inference_class=inference_class,
+        task=task,
+        literature_agent=literature_agent,
+        max_refine=max_refine,
+    )
+
+    update_class = TestUpdate(
+        generation_class=generation_class,
+        inference_class=inference_class,
+        replace_class=DefaultReplace(max_num_hypotheses),
+        save_path=output_folder,
+        num_init=num_init,
+        k=k,
+        alpha=alpha,
+        update_batch_size=update_batch_size,
+        num_hypotheses_to_update=num_hypotheses_to_update,
+        update_hypotheses_per_batch=update_hypotheses_per_batch,
+        save_every_n_examples=save_every_10_examples,
+    )
+
+    hypotheses_bank = update_class.batched_initialize_hypotheses_with_paper(
+        num_init=num_init,
+        init_batch_size=init_batch_size,
+        init_hypotheses_per_batch=init_hypotheses_per_batch,
+        cache_seed=cache_seed,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        max_concurrent=64,
+    )
+    update_class.save_to_json(hypotheses_bank, sample=num_init, seed=seed, epoch=0)
+
+    for epoch in range(1):
+        hypotheses_bank = update_class.update(
+            current_epoch=epoch,
+            hypotheses_bank=hypotheses_bank,
+            current_seed=seed,
+            cache_seed=cache_seed,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            max_concurrent=64,
+        )
+        update_class.save_to_json(
+            hypotheses_bank,
+            sample="final",
+            seed=seed,
+            epoch=epoch,
+        )
+
+
+def original_hypogenic(task_name, api, model_name):
+    output_folder = f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}/"
+
+    os.makedirs(output_folder, exist_ok=True)
+
+    task = BaseTask(
+        config_path=f"./data/{task_name}/config.yaml",
+        from_register=extract_label_register,
+        use_ood=use_ood,
+        regression=regression
+    )
+
+    set_seed(seed)
+    train_data, _, _ = task.get_data(num_train, num_test, num_val, seed)
+    prompt_class = BasePrompt(task)
+    inference_class = DefaultInference(api, prompt_class, train_data, task)
+
+    class_build = "default"
+    if regression:
+        class_build = "default_continuous"
+    generation_class = generation_register.build(class_build)(api, prompt_class, inference_class, task)
+
+    update_class = update_register.build(class_build)(
+        generation_class=generation_class,
+        inference_class=inference_class,
+        replace_class=DefaultReplace(max_num_hypotheses),
+        save_path=output_folder,
+        num_init=num_init,
+        k=k,
+        alpha=alpha,
+        update_batch_size=update_batch_size,
+        num_hypotheses_to_update=num_hypotheses_to_update,
+        save_every_n_examples=save_every_10_examples,
+    )
+
+    hypotheses_bank = {}
+    hypotheses_bank = update_class.batched_initialize_hypotheses(
+        num_init,
+        init_batch_size=init_batch_size,
+        init_hypotheses_per_batch=init_hypotheses_per_batch,
+        cache_seed=cache_seed,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        max_concurrent=64,
+    )
+    update_class.save_to_json(
+        hypotheses_bank,
+        sample=num_init,
+        seed=seed,
+        epoch=0,
+    )
+    for epoch in range(1):
+        hypotheses_bank = update_class.update(
+            current_epoch=epoch,
+            hypotheses_bank=hypotheses_bank,
+            current_seed=seed,
+            cache_seed=cache_seed,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            max_concurrent=64,
+        )
+        update_class.save_to_json(
+            hypotheses_bank,
+            sample="final",
+            seed=seed,
+            epoch=epoch,
+        )
+
+
+def IO_iterative_refinement(task_name, api, model_name):
+    output_folder = f"./results/{task_name}/{model_name}/IO_refinement/"
+
+    os.makedirs(output_folder, exist_ok=True)
+
+    task = BaseTask(
+        config_path=f"./data/{task_name}/config.yaml",
+        from_register=extract_label_register,
+        use_ood=use_ood,
+        regression=regression
+    )
+
+    set_seed(seed)
+    train_data, _, _ = task.get_data(10, num_test, num_val, seed)
+    prompt_class = IOPrompt(task)
+    inference_class = DefaultInference(api, prompt_class, train_data, task)
+    generation_class = IOGeneration(api, prompt_class, inference_class, task)
+
+    update_class = IOUpdate(
+        generation_class=generation_class,
+        inference_class=inference_class,
+        replace_class=DefaultReplace(max_num_hypotheses),
+        save_path=output_folder,
+        num_init=10,
+        k=k,
+        alpha=alpha,
+        update_batch_size=update_batch_size,
+        num_hypotheses_to_update=num_hypotheses_to_update,
+        save_every_n_examples=save_every_10_examples,
+    )
+
+    hypotheses_bank = {}
+    # Use the Qiu et al. 2024 paper hyperparameters
+    hypotheses_bank = update_class.batched_initialize_hypotheses(
+        num_init=10,
+        init_batch_size=num_init,
+        init_hypotheses_per_batch=5,
+        cache_seed=cache_seed,
+        temperature=0.7,
+        max_tokens=max_tokens,
+        max_concurrent=64,
+    )
+    # only keep the hypothesis with the highest accuracy
+    # sorted_hypotheses = sorted(
+    #     hypotheses_bank, key=lambda x: hypotheses_bank[x].acc, reverse=True
+    # )
+    # hypotheses_bank = {
+    #     sorted_hypotheses[0]: hypotheses_bank[sorted_hypotheses[0]]
+    # }
+    update_class.save_to_json(
+        hypotheses_bank,
+        sample="init",
+        seed=seed,
+        epoch=0,
+    )
+    for epoch in range(3):
+        # if there exist a hypothesis with accuracy 1.0, stop the training
+        if any(hypotheses_bank[h].acc == 1.0 for h in hypotheses_bank):
+            update_class.save_to_json(
+                hypotheses_bank,
+                sample="final",
+                seed=seed,
+                epoch=2,
+            )
+            break
+        # Else, iteratively refine
+        hypotheses_bank = update_class.update(
+            current_epoch=epoch,
+            hypotheses_bank=hypotheses_bank,
+            current_seed=seed,
+            cache_seed=cache_seed,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            max_concurrent=64,
+        )
+        # sorted_hypotheses = sorted(
+        #     hypotheses_bank, key=lambda x: hypotheses_bank[x].acc, reverse=True
+        # )
+        # hypotheses_bank = {
+        #     sorted_hypotheses[0]: hypotheses_bank[sorted_hypotheses[0]]
+        # }
+        update_class.save_to_json(
+            hypotheses_bank,
+            sample="final",
+            seed=seed,
+            epoch=epoch,
+        )
+
+
+def union_hypotheses(task_name, api, model_name, use_refine=True, prioritize='balanced'):
+
+    union_postfix = "refine_and_paper" if use_refine else "hypogenic_and_paper"
+    output_folder = f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_{union_postfix}"
+
+    os.makedirs(output_folder, exist_ok=True)
+
+    os.makedirs(f"./results/{task_name}/{model_name}/dedup_data_only", exist_ok=True)
+    os.makedirs(f"./results/{task_name}/{model_name}/dedup_paper_only", exist_ok=True)
+
+    task = BaseTask(
+        config_path=f"./data/{task_name}/config.yaml",
+        from_register=extract_label_register,
+        use_ood=use_ood,
+        regression=regression
+    )
+
+    set_seed(seed)
+    prompt_class = TestPrompt(task)
+
+    if use_refine:
+        data_hyp_file =  f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_with_paper/hypotheses_training_sample_final_seed_{seed}_epoch_0.json"
+    else:
+        data_hyp_file =  f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json"
+
+    with open(data_hyp_file) as f:
+        hyp_dict = json.load(f)
+    data_hyp_bank = {}
+    for hypothesis in hyp_dict:
+        data_hyp_bank[hypothesis] = SummaryInformation.from_dict(hyp_dict[hypothesis])
+
+    paper_hyp_file = f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_only_paper/hypotheses_training_sample_0_seed_{seed}_epoch_0.json"
+    
+    with open(paper_hyp_file) as f:
+        hyp_dict = json.load(f)
+    paper_hyp_bank = {}
+    for hypothesis in hyp_dict:
+        paper_hyp_bank[hypothesis] = SummaryInformation.from_dict(hyp_dict[hypothesis])
+
+
+    unique_data_hyp_bank = multiple_hypotheses_remove_repetition(
+        prompt_class,
+        api,
+        data_hyp_bank,
+        cache_seed,
+        max_concurrent=64,
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+    unique_paper_hyp_bank = multiple_hypotheses_remove_repetition(
+        prompt_class,
+        api,
+        paper_hyp_bank,
+        cache_seed,
+        max_concurrent=64,
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+    data_dump_dict = {}
+    for hyp in unique_data_hyp_bank:
+        data_dump_dict[hyp] = {"hypothesis": hyp, "acc": unique_data_hyp_bank[hyp].acc}
+
+    paper_dump_dict = {}
+    for hyp in unique_paper_hyp_bank:
+        paper_dump_dict[hyp] = {"hypothesis": hyp, "acc": unique_paper_hyp_bank[hyp].acc}
+
+    with open(f"./results/{task_name}/{model_name}/dedup_data_only/hypotheses_training_sample_final_seed_{seed}_epoch_0.json", 'w') as file:
+        json.dump(data_dump_dict, file)
+
+    with open(f"./results/{task_name}/{model_name}/dedup_paper_only/hypotheses_training_sample_final_seed_{seed}_epoch_0.json", 'w') as file:
+        json.dump(paper_dump_dict, file)
+
+    unique_data_hyp_list = sorted(unique_data_hyp_bank, key=lambda x: unique_data_hyp_bank[x].acc, reverse=True)
+    unique_paper_hyp_list = sorted(unique_paper_hyp_bank, key=lambda x: unique_paper_hyp_bank[x].acc, reverse=True)
+    union_hyp_bank = {}
+
+    if prioritize == "data":
+        while len(union_hyp_bank) < max_num_hypotheses and len(unique_data_hyp_list) > 0:
+            hyp = unique_data_hyp_list.pop(0)
+            union_hyp_bank[hyp] = unique_data_hyp_bank[hyp]
+        print("num from data: ", len(union_hyp_bank))
+        while len(union_hyp_bank) < max_num_hypotheses and len(unique_paper_hyp_list) > 0:
+            hyp = unique_paper_hyp_list.pop(0)
+            union_hyp_bank[hyp] = unique_paper_hyp_bank[hyp]
+    elif prioritize == "paper":
+        while len(union_hyp_bank) < max_num_hypotheses and len(unique_paper_hyp_list) > 0:
+            hyp = unique_paper_hyp_list.pop(0)
+            union_hyp_bank[hyp] = unique_paper_hyp_bank[hyp]
+        print("num from paper: ", len(union_hyp_bank))
+        while len(union_hyp_bank) < max_num_hypotheses and len(unique_data_hyp_list) > 0:
+            hyp = unique_data_hyp_list.pop(0)
+            union_hyp_bank[hyp] = unique_data_hyp_bank[hyp]
+    else:
+        while len(union_hyp_bank) < max_num_hypotheses // 2 and len(unique_data_hyp_list) > 0:
+            hyp = unique_data_hyp_list.pop(0)
+            union_hyp_bank[hyp] = unique_data_hyp_bank[hyp]
+        print("num from data: ", len(union_hyp_bank))
+        while len(union_hyp_bank) < max_num_hypotheses and len(unique_paper_hyp_list) > 0:
+            hyp = unique_paper_hyp_list.pop(0)
+            union_hyp_bank[hyp] = unique_paper_hyp_bank[hyp]
+    
+    union_dump_dict = {}
+    for hyp in union_hyp_bank:
+        union_dump_dict[hyp] = {"hypothesis": hyp, "acc": union_hyp_bank[hyp].acc}
+
+    with open(f'{output_folder}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json', 'w') as file:
+        json.dump(union_dump_dict, file)
+
+
+def get_res(filename: str, task_name, api, model_name, use_val=False, multihyp=False):
+    logger = LoggerConfig.get_logger("Agent - get_res")
+
+    set_seed(seed)
+
+    task = BaseTask(
+        config_path=f"./data/{task_name}/config.yaml",
+        from_register=extract_label_register,
+        use_ood=use_ood,
+        regression=regression
+    )
+
+    train_data, test_data, val_data = task.get_data(num_train, num_test, num_val, seed)
+    if use_val:
+        test_data = val_data
+
+    prompt_class = TestPrompt(task)
+    
+
+    with open(filename) as f:
+        hyp_dict = json.load(f)
+    hyp_bank = {}
+    for hypothesis in hyp_dict:
+        hyp_bank[hypothesis] = SummaryInformation.from_dict(hyp_dict[hypothesis])
+
+
+    if multihyp:
+        inference_class = MultiHypDefaultInference(api, prompt_class, train_data, task)
+        pred_list, label_list = inference_class.run_inference_final(
+                test_data,
+                hyp_bank,
+                cache_seed=cache_seed,
+                max_concurrent=64,
+                temperature=temperature,
+                max_tokens=max_tokens,
+            )
+        
+        results_dict = get_results_regression(pred_list, label_list)
+        # f1 = results_dict["f1"]
+        # acc = results_dict["accuracy"]
+        mse = results_dict["mse"]
+        logger_str = "Results:\n"
+        # logger_str += f"Accuracy: {acc}\n"
+        # logger_str += f"F1: {f1}\n\n"
+        logger_str += f"MSE: {mse}\n\n"
+        logger.info(logger_str)
+        return results_dict  # Return results dictionary
+    else:
+        inference_class = DefaultInference(api, prompt_class, train_data, task)
+
+        hyp_list = []
+
+        for idx, hyp in enumerate(hyp_bank):
+            pred_list, label_list = inference_class.run_inference_final(
+                test_data,
+                {hyp: hyp_bank[hyp]},
+                cache_seed=cache_seed,
+                max_concurrent=64,
+                temperature=temperature,
+                max_tokens=max_tokens,
+            )
+            results_dict = get_results_regression(pred_list, label_list)
+            # hyp_list.append((hyp, results_dict["accuracy"], results_dict["f1"]))
+            hyp_list.append((hyp, results_dict["mse"]))
+
+        # hyp_list = sorted(hyp_list, key=lambda x: x[2], reverse=True)
+        hyp_list = sorted(hyp_list, key=lambda x: x[1])
+        logger_str = "Results:\n"
+        # for idx, (hyp, acc, f1) in enumerate(hyp_list):
+        #     logger_str += f"{idx + 1}. {hyp}\n"
+        #     logger_str += f"Train Accuracy: {hyp_bank[hyp].acc}\n"
+        #     logger_str += f"Test Accuracy: {acc}\n"
+        #     logger_str += f"Test F1: {f1}\n\n"
+        for idx, (hyp, mse) in enumerate(hyp_list):
+            logger_str += f"{idx + 1}. {hyp}\n"
+            logger_str += f"Train Exploitation (=-MSE): {hyp_bank[hyp].acc}\n"
+            logger_str += f"Test MSE: {mse}\n\n"
+        logger.info(logger_str)
+
+        # Format results in a more structured way
+        # formatted_results = {
+        #     "hypotheses": [
+        #         {
+        #             "hypothesis": hyp,
+        #             "train_accuracy": hyp_bank[hyp].acc,
+        #             "test_accuracy": acc,
+        #             "test_f1": f1
+        #         } for hyp, acc, f1 in hyp_list
+        #     ],
+        #     "best": {
+        #         "hypothesis": hyp_list[0][0],
+        #         "test_accuracy": hyp_list[0][1],
+        #         "test_f1": hyp_list[0][2]
+        #     } if hyp_list else {}
+        # }
+        formatted_results = {
+            "hypotheses": [
+                {
+                    "hypothesis": hyp,
+                    "train_exploitation": hyp_bank[hyp].acc,
+                    "test_mse": mse
+                } for hyp, mse in hyp_list
+            ],
+            "best": {
+                "hypothesis": hyp_list[0][0],
+                "test_mse": hyp_list[0][1]
+            } if hyp_list else {}
+        }
+        return formatted_results
+
+
+def save_method_results(method_name, results, task_name, model_name, seed=42, timestamp=None, use_ood=False):
+    """Save results for a specific evaluation method to a JSON file."""
+    # Create results directory if it doesn't exist
+    results_dir = f"./results/{task_name}/{model_name}/evaluation_results"
+    os.makedirs(results_dir, exist_ok=True)
+    
+    # Add OOD suffix if using out-of-distribution data
+    data_type = "OOD" if use_ood else "IND"
+    filename = f"{results_dir}/{method_name}_{data_type}_seed_{seed}.json"
+    current_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+    
+    result_data = {
+        "method": method_name,
+        "task": task_name,
+        "model": model_name,
+        "data_type": data_type,
+        "seed": seed,
+        "timestamp": current_timestamp,
+        "results": results,
+        "regression": True # Hardcoded for now
+    }
+    
+    with open(filename, 'w') as f:
+        json.dump(result_data, f, indent=2)
+    
+    return filename
+
+
+def combine_results(task_name, model_name, methods_run=None, seed=42, use_ood=False):
+    """Combine all evaluation results into a single summary file."""
+    results_dir = f"./results/{task_name}/{model_name}/evaluation_results"
+    if not os.path.exists(results_dir):
+        return None
+    
+    data_type = "OOD" if use_ood else "IND"
+    result_files = [f for f in os.listdir(results_dir) if f.endswith('.json') and data_type in f]
+    
+    if methods_run is not None:
+        method_files = []
+        for method in methods_run:
+            method_file = f"{method}_{data_type}_seed_{seed}.json"
+            if method_file in result_files:
+                method_files.append(method_file)
+        result_files = method_files
+    else:
+        # Filter by seed and data type
+        result_files = [f for f in result_files if f'seed_{seed}' in f and data_type in f]
+    
+    if not result_files:
+        return None
+    
+    # Combine results
+    combined_results = {
+        "task": task_name,
+        "model": model_name,
+        "data_type": data_type,
+        "seed": seed,
+        "timestamp": datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
+        "methods": {},
+        "regression": True # Hardcoded for now
+    }
+    
+    for filename in result_files:
+        with open(os.path.join(results_dir, filename), 'r') as f:
+            data = json.load(f)
+            method_name = data["method"]
+            combined_results["methods"][method_name] = data["results"]
+    
+    # Save combined results
+    combined_file = f"./results/{task_name}/{model_name}/combined_results_{data_type}_seed_{seed}.json"
+    with open(combined_file, 'w') as f:
+        json.dump(combined_results, f, indent=2)
+    
+    return combined_file
+
+
+def log_arguments(logger, args):
+    """Log all configuration parameters in an organized way."""
+    sections = {
+        "Run Configuration": [
+            ("Model Type", args.model_type),
+            ("Model Name", args.model_name),
+            ("Model Path", args.model_path),
+            ("Task Name", args.task_name),
+            ("Do Train", args.do_train),
+            ("Use OOD", args.use_ood),
+            ("Regression", args.regression)
+        ],
+        "Method Configuration": [
+            ("Run Zero Shot", args.run_zero_shot),
+            ("Run Few Shot", args.run_few_shot),
+            ("Run Zero Shot Gen", args.run_zero_shot_gen),
+            ("Run Only Paper", args.run_only_paper),
+            ("Run HyperWrite", args.run_hyperwrite),
+            ("Run NotebookLM", args.run_notebooklm),
+            ("Run HypoGeniC", args.run_hypogenic),
+            ("Run HypoRefine", args.run_hyporefine),
+            ("Run Union HypoGeniC", args.run_union_hypo),
+            ("Run Union HypoRefine", args.run_union_refine),
+            ("Run Cross Model", args.run_cross_model),
+            ("Run IO Refine", args.run_io_refine)
+        ],
+        "Algorithm Configuration": [
+            ("Cross Model Postfix", cross_model_postfix),
+            ("Multi Hypothesis", multihyp),
+            ("Use Validation", use_val),
+            ("Max Num Hypotheses", max_num_hypotheses),
+            ("Num Init", num_init),
+            ("Num Train", num_train),
+            ("Num Test", num_test),
+            ("Num Val", num_val),
+            ("K", k),
+            ("Alpha", alpha),
+            ("Update Batch Size", update_batch_size),
+            ("Num Hypotheses to Update", num_hypotheses_to_update),
+            ("Update Hypotheses Per Batch", update_hypotheses_per_batch),
+            ("Save Every N Examples", save_every_10_examples),
+            ("Init Batch Size", init_batch_size),
+            ("Init Hypotheses Per Batch", init_hypotheses_per_batch),
+            ("Cache Seed", cache_seed),
+            ("Temperature", temperature),
+            ("Max Tokens", max_tokens),
+            ("Use Refine", use_refine),
+            ("Max Refine", max_refine),
+            ("Seed", seed)
+        ]
+    }
+    
+    for section, params in sections.items():
+        logger.info(f"\n=== {section} ===")
+        for name, value in params:
+            logger.info(f"{name}: {value}")
+    logger.info("=====================\n")
+
+
+def baseline(few_shot_k, task_name, api, model_name, seed=42, use_val=False):
+    def few_shot(
+        api: LLMWrapper,
+        train_data,
+        test_data,
+        prompt_class: BasePrompt,
+        task,
+        few_shot_k,
+        cache_seed,
+    ):
+        """
+        Given one hyothesis and a dataset, return the accuracy of the hypothesis on the dataset.
+        """
+        results = []
+        prompt_inputs = [
+            prompt_class.few_shot_baseline(
+                train_data.reset_index(drop=True), few_shot_k, test_data, i
+            )
+            for i in range(len(test_data))
+        ]
+
+        # print(
+        #     yaml.dump(
+        #         prompt_inputs[0],
+        #         default_flow_style=False,
+        #         sort_keys=False,
+        #         allow_unicode=True,
+        #         Dumper=yaml.SafeDumper,
+        #     )
+        # )
+
+        responses = api.batched_generate(
+            prompt_inputs, cache_seed=cache_seed, max_concurrent=64, max_tokens=4000
+        )
+        for i in range(len(test_data)):
+            pred = task.extract_label(responses[i])
+            label = test_data[task.label_name][i]
+
+            results.append(
+                {
+                    "prompt": prompt_inputs[i],
+                    "response": responses[i],
+                    "label": label,
+                    "pred": pred,
+                }
+            )
+
+        return results
+
+    def preprocess(train_data, k):
+        data = []
+
+        label_nunique = train_data[task.label_name].nunique()
+        label_unique = train_data[task.label_name].unique()
+        for i in range(k):
+            data.append(
+                train_data[train_data[task.label_name] == label_unique[i % label_nunique]].iloc[
+                    i // label_nunique
+                ]
+            )
+
+        return pd.DataFrame(data)
+
+    set_seed(seed)
+
+    task = BaseTask(
+        config_path=f"./data/{task_name}/config.yaml",
+        from_register=extract_label_register,
+        use_ood=use_ood,
+        regression=regression
+    )
+    prompt_class = BasePrompt(task)
+    results_list = []
+    train_data, test_data, val_data = task.get_data(num_train, num_test, num_val, seed)
+    if use_val:
+        test_data = val_data
+
+    if few_shot_k > 0:
+        train_data = preprocess(train_data, few_shot_k)
+
+    results = few_shot(
+        api, train_data, test_data, prompt_class, task, few_shot_k, cache_seed
+    )
+
+    labels = [result["label"] for result in results]
+    preds = [result["pred"] for result in results]
+
+    results_dict = get_results_regression(preds, labels)
+    # results_list.append((None, results_dict["accuracy"], results_dict["f1"]))
+    results_list.append((None, results_dict["mse"]))
+    
+    # Format results in a structured way
+    formatted_results = {
+        "few_shot_k": few_shot_k,
+        # "accuracy": results_dict["accuracy"],
+        # "f1": results_dict["f1"],
+        # "precision": results_dict.get("precision", None),
+        # "recall": results_dict.get("recall", None)
+        "mse": results_dict["mse"],
+    }
+    
+    return formatted_results
+
+
+if __name__ == "__main__":
+    model_name = args.model_name
+    model_path = args.model_path
+    model_type = args.model_type
+    task_name = args.task_name
+    DO_TRAIN = args.do_train
+
+    task_folder = (
+        f"./results/{task_name}/"
+    )
+    model_folder = (
+        f"./results/{task_name}/{model_name}/"
+    )
+
+    os.makedirs(task_folder, exist_ok=True)
+    os.makedirs(model_folder, exist_ok=True)
+    
+    LoggerConfig.setup_logger(
+        logging.INFO,
+        f"results/{task_name}/{model_name}_seed_{seed}_{datetime.datetime.now().strftime('%Y-%m-%d,%H-%M-%S')}.log",
+    )
+
+    logger = LoggerConfig.get_logger("Agent")
+    log_arguments(logger, args)
+    
+    api = llm_wrapper_register.build(model_type)(model=model_name, path_name=model_path)
+    methods_run = []  # Track which methods were executed
+    
+    logger.info(f"=-=-=-=-=-=-=-=-=-=-=-={model_name}=-=-=-=-=-=-=-=-=-=-=-=")
+    logger.info(f"=-=-=-=-=-=-=-=-=-=-=-={task_name}=-=-=-=-=-=-=-=-=-=-=-=")
+
+    # Modify the execution flow to use toggle arguments and save results
+    if args.run_zero_shot:
+        method_name = "zero_shot_baseline"
+        methods_run.append(method_name)
+        logger.info(f"=-=-=-=-=-=-=-=-=-=-=-=Zero-shot baseline, seed {seed}=-=-=-=-=-=-=-=-=-=-=-=")
+        results = baseline(
+            0,
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            seed=seed,
+        )
+        logger.info(results)
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+    if args.run_few_shot:
+        method_name = "few_shot_baseline"
+        methods_run.append(method_name)
+        logger.info(f"=-=-=-=-=-=-=-=-=-=-=-=Few-shot baseline, seed {seed}=-=-=-=-=-=-=-=-=-=-=-=")
+        results = baseline(
+            3,
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            seed=seed,
+        )
+        logger.info(results)
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+    if args.run_zero_shot_gen:
+        method_name = "zero_shot_gen"
+        methods_run.append(method_name)
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=Zero-shot generation=-=-=-=-=-=-=-=-=-=-=-=")
+        if DO_TRAIN:
+            zero_shot_hyp(task_name=task_name, api=api, model_name=model_name)
+        results = get_res(
+            f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_zero_shot/hypotheses_training_sample_0_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=multihyp,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+    if args.run_only_paper:
+        method_name = "literature_only"
+        methods_run.append(method_name)
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=Literature-only=-=-=-=-=-=-=-=-=-=-=-=")
+        if DO_TRAIN:
+            only_paper(task_name=task_name, api=api, model_name=model_name)
+        results = get_res(
+            f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_only_paper/hypotheses_training_sample_0_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=multihyp,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+    if args.run_hypogenic:
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=Original HypoGeniC=-=-=-=-=-=-=-=-=-=-=-=")
+        if DO_TRAIN:
+            original_hypogenic(task_name=task_name, api=api, model_name=model_name)
+        
+        method_name = "hypogenic_no_update"
+        methods_run.append(method_name)
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=No Update=-=-=-=-=-=-=-=-=-=-=-=")
+        results = get_res(
+            f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}/hypotheses_training_sample_10_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=multihyp,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+        method_name = "hypogenic"
+        methods_run.append(method_name)
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=With Update=-=-=-=-=-=-=-=-=-=-=-=")
+        results = get_res(
+            f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=multihyp,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+    if args.run_hyporefine:
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=HypoRefine=-=-=-=-=-=-=-=-=-=-=-=")
+        if DO_TRAIN:
+            with_paper(task_name=task_name, api=api, model_name=model_name)
+        
+        method_name = "hyporefine_no_update"
+        methods_run.append(method_name)
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=No Update=-=-=-=-=-=-=-=-=-=-=-=")
+        results = get_res(
+            f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_with_paper/hypotheses_training_sample_10_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=multihyp,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+        
+        method_name = "hyporefine"
+        methods_run.append(method_name)
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=With Update=-=-=-=-=-=-=-=-=-=-=-=")
+        results = get_res(
+            f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_with_paper/hypotheses_training_sample_final_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=multihyp,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+    if args.run_union_hypo:
+        method_name = "union_hypogenic_paper"
+        methods_run.append(method_name)
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=Union HypoGeniC and Paper=-=-=-=-=-=-=-=-=-=-=-=")
+        if DO_TRAIN:
+            union_hypotheses(task_name=task_name, api=api, model_name=model_name, use_refine=False, prioritize='balanced')
+        union_postfix = "hypogenic_and_paper"
+        results = get_res(
+            f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_{union_postfix}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=multihyp,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+    if args.run_union_refine:
+        method_name = "union_hyporefine_paper"
+        methods_run.append(method_name)
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=Union HypoRefine and Paper=-=-=-=-=-=-=-=-=-=-=-=")
+        if DO_TRAIN:
+            union_hypotheses(task_name=task_name, api=api, model_name=model_name, use_refine=True, prioritize='balanced')
+        union_postfix = "refine_and_paper"
+        results = get_res(
+            f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_{union_postfix}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=multihyp,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+    if args.run_cross_model and "gpt" in model_type:
+        method_name = "cross_model_llama"
+        methods_run.append(method_name)
+        cross_model_name = "meta-llama/Meta-Llama-3.1-70B-Instruct"
+        logger.info(f"=-=-=-=-=-=-=-=-=-=-=-=Cross model {task_name}=-=-=-=-=-=-=-=-=-=-=-=")
+        results = get_res(
+            f"results/{task_name}/{cross_model_name}/hyp_{max_num_hypotheses}_{cross_model_postfix}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=multihyp,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+    elif args.run_cross_model:
+        method_name = "cross_model_gpt"
+        methods_run.append(method_name)
+        cross_model_name = "gpt-4o-mini"
+        logger.info(f"=-=-=-=-=-=-=-=-=-=-=-=Cross model {task_name}=-=-=-=-=-=-=-=-=-=-=-=")
+        results = get_res(
+            f"results/{task_name}/{cross_model_name}/hyp_{max_num_hypotheses}_{cross_model_postfix}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=multihyp,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+    if args.run_io_refine:
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=IO Iterative Refinement=-=-=-=-=-=-=-=-=-=-=-=")
+        if DO_TRAIN:
+            IO_iterative_refinement(task_name=task_name, api=api, model_name=model_name)
+
+        method_name = "io_prompting"
+        methods_run.append(method_name)
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=IO Prompting=-=-=-=-=-=-=-=-=-=-=-=")
+        results = get_res(
+            f"results/{task_name}/{model_name}/IO_refinement/hypotheses_training_sample_init_seed_{seed}_epoch_0.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=False,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+        
+        method_name = "io_refinement"
+        methods_run.append(method_name)
+        logger.info("=-=-=-=-=-=-=-=-=-=-=-=IO Iterative refinement=-=-=-=-=-=-=-=-=-=-=-=")
+        results = get_res(
+            f"results/{task_name}/{model_name}/IO_refinement/hypotheses_training_sample_final_seed_{seed}_epoch_2.json",
+            task_name=task_name,
+            api=api,
+            model_name=model_name,
+            use_val=use_val,
+            multihyp=False,
+        )
+        save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood)
+
+    # Combine all results into a single summary file
+    if methods_run:
+        combined_file = combine_results(task_name, model_name, methods_run, seed, use_ood=use_ood)
+        if combined_file:
+            logger.info(f"Combined results saved to: {combined_file}")
+
+    # Log total cost of the run
+    if model_type == 'gpt':
+        total_cost = api.get_cost()
+        logger.info(f"Total cost: {total_cost}")
+        
+        # Save cost information to a separate file, using seed and data type
+        data_type = "OOD" if use_ood else "IND"
+        cost_file = f"./results/{task_name}/{model_name}/cost_{data_type}_seed_{seed}.json"
+        with open(cost_file, 'w') as f:
+            json.dump({
+                "model": model_name,
+                "task": task_name,
+                "data_type": data_type,
+                "seed": seed,
+                "timestamp": datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
+                "total_cost_usd": total_cost
+            }, f, indent=2)
+
diff --git a/run_pipeline_regression.sh b/run_pipeline_regression.sh
new file mode 100755
index 0000000..5f556b9
--- /dev/null
+++ b/run_pipeline_regression.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+# Model settings
+MODEL_TYPE="gpt"
+MODEL_NAME="gpt-4o-mini"
+
+# MODEL_TYPE="vllm"  
+# MODEL_NAME="meta-llama/Meta-Llama-3.1-70B-Instruct" 
+# MODEL_PATH="/net/scratch/llama/Meta-Llama-3.1-70B-Instruct"  # only needed for local models
+
+# MODEL_TYPE="vllm"  
+# MODEL_NAME="Qwen/Qwen2.5-72B-Instruct" 
+# MODEL_PATH="/net/projects/chai-lab/shared_models/Qwen2.5-72B-Instruct"  # only needed for local models
+
+# MODEL_TYPE="vllm-dpskr1"  
+# MODEL_NAME="DeepSeek/DeepSeek-R1-Distill-Llama-70B-local" 
+# MODEL_PATH="/net/projects/chai-lab/shared_models/DeepSeek-R1-Distill-Llama-70B-local"  # only needed for local models
+
+# Define list of tasks to run
+TASKS=(
+    "regression/db-synth/adventure-travel_0_0"
+)
+
+# Define methods to run
+
+METHODS=(
+    "zero_shot"
+    "few_shot"
+    # "zero_shot_gen"
+    # "only_paper"
+    "hypogenic"
+    # "hyporefine"
+    # "union_hypo"
+    # "union_refine"
+    # "io_refine"
+)
+
+# Algorithm settings
+MAX_NUM_HYPOTHESES=5 #20
+NUM_TRAIN=20 #200
+NUM_TEST=30 #300
+SEED=42
+
+# Iterate through each task
+for TASK_NAME in "${TASKS[@]}"; do
+    echo "Running pipeline for task: $TASK_NAME"
+    
+    # Create command with base required arguments
+    CMD="python regression_pipeline.py \
+        --model_type ${MODEL_TYPE} \
+        --model_name ${MODEL_NAME} \
+        --task_name ${TASK_NAME} \
+        --seed ${SEED} \
+        --max_num_hypotheses ${MAX_NUM_HYPOTHESES} \
+        --num_train ${NUM_TRAIN} \
+        --num_test ${NUM_TEST} \
+        --regression"
+
+    if [ "${MODEL_TYPE}" = "vllm" ]; then
+        CMD="${CMD} --model_path ${MODEL_PATH}"
+    fi
+
+    # Default methods to run in pipeline
+    # Original commented version kept for reference
+    # CMD="${CMD} \
+    #     --run_zero_shot \
+    #     --run_few_shot \
+    #     --run_zero_shot_gen \
+    #     --run_hypogenic \
+    #     --do_train"
+
+    # Add methods dynamically
+    for METHOD in "${METHODS[@]}"; do
+        CMD="${CMD} --run_${METHOD}"
+    done
+    # IND setup
+    CMD="${CMD} 
+    --do_train
+    "
+
+    # OOD setup
+    # CMD="${CMD} 
+    # --use_ood
+    # "
+    
+    # Additional methods
+    # Uncomment if needed
+    # --run_only_paper \
+    # --run_hyperwrite \
+    # --run_notebooklm \
+    # --run_hyporefine \
+    # --run_union_hypo \
+    # --run_union_refine \
+    # --run_io_refine \
+    # --run_cross_model \
+    # --use_val \
+    # --multihyp \
+    # --use_refine
+
+    echo "Executing command: $CMD"
+    eval $CMD
+    
+    echo "Completed task: $TASK_NAME"
+    echo "----------------------------------------"
+done