diff --git a/hypogenic/algorithm/generation/__init__.py b/hypogenic/algorithm/generation/__init__.py index 741988d..de1ef70 100644 --- a/hypogenic/algorithm/generation/__init__.py +++ b/hypogenic/algorithm/generation/__init__.py @@ -3,5 +3,5 @@ generation_register = Register("generation") from .base import Generation -from .default import DefaultGeneration +from .default import DefaultGeneration, DefaultGenerationContinuous from . import utils diff --git a/hypogenic/algorithm/generation/base.py b/hypogenic/algorithm/generation/base.py index 0133762..998dd48 100644 --- a/hypogenic/algorithm/generation/base.py +++ b/hypogenic/algorithm/generation/base.py @@ -188,3 +188,92 @@ def make_hypotheses_bank( new_generated_hypotheses[hyp].set_example(ex) return new_generated_hypotheses + +''' + def make_hypotheses_bank_continuous( + self, + example_indices, + current_sample, + alpha, + hypotheses_list: List[str], + cache_seed=None, + max_concurrent=3, + **generate_kwargs + ): + """ + not actually used, just in case + """ + """ + Based on hypotheses generated by the LM, create new hypotheses_bank. + + Parameters: + example_indices: the indices of examples being used to generate hypotheses + current_sample: the current sample in data which the algorithm is on + num_hypotheses_generate: the number of hypotheses that we expect our repsonse to generate + hypotheses: a list of hypotheses generated by the LM + alpha: eploration constant in hypogenic reward funciton + cache_seed: If `None`, will not use cache, otherwise will use cache with corresponding seed number + max_concurrent: the maximum number of concurrent requests to make to the API + + Returns: + new_generated_hypotheses: A dictionary containing all newly generated hypotheses. The keys are the hypotheses and the values are the Summary Information class + """ + idx_hyp_pair = [] + new_generated_hypotheses = {} + + for hyp in hypotheses_list: + new_generated_hypotheses[hyp] = SummaryInformation( + hypothesis=hyp, acc=0, num_visits=0, reward=0, correct_examples=[] + ) + + for index in example_indices: + idx_hyp_pair.append((index, {hyp: new_generated_hypotheses[hyp]})) + + # ---------------------------------------------------------------------- + # We try to predict the ground truth labels + # ---------------------------------------------------------------------- + preds, labels = self.inference_class.batched_predict( + self.train_data, + idx_hyp_pair, + cache_seed=cache_seed, + max_concurrent=max_concurrent, + **generate_kwargs + ) + preds, labels = preds[::-1], labels[::-1] + + # ---------------------------------------------------------------------- + # Finding the accuracy and the correct examples for each hypothesis + # ---------------------------------------------------------------------- + for hyp in hypotheses_list: + correct = 0 + ex = [] + + loss = 0.0 + for index in example_indices: + prediction, actual_label = preds.pop(-1), labels.pop(-1) + if isinstance(prediction, str): + if prediction == "other": + prediction = -1 + else: + prediction = float(prediction) + if isinstance(actual_label, str): + actual_label = float(actual_label) + loss += (prediction - actual_label) ** 2 + loss = loss / len(example_indices) + acc = 1.0 / (1.0 + loss) + + # print(f"acc = {acc} for hypothesis: {hyp}") + + new_generated_hypotheses[hyp].set_accuracy(acc) + new_generated_hypotheses[hyp].set_num_visits(len(example_indices)) + + # hypogenic reward + reward = acc + alpha * math.sqrt( + math.log(current_sample) / len(example_indices) + ) + + new_generated_hypotheses[hyp].set_reward(reward) + new_generated_hypotheses[hyp].set_example(ex) + + return new_generated_hypotheses +''' \ No newline at end of file diff --git a/hypogenic/algorithm/generation/default.py b/hypogenic/algorithm/generation/default.py index bc33a3c..6d47089 100644 --- a/hypogenic/algorithm/generation/default.py +++ b/hypogenic/algorithm/generation/default.py @@ -155,3 +155,247 @@ def batched_hypothesis_generation( max_concurrent=max_concurrent, **generate_kwargs, ) + +@generation_register.register("default_continuous") +class DefaultGenerationContinuous(Generation): + """ + Add on extra functionality to the generation fucntion - we consider this + the "default" task. + """ + + def __init__( + self, + api, + # reward_a, + # reward_b, + prompt_class: BasePrompt, + inference_class: Inference, + task: BaseTask, + ): + """ + Parameters: + api: the language model that you're using, which may or may not be local + prompt_class: let's us know how the prompt is going to look + inference_class: gives us a way to predict labels for accuracy sake + task: determines the goal to accomplish + """ + super().__init__(api, prompt_class, inference_class, task) + # self.reward_a = reward_a + # self.reward_b = reward_b + + # ------------------------------------------------------------------------ # + # # + # ------------------------------------------------------------------------ # + # BATCH INITLALIZE HYPOTHESES # + # ------------------------------------------------------------------------ # + # # + # ------------------------------------------------------------------------ # + def batched_initialize_hypotheses( + self, + num_init, + init_batch_size, + init_hypotheses_per_batch, + cache_seed=None, + max_concurrent=3, + **generate_kwargs, + ): + """ + Batches the process of making new hypotheses + + Parameters: + num_init: the total amount of examples you want to use for initialize hypotheses + init_batch size: the number of examples that will be used to generate these hypotheses + init_hypotheses_per_batch: the amount of hypotheses that you want to generate per btach + cache_seed: If `None`, will not use cache, otherwise will use cache with corresponding seed number + max_concurrent: the maximum amount of concurrent calls to the API + + Returns: + hypotheses_bank: A dictionary with keys as hypotheses and the values as the Summary Information class + """ + # ---------------------------------------------------------------------- + # Finding batch size and confirming that it works + # ---------------------------------------------------------------------- + assert ( + num_init % init_batch_size == 0 + ), "Number of initial examples must be divisible by the batch size" + num_batches = num_init // init_batch_size + prompt_inputs = [] + + # ---------------------------------------------------------------------- + # Making the batch of the responses that will be used in the batch generation + # ---------------------------------------------------------------------- + for i in range(num_batches): + example_indices = list( + range(i * init_batch_size, (i + 1) * init_batch_size) + ) + # TODO: need copy()? + example_bank = ( + self.train_data.loc[list(example_indices)].copy().reset_index(drop=True) + ) + prompt_inputs.append( + self.prompt_class.batched_generation( + example_bank, init_hypotheses_per_batch + ) + ) + responses = self.api.batched_generate( + prompt_inputs, + cache_seed=cache_seed, + max_concurrent=max_concurrent, + **generate_kwargs, + ) + + hypotheses_list = list( + set( + [ + hyp + for response in responses + for hyp in extract_hypotheses(response, init_hypotheses_per_batch) + ] + ) + ) + + return hypotheses_list + + def make_hypotheses_bank( + self, + example_indices, + current_sample, + alpha, + hypotheses_list: List[str], + cache_seed=None, + max_concurrent=3, + **generate_kwargs + ): + """ + override make_hypotheses_bank for continuous value prediction + """ + """ + Based on hypotheses generated by the LM, create new hypotheses_bank. + + Parameters: + example_indices: the indices of examples being used to generate hypotheses + current_sample: the current sample in data which the algorithm is on + num_hypotheses_generate: the number of hypotheses that we expect our repsonse to generate + hypotheses: a list of hypotheses generated by the LM + alpha: eploration constant in hypogenic reward funciton + cache_seed: If `None`, will not use cache, otherwise will use cache with corresponding seed number + max_concurrent: the maximum number of concurrent requests to make to the API + + Returns: + new_generated_hypotheses: A dictionary containing all newly generated hypotheses. The keys are the hypotheses and the values are the Summary Information class + """ + idx_hyp_pair = [] + new_generated_hypotheses = {} + + for hyp in hypotheses_list: + new_generated_hypotheses[hyp] = SummaryInformation( + hypothesis=hyp, acc=0, num_visits=0, reward=0, correct_examples=[] + ) + + for index in example_indices: + idx_hyp_pair.append((index, {hyp: new_generated_hypotheses[hyp]})) + + # ---------------------------------------------------------------------- + # We try to predict the ground truth labels + # ---------------------------------------------------------------------- + preds, labels = self.inference_class.batched_predict( + self.train_data, + idx_hyp_pair, + cache_seed=cache_seed, + max_concurrent=max_concurrent, + **generate_kwargs + ) + preds, labels = preds[::-1], labels[::-1] + + # ---------------------------------------------------------------------- + # Finding the accuracy and the correct examples for each hypothesis + # ---------------------------------------------------------------------- + + # ---------------------------------------------------------------------- + # New reward = -MSE + exploration + # ---------------------------------------------------------------------- + + for hyp in hypotheses_list: + ex = [] + + acc = 0.0 + for index in example_indices: + prediction, actual_label = preds.pop(-1), labels.pop(-1) + if isinstance(prediction, str): + if prediction in ["other", "unknown"]: + prediction = self.task.y_mu + else: + prediction = float(prediction) + if isinstance(actual_label, str): + actual_label = float(actual_label) + # normalize pred and label + + prediction = (prediction - self.task.y_min) / (self.task.y_max - self.task.y_min) + actual_label = (actual_label - self.task.y_min) / (self.task.y_max - self.task.y_min) + + acc += float(- (actual_label - prediction) ** 2) + acc = acc / len(example_indices) + + print(f"acc = {acc} for hypothesis: {hyp}") + + new_generated_hypotheses[hyp].set_accuracy(acc) + new_generated_hypotheses[hyp].set_num_visits(len(example_indices)) + + # hypogenic reward + reward = acc + alpha * math.sqrt( + math.log(current_sample) / len(example_indices) + ) + + new_generated_hypotheses[hyp].set_reward(reward) + new_generated_hypotheses[hyp].set_example(ex) + + return new_generated_hypotheses + + + # ------------------------------------------------------------------------ # + # # + # ------------------------------------------------------------------------ # + # BATCHED_HYPOTHESIS GENERATION # + # ------------------------------------------------------------------------ # + # # + # ------------------------------------------------------------------------ # + def batched_hypothesis_generation( + self, + example_ids, + current_sample, + num_hypotheses_generate: int, + alpha: float, + cache_seed=None, + max_concurrent=3, + **generate_kwargs, + ): + """ + Generates new hypotheses for the given examples + + Parameters: + example_ids: The ids of the examples for which hypotheses need to be generated + current_sample: the current sample in data which the algorithm is on + num_hypotheses_generate: the number of hypotheses that we expect our response to generate + alpha: eploration constant in hypogenic reward funciton + cache_seed: If `None`, will not use cache, otherwise will use cache with corresponding seed number + max_concurrent: The maximum number of concurrent requests + + Returns: + hypotheses_bank: A dictionary with keys as hypotheses and the values as the Summary Information class + """ + new_hypotheses = self.batched_hyp_list_generation( + example_ids, + num_hypotheses_generate, + cache_seed=cache_seed, + **generate_kwargs, + ) + + return self.make_hypotheses_bank( + example_ids, + current_sample, + alpha, + new_hypotheses, + cache_seed=cache_seed, + max_concurrent=max_concurrent, + **generate_kwargs, + ) diff --git a/hypogenic/algorithm/summary_information.py b/hypogenic/algorithm/summary_information.py index 1b9e5bb..7f3ec38 100644 --- a/hypogenic/algorithm/summary_information.py +++ b/hypogenic/algorithm/summary_information.py @@ -60,12 +60,19 @@ def update_info_if_useful(self, current_example, alpha): def update_useful_examples(self, example, label): self.correct_examples.append((example, label)) - # In the update function, if it got the ith smample wrong, adjust accuracy accordinly + # In the update function, if it got the ith sample wrong, adjust accuracy accordinly def update_info_if_not_useful(self, current_example, alpha): self.acc = (self.acc * self.num_visits) / (self.num_visits + 1) self.num_visits += 1 self.update_reward(alpha, current_example) + def update_info_continuous(self, current_example, alpha, diff_value,): + self.acc = self.acc * self.num_visits + self.acc += float(- (diff_value ** 2)) + self.num_visits += 1 + self.acc = self.acc / self.num_visits + self.update_reward(alpha, current_example) + def __reduce__(self): return (self.__class__, (-1, self.acc, self.reward, self.num_visits)) diff --git a/hypogenic/algorithm/update/__init__.py b/hypogenic/algorithm/update/__init__.py index 0952ffa..45ed4b4 100644 --- a/hypogenic/algorithm/update/__init__.py +++ b/hypogenic/algorithm/update/__init__.py @@ -3,5 +3,5 @@ update_register = Register("update") from .base import Update -from .default import DefaultUpdate +from .default import DefaultUpdate, DefaultUpdateContinuous from .sampling import SamplingUpdate diff --git a/hypogenic/algorithm/update/default.py b/hypogenic/algorithm/update/default.py index 7b11386..631614a 100644 --- a/hypogenic/algorithm/update/default.py +++ b/hypogenic/algorithm/update/default.py @@ -217,3 +217,220 @@ def update( # Our new bank return hypotheses_bank + + +@update_register.register("default_continuous") +class DefaultUpdateContinuous(Update): + """ + DefaultUpdate uses ONE hypothesis to make a prediction on a new example. + """ + + def __init__( + self, + generation_class: Generation, + inference_class: Inference, + replace_class: Replace, + save_path: str, + file_name_template: str = "hypotheses_training_sample_${sample}_seed_${seed}_epoch_${epoch}.json", + sample_num_to_restart_from=-1, + num_init=25, + epoch_to_start_from=0, + num_wrong_scale=0.8, + k=-1, + alpha=5e-1, + update_batch_size=5, + num_hypotheses_to_update=5, + update_hypotheses_per_batch=5, + only_best_hypothesis=False, + save_every_n_examples=100, + wrong_value_threshold=0.5, + # reward_a=1, + # reward_b=0.0625, + ): + super().__init__( + generation_class, + inference_class, + replace_class, + save_path, + file_name_template, + sample_num_to_restart_from, + num_init, + epoch_to_start_from, + num_wrong_scale, + k, + alpha, + update_batch_size, + num_hypotheses_to_update, + update_hypotheses_per_batch, + only_best_hypothesis, + save_every_n_examples, + ) + self.wrong_value_threshold = wrong_value_threshold + # self.reward_a = reward_a + # self.reward_b = reward_b + + def update( + self, + hypotheses_bank: Dict[str, SummaryInformation], + current_epoch, + current_seed, + cache_seed=None, + max_concurrent=3, + **generate_kwargs, + ): + """ + We update the hypothesis bank once we reach a certain amount of regret + + Parameters: + hypotheses_bank: The hypothesis bank + current_epoch: The current epoch + current_seed: The current seed + cache_seed: If `None`, will not use cache, otherwise will use cache with corresponding seed number + max_concurrent: The maximum number of concurrent requests + """ + logger = LoggerConfig.get_logger(logger_name) + + # initialize variables + num_train_examples = len(self.train_data) + wrong_example_ids = set() + + # ---------------------------------------------------------------------- + # Figuring out starting samples + # ---------------------------------------------------------------------- + # go through training examples + # When restarting from epoch > 0, no need to start at num_init + # When not restarting, then default sample_num_to_restart_from = -1. start with num_init. + # For multiple epochs restarts, there should always be a non-negative sample_num_to_restart_from + if self.sample_num_to_restart_from >= 0: + start_sample = self.sample_num_to_restart_from + else: + start_sample = self.num_init + + # This is to check if we are running more epochs than the starting epoch, if so, start at sample 0 + # basically, if we've completed the starting epoch, we want to start the next one + if current_epoch > self.epoch_to_start_from: + start_sample = 0 + + # ---------------------------------------------------------------------- + # Creating the new hypotheses + # ---------------------------------------------------------------------- + # from the start to the end + for i in range(start_sample, num_train_examples): + # the 'i' here is the sample we are testing each of the top hypotheses + + current_sample = i + 1 + logger.info(f"Training on example {i}") + + # We need to get the best k for testing the strength of our hypothesis bank + top_k_hypotheses = sorted( + hypotheses_bank, key=lambda x: hypotheses_bank[x].reward, reverse=True + )[: self.k] + + # We are at the regret that we need in order to generate a new hypothesis + if self.num_wrong_scale > 0: + num_wrong_to_add_bank = ( + len(top_k_hypotheses) * i / num_train_examples + ) * self.num_wrong_scale + + # ------------------------------------------------------------------ + # We need to see how good our hypothesis is, which we do by way of the inference class + # ------------------------------------------------------------------ + num_wrong_hypotheses = 0 + preds, labels = self.inference_class.batched_predict( + self.train_data, + [ + (i, {hypothesis: hypotheses_bank[hypothesis]}) + for hypothesis in top_k_hypotheses + ], + cache_seed=cache_seed, + max_concurrent=max_concurrent, + **generate_kwargs, + ) + + # Comparison of the label and prediction + for pred, label, hypothesis in zip(preds, labels, top_k_hypotheses): + if isinstance(pred, str): + pred = float(pred) + if isinstance(label, str): + label = float(label) + # normalize + pred = (pred - self.generation_class.task.y_mu) / (self.generation_class.task.y_max - self.generation_class.task.y_min) + label = (label - self.generation_class.task.y_mu) / (self.generation_class.task.y_max - self.generation_class.task.y_min) + + if abs(pred - label) > self.wrong_value_threshold: + num_wrong_hypotheses += 1 + hypotheses_bank[hypothesis].update_info_continuous( + current_sample, self.alpha, abs(pred - label), + ) # let the bank know it got one wrong + else: + hypotheses_bank[hypothesis].update_info_continuous( + current_sample, self.alpha, abs(pred - label), + ) # let the bank know it got one right + + # keeping track of good examples as we do in generation + hypotheses_bank[hypothesis].update_useful_examples(i, label) + + # ------------------------------------------------------------------ + # Generating a new hypothesis + # ------------------------------------------------------------------ + + # if we get enough wrong examples as determined by num_wrong_to_add_bank, + # we need to generate new hypotheses + if ( + num_wrong_hypotheses >= num_wrong_to_add_bank + or len(top_k_hypotheses) == 0 + ): + + # We note it as a bad sample + wrong_example_ids.add(i) + if ( + len(wrong_example_ids) + == self.update_batch_size * self.num_hypotheses_to_update + ): + new_hyp_bank = {} + + # generate new hypotheses + for j in range(self.num_hypotheses_to_update): + # Go through poorly performing exmaples and generate hypotheses for them + # TODO: batched? + new_hypotheses = ( + self.generation_class.batched_hypothesis_generation( + wrong_example_ids, + current_sample, + self.update_hypotheses_per_batch, + self.alpha, + cache_seed=cache_seed, + max_concurrent=max_concurrent, + **generate_kwargs, + ) + ) + + # If we only take the best performing hypothesis from the batch + if self.only_best_hypothesis: + best_hypothesis = max( + new_hypotheses, key=lambda x: new_hypotheses[x].reward + ) + new_hyp_bank.update( + {best_hypothesis: new_hypotheses[best_hypothesis]} + ) + else: + new_hyp_bank.update(new_hypotheses) + # reset wrong examples to be empty + wrong_example_ids = set() + + # call replace class to update the bank + hypotheses_bank = self.replace_class.replace( + hypotheses_bank, new_hyp_bank + ) + + # save hypotheses to json + if (i + 1) % self.save_every_n_examples == 0: + self.save_to_json( + hypotheses_bank, + sample=i + 1, + seed=current_seed, + epoch=current_epoch, + ) + + # Our new bank + return hypotheses_bank \ No newline at end of file diff --git a/hypogenic/tasks.py b/hypogenic/tasks.py index 264600f..9bc0eda 100644 --- a/hypogenic/tasks.py +++ b/hypogenic/tasks.py @@ -22,6 +22,7 @@ def __init__( extract_label: Union[Callable[[str], str], None] = None, from_register: Union[Register, None] = None, use_ood: bool = False, + regression: bool = False, ): if from_register is None and extract_label is None: raise ValueError("Either from_register or extract_label should be provided") @@ -47,7 +48,12 @@ def __init__( self.test_data_path = data["ood_data_path"] self.val_data_path = data["ood_data_path"] - # getting omrpt templates from yaml file + if regression: + self.y_min = float(data["y_min"]) + self.y_max = float(data["y_max"]) + self.y_mu = float(data["y_mu"]) + + # getting prompt templates from yaml file self.prompt_template = data["prompt_templates"] # task label diff --git a/hypogenic/utils.py b/hypogenic/utils.py index 5007373..8c4d202 100644 --- a/hypogenic/utils.py +++ b/hypogenic/utils.py @@ -54,6 +54,29 @@ def get_results(pred_list, label_list): return {"accuracy": accuracy, "f1": f1} +def get_results_regression(pred_list, label_list): + """ + Compute MSE for regression. + """ + print('label_list: ', label_list) + print('pred_list: ', pred_list) + + # Check if labels can be turned into floats + try: + label_list = [float(label) for label in label_list] + except: + raise ValueError("Labels cannot be turned into floats") + + # if a prediction cannot be turned into a float, then it is 'unknown' and then we use the mean of the predictions as the prediction + mean_pred = np.mean([float(pred) for pred in pred_list if pred != 'unknown']) + print('mean_pred: ', mean_pred) + pred_list = [mean_pred if pred == 'unknown' else float(pred) for pred in pred_list] + print('pred_list: ', pred_list) + + mse = np.mean((np.array(label_list) - np.array(pred_list))**2) + + return {"mse": mse} + def set_seed(seed): logger = LoggerConfig.get_logger(logger_name) logger.info(f"Setting seed to {seed}") diff --git a/regression_pipeline.py b/regression_pipeline.py new file mode 100644 index 0000000..cccd160 --- /dev/null +++ b/regression_pipeline.py @@ -0,0 +1,1236 @@ +import datetime +import json +import logging +import os +import argparse +import pandas as pd + +from hypogenic.LLM_wrapper import ( + GPTWrapper, + LLMWrapper, + LocalVllmWrapper, + llm_wrapper_register, +) +from hypogenic.extract_label import extract_label_register +from hypogenic.utils import set_seed, get_results_regression +from hypogenic.prompt import BasePrompt +from hypogenic.tasks import BaseTask +from hypogenic.logger_config import LoggerConfig +from hypogenic.algorithm.summary_information import SummaryInformation + +from hypogenic.algorithm.update import ( + DefaultUpdate, + DefaultUpdateContinuous, + update_register, +) +from hypogenic.algorithm.replace import DefaultReplace +from hypogenic.algorithm.inference import ( + DefaultInference, +) +from hypogenic.algorithm.generation import ( + DefaultGeneration, + DefaultGenerationContinuous, + generation_register, +) +from hypogenic.algorithm.summary_information import SummaryInformation + +from hypothesis_agent.data_analysis_agent.generation import ( + TestGeneration, + OnlyPaperGeneration, + ZeroShotGeneration, +) +from hypothesis_agent.data_analysis_agent.inference import MultiHypDefaultInference +from hypothesis_agent.data_analysis_agent.update import TestUpdate +from hypothesis_agent.literature_review_agent import LiteratureAgent +from hypothesis_agent.literature_review_agent.literature_processor.extract_info import ( + BaseExtractor, + WholeExtractor, +) +from hypothesis_agent.literature_review_agent.literature_processor.summarize import ( + BaseSummarize, + LLMSummarize, +) +from hypothesis_agent.data_analysis_agent.utils import multiple_hypotheses_remove_repetition +from hypothesis_agent.data_analysis_agent.prompt import TestPrompt + + +from IO_prompting.prompt import IOPrompt +from IO_prompting.update import IOUpdate +from IO_prompting.generation import IOGeneration + + +parser = argparse.ArgumentParser() + +# Required base arguments +parser.add_argument("--model_type", type=str, required=True) +parser.add_argument("--model_name", type=str, required=True) +parser.add_argument("--task_name", type=str, required=True) +# This is needed for local models +parser.add_argument("--model_path", type=str) +parser.add_argument("--do_train", action="store_true", default=False) + +# Method toggle arguments +parser.add_argument("--run_zero_shot", action="store_true", help="Run zero-shot baseline") +parser.add_argument("--run_few_shot", action="store_true", help="Run few-shot baseline") +parser.add_argument("--run_zero_shot_gen", action="store_true", help="Run zero-shot generation") +parser.add_argument("--run_only_paper", action="store_true", help="Run literature-only") +parser.add_argument("--run_hyperwrite", action="store_true", help="Run HyperWrite") +parser.add_argument("--run_notebooklm", action="store_true", help="Run NotebookLM") +parser.add_argument("--run_hypogenic", action="store_true", help="Run original HypoGeniC") +parser.add_argument("--run_hyporefine", action="store_true", help="Run HypoRefine") +parser.add_argument("--run_union_hypo", action="store_true", help="Run Union HypoGeniC and Paper") +parser.add_argument("--run_union_refine", action="store_true", help="Run Union HypoRefine and Paper") +parser.add_argument("--run_cross_model", action="store_true", help="Run cross-model evaluation") +parser.add_argument("--run_io_refine", action="store_true", help="Run IO iterative refinement") + +# All algorithm-related arguments +parser.add_argument("--cross_model_postfix", type=str, default="hypogenic_and_paper") +parser.add_argument("--multihyp", action="store_true", default=True) +parser.add_argument("--use_val", action="store_true", default=False) +parser.add_argument("--max_num_hypotheses", type=int, default=10) +parser.add_argument("--num_init", type=int, default=10) +parser.add_argument("--num_train", type=int, default=20) +parser.add_argument("--num_test", type=int, default=20) +parser.add_argument("--num_val", type=int, default=200) +parser.add_argument("--k", type=int, default=10) +parser.add_argument("--alpha", type=float, default=5e-1) +parser.add_argument("--update_batch_size", type=int, default=10) +parser.add_argument("--num_hypotheses_to_update", type=int, default=1) +parser.add_argument("--update_hypotheses_per_batch", type=int, default=10) +parser.add_argument("--save_every_10_examples", type=int, default=10) +parser.add_argument("--init_batch_size", type=int, default=10) +parser.add_argument("--init_hypotheses_per_batch", type=int, default=10) +parser.add_argument("--cache_seed", type=int, default=None) +parser.add_argument("--temperature", type=float, default=1e-5) +parser.add_argument("--max_tokens", type=int, default=8000) +parser.add_argument("--use_refine", action="store_true", default=False) +parser.add_argument("--max_refine", type=int, default=6) +parser.add_argument("--seed", type=int, default=42) +parser.add_argument("--use_ood", action="store_true", default=False, help="Use out-of-distribution data for testing") +parser.add_argument("--regression", action="store_true", default=False, help="Use regression task") + +args = parser.parse_args() + + +cross_model_postfix = args.cross_model_postfix +multihyp = args.multihyp +use_val = args.use_val +use_ood = args.use_ood +regression = args.regression +max_num_hypotheses = args.max_num_hypotheses +num_init = args.num_init +num_train = args.num_train +num_test = args.num_test +num_val = args.num_val +k = args.k +alpha = args.alpha +update_batch_size = args.update_batch_size +num_hypotheses_to_update = args.num_hypotheses_to_update +update_hypotheses_per_batch = args.update_hypotheses_per_batch +save_every_10_examples = args.save_every_10_examples +init_batch_size = args.init_batch_size +init_hypotheses_per_batch = args.init_hypotheses_per_batch +cache_seed = args.cache_seed +temperature = args.temperature +max_tokens = args.max_tokens +use_refine = args.use_refine +max_refine = args.max_refine +seed = args.seed + + +def zero_shot_hyp(task_name, api, model_name): + output_folder = ( + f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_zero_shot/" + ) + os.makedirs(output_folder, exist_ok=True) + set_seed(seed) + + task = BaseTask( + config_path=f"./data/{task_name}/config.yaml", + from_register=extract_label_register, + use_ood=use_ood, + regression=regression + ) + + train_data, _, _ = task.get_data(num_train, num_test, num_val, seed=seed) + + prompt_class = TestPrompt(task) + inference_class = DefaultInference(api, prompt_class, train_data, task) + + generation_class = ZeroShotGeneration( + api=api, + prompt_class=prompt_class, + inference_class=inference_class, + task=task, + ) + + update_class = TestUpdate( + generation_class=generation_class, + inference_class=inference_class, + replace_class=DefaultReplace(max_num_hypotheses), + save_path=output_folder, + num_init=num_init, + k=k, + alpha=alpha, + update_batch_size=update_batch_size, + update_hypotheses_per_batch=update_hypotheses_per_batch, + num_hypotheses_to_update=num_hypotheses_to_update, + save_every_n_examples=save_every_10_examples, + ) + + hypotheses_bank = generation_class.initialize_hypotheses_0_shot( + num_hypotheses_generate=init_hypotheses_per_batch, + cache_seed=cache_seed, + temperature=temperature, + max_tokens=max_tokens, + max_concurrent=64, + ) + hypotheses_bank = {hyp: SummaryInformation() for hyp in hypotheses_bank} + update_class.save_to_json(hypotheses_bank, sample=0, seed=seed, epoch=0) + + +def only_paper(task_name, api, model_name): + output_folder = ( + f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_only_paper/" + ) + os.makedirs(output_folder, exist_ok=True) + set_seed(seed) + + task = BaseTask( + config_path=f"./data/{task_name}/config.yaml", + from_register=extract_label_register, + use_ood=use_ood, + regression=regression + ) + + train_data, _, _ = task.get_data(num_train, num_test, num_val, seed=seed) + + prompt_class = TestPrompt(task) + inference_class = DefaultInference(api, prompt_class, train_data, task) + summarize_class = LLMSummarize( + extractor=WholeExtractor(), api=api, prompt_class=prompt_class + ) + + literature_agent = LiteratureAgent( + api=api, + prompt_class=prompt_class, + summizer=summarize_class, + ) + literature_agent.summarize_papers( + data_file=f"./literature/{task_name}/processed", + cache_seed=cache_seed, + max_tokens=max_tokens, + temperature=temperature, + ) + literature_agent.save_paper_infos(os.path.join(output_folder, "paper_infos.json")) + + generation_class = OnlyPaperGeneration( + api=api, + prompt_class=prompt_class, + inference_class=inference_class, + task=task, + literature_agent=literature_agent, + ) + + update_class = TestUpdate( + generation_class=generation_class, + inference_class=inference_class, + replace_class=DefaultReplace(max_num_hypotheses), + save_path=output_folder, + num_init=num_init, + k=k, + alpha=alpha, + update_batch_size=update_batch_size, + num_hypotheses_to_update=num_hypotheses_to_update, + update_hypotheses_per_batch=update_hypotheses_per_batch, + save_every_n_examples=save_every_10_examples, + ) + + hypotheses_bank = generation_class.initialize_hypotheses_only_paper( + num_hypotheses_generate=init_hypotheses_per_batch, + cache_seed=cache_seed, + temperature=temperature, + max_tokens=max_tokens, + ) + hypotheses_bank = {hyp: SummaryInformation() for hyp in hypotheses_bank} + update_class.save_to_json(hypotheses_bank, sample=0, seed=seed, epoch=0) + + +def with_paper(task_name, api, model_name): + output_folder = ( + f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_with_paper/" + ) + os.makedirs(output_folder, exist_ok=True) + set_seed(seed) + + task = BaseTask( + config_path=f"./data/{task_name}/config.yaml", + from_register=extract_label_register, + use_ood=use_ood, + regression=regression + ) + + train_data, _, _ = task.get_data(num_train, num_test, num_val, seed=seed) + + prompt_class = TestPrompt(task) + inference_class = DefaultInference(api, prompt_class, train_data, task) + summarize_class = LLMSummarize( + extractor=WholeExtractor(), api=api, prompt_class=prompt_class + ) + + literature_agent = LiteratureAgent( + api=api, + prompt_class=prompt_class, + summizer=summarize_class, + ) + literature_agent.summarize_papers( + data_file=f"./literature/{task_name}/processed", + cache_seed=cache_seed, + max_tokens=max_tokens, + temperature=temperature, + ) + literature_agent.save_paper_infos(os.path.join(output_folder, "paper_infos.json")) + + generation_class = TestGeneration( + api=api, + prompt_class=prompt_class, + inference_class=inference_class, + task=task, + literature_agent=literature_agent, + max_refine=max_refine, + ) + + update_class = TestUpdate( + generation_class=generation_class, + inference_class=inference_class, + replace_class=DefaultReplace(max_num_hypotheses), + save_path=output_folder, + num_init=num_init, + k=k, + alpha=alpha, + update_batch_size=update_batch_size, + num_hypotheses_to_update=num_hypotheses_to_update, + update_hypotheses_per_batch=update_hypotheses_per_batch, + save_every_n_examples=save_every_10_examples, + ) + + hypotheses_bank = update_class.batched_initialize_hypotheses_with_paper( + num_init=num_init, + init_batch_size=init_batch_size, + init_hypotheses_per_batch=init_hypotheses_per_batch, + cache_seed=cache_seed, + temperature=temperature, + max_tokens=max_tokens, + max_concurrent=64, + ) + update_class.save_to_json(hypotheses_bank, sample=num_init, seed=seed, epoch=0) + + for epoch in range(1): + hypotheses_bank = update_class.update( + current_epoch=epoch, + hypotheses_bank=hypotheses_bank, + current_seed=seed, + cache_seed=cache_seed, + temperature=temperature, + max_tokens=max_tokens, + max_concurrent=64, + ) + update_class.save_to_json( + hypotheses_bank, + sample="final", + seed=seed, + epoch=epoch, + ) + + +def original_hypogenic(task_name, api, model_name): + output_folder = f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}/" + + os.makedirs(output_folder, exist_ok=True) + + task = BaseTask( + config_path=f"./data/{task_name}/config.yaml", + from_register=extract_label_register, + use_ood=use_ood, + regression=regression + ) + + set_seed(seed) + train_data, _, _ = task.get_data(num_train, num_test, num_val, seed) + prompt_class = BasePrompt(task) + inference_class = DefaultInference(api, prompt_class, train_data, task) + + class_build = "default" + if regression: + class_build = "default_continuous" + generation_class = generation_register.build(class_build)(api, prompt_class, inference_class, task) + + update_class = update_register.build(class_build)( + generation_class=generation_class, + inference_class=inference_class, + replace_class=DefaultReplace(max_num_hypotheses), + save_path=output_folder, + num_init=num_init, + k=k, + alpha=alpha, + update_batch_size=update_batch_size, + num_hypotheses_to_update=num_hypotheses_to_update, + save_every_n_examples=save_every_10_examples, + ) + + hypotheses_bank = {} + hypotheses_bank = update_class.batched_initialize_hypotheses( + num_init, + init_batch_size=init_batch_size, + init_hypotheses_per_batch=init_hypotheses_per_batch, + cache_seed=cache_seed, + temperature=temperature, + max_tokens=max_tokens, + max_concurrent=64, + ) + update_class.save_to_json( + hypotheses_bank, + sample=num_init, + seed=seed, + epoch=0, + ) + for epoch in range(1): + hypotheses_bank = update_class.update( + current_epoch=epoch, + hypotheses_bank=hypotheses_bank, + current_seed=seed, + cache_seed=cache_seed, + temperature=temperature, + max_tokens=max_tokens, + max_concurrent=64, + ) + update_class.save_to_json( + hypotheses_bank, + sample="final", + seed=seed, + epoch=epoch, + ) + + +def IO_iterative_refinement(task_name, api, model_name): + output_folder = f"./results/{task_name}/{model_name}/IO_refinement/" + + os.makedirs(output_folder, exist_ok=True) + + task = BaseTask( + config_path=f"./data/{task_name}/config.yaml", + from_register=extract_label_register, + use_ood=use_ood, + regression=regression + ) + + set_seed(seed) + train_data, _, _ = task.get_data(10, num_test, num_val, seed) + prompt_class = IOPrompt(task) + inference_class = DefaultInference(api, prompt_class, train_data, task) + generation_class = IOGeneration(api, prompt_class, inference_class, task) + + update_class = IOUpdate( + generation_class=generation_class, + inference_class=inference_class, + replace_class=DefaultReplace(max_num_hypotheses), + save_path=output_folder, + num_init=10, + k=k, + alpha=alpha, + update_batch_size=update_batch_size, + num_hypotheses_to_update=num_hypotheses_to_update, + save_every_n_examples=save_every_10_examples, + ) + + hypotheses_bank = {} + # Use the Qiu et al. 2024 paper hyperparameters + hypotheses_bank = update_class.batched_initialize_hypotheses( + num_init=10, + init_batch_size=num_init, + init_hypotheses_per_batch=5, + cache_seed=cache_seed, + temperature=0.7, + max_tokens=max_tokens, + max_concurrent=64, + ) + # only keep the hypothesis with the highest accuracy + # sorted_hypotheses = sorted( + # hypotheses_bank, key=lambda x: hypotheses_bank[x].acc, reverse=True + # ) + # hypotheses_bank = { + # sorted_hypotheses[0]: hypotheses_bank[sorted_hypotheses[0]] + # } + update_class.save_to_json( + hypotheses_bank, + sample="init", + seed=seed, + epoch=0, + ) + for epoch in range(3): + # if there exist a hypothesis with accuracy 1.0, stop the training + if any(hypotheses_bank[h].acc == 1.0 for h in hypotheses_bank): + update_class.save_to_json( + hypotheses_bank, + sample="final", + seed=seed, + epoch=2, + ) + break + # Else, iteratively refine + hypotheses_bank = update_class.update( + current_epoch=epoch, + hypotheses_bank=hypotheses_bank, + current_seed=seed, + cache_seed=cache_seed, + temperature=temperature, + max_tokens=max_tokens, + max_concurrent=64, + ) + # sorted_hypotheses = sorted( + # hypotheses_bank, key=lambda x: hypotheses_bank[x].acc, reverse=True + # ) + # hypotheses_bank = { + # sorted_hypotheses[0]: hypotheses_bank[sorted_hypotheses[0]] + # } + update_class.save_to_json( + hypotheses_bank, + sample="final", + seed=seed, + epoch=epoch, + ) + + +def union_hypotheses(task_name, api, model_name, use_refine=True, prioritize='balanced'): + + union_postfix = "refine_and_paper" if use_refine else "hypogenic_and_paper" + output_folder = f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_{union_postfix}" + + os.makedirs(output_folder, exist_ok=True) + + os.makedirs(f"./results/{task_name}/{model_name}/dedup_data_only", exist_ok=True) + os.makedirs(f"./results/{task_name}/{model_name}/dedup_paper_only", exist_ok=True) + + task = BaseTask( + config_path=f"./data/{task_name}/config.yaml", + from_register=extract_label_register, + use_ood=use_ood, + regression=regression + ) + + set_seed(seed) + prompt_class = TestPrompt(task) + + if use_refine: + data_hyp_file = f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_with_paper/hypotheses_training_sample_final_seed_{seed}_epoch_0.json" + else: + data_hyp_file = f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json" + + with open(data_hyp_file) as f: + hyp_dict = json.load(f) + data_hyp_bank = {} + for hypothesis in hyp_dict: + data_hyp_bank[hypothesis] = SummaryInformation.from_dict(hyp_dict[hypothesis]) + + paper_hyp_file = f"./results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_only_paper/hypotheses_training_sample_0_seed_{seed}_epoch_0.json" + + with open(paper_hyp_file) as f: + hyp_dict = json.load(f) + paper_hyp_bank = {} + for hypothesis in hyp_dict: + paper_hyp_bank[hypothesis] = SummaryInformation.from_dict(hyp_dict[hypothesis]) + + + unique_data_hyp_bank = multiple_hypotheses_remove_repetition( + prompt_class, + api, + data_hyp_bank, + cache_seed, + max_concurrent=64, + temperature=temperature, + max_tokens=max_tokens, + ) + unique_paper_hyp_bank = multiple_hypotheses_remove_repetition( + prompt_class, + api, + paper_hyp_bank, + cache_seed, + max_concurrent=64, + temperature=temperature, + max_tokens=max_tokens, + ) + data_dump_dict = {} + for hyp in unique_data_hyp_bank: + data_dump_dict[hyp] = {"hypothesis": hyp, "acc": unique_data_hyp_bank[hyp].acc} + + paper_dump_dict = {} + for hyp in unique_paper_hyp_bank: + paper_dump_dict[hyp] = {"hypothesis": hyp, "acc": unique_paper_hyp_bank[hyp].acc} + + with open(f"./results/{task_name}/{model_name}/dedup_data_only/hypotheses_training_sample_final_seed_{seed}_epoch_0.json", 'w') as file: + json.dump(data_dump_dict, file) + + with open(f"./results/{task_name}/{model_name}/dedup_paper_only/hypotheses_training_sample_final_seed_{seed}_epoch_0.json", 'w') as file: + json.dump(paper_dump_dict, file) + + unique_data_hyp_list = sorted(unique_data_hyp_bank, key=lambda x: unique_data_hyp_bank[x].acc, reverse=True) + unique_paper_hyp_list = sorted(unique_paper_hyp_bank, key=lambda x: unique_paper_hyp_bank[x].acc, reverse=True) + union_hyp_bank = {} + + if prioritize == "data": + while len(union_hyp_bank) < max_num_hypotheses and len(unique_data_hyp_list) > 0: + hyp = unique_data_hyp_list.pop(0) + union_hyp_bank[hyp] = unique_data_hyp_bank[hyp] + print("num from data: ", len(union_hyp_bank)) + while len(union_hyp_bank) < max_num_hypotheses and len(unique_paper_hyp_list) > 0: + hyp = unique_paper_hyp_list.pop(0) + union_hyp_bank[hyp] = unique_paper_hyp_bank[hyp] + elif prioritize == "paper": + while len(union_hyp_bank) < max_num_hypotheses and len(unique_paper_hyp_list) > 0: + hyp = unique_paper_hyp_list.pop(0) + union_hyp_bank[hyp] = unique_paper_hyp_bank[hyp] + print("num from paper: ", len(union_hyp_bank)) + while len(union_hyp_bank) < max_num_hypotheses and len(unique_data_hyp_list) > 0: + hyp = unique_data_hyp_list.pop(0) + union_hyp_bank[hyp] = unique_data_hyp_bank[hyp] + else: + while len(union_hyp_bank) < max_num_hypotheses // 2 and len(unique_data_hyp_list) > 0: + hyp = unique_data_hyp_list.pop(0) + union_hyp_bank[hyp] = unique_data_hyp_bank[hyp] + print("num from data: ", len(union_hyp_bank)) + while len(union_hyp_bank) < max_num_hypotheses and len(unique_paper_hyp_list) > 0: + hyp = unique_paper_hyp_list.pop(0) + union_hyp_bank[hyp] = unique_paper_hyp_bank[hyp] + + union_dump_dict = {} + for hyp in union_hyp_bank: + union_dump_dict[hyp] = {"hypothesis": hyp, "acc": union_hyp_bank[hyp].acc} + + with open(f'{output_folder}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json', 'w') as file: + json.dump(union_dump_dict, file) + + +def get_res(filename: str, task_name, api, model_name, use_val=False, multihyp=False): + logger = LoggerConfig.get_logger("Agent - get_res") + + set_seed(seed) + + task = BaseTask( + config_path=f"./data/{task_name}/config.yaml", + from_register=extract_label_register, + use_ood=use_ood, + regression=regression + ) + + train_data, test_data, val_data = task.get_data(num_train, num_test, num_val, seed) + if use_val: + test_data = val_data + + prompt_class = TestPrompt(task) + + + with open(filename) as f: + hyp_dict = json.load(f) + hyp_bank = {} + for hypothesis in hyp_dict: + hyp_bank[hypothesis] = SummaryInformation.from_dict(hyp_dict[hypothesis]) + + + if multihyp: + inference_class = MultiHypDefaultInference(api, prompt_class, train_data, task) + pred_list, label_list = inference_class.run_inference_final( + test_data, + hyp_bank, + cache_seed=cache_seed, + max_concurrent=64, + temperature=temperature, + max_tokens=max_tokens, + ) + + results_dict = get_results_regression(pred_list, label_list) + # f1 = results_dict["f1"] + # acc = results_dict["accuracy"] + mse = results_dict["mse"] + logger_str = "Results:\n" + # logger_str += f"Accuracy: {acc}\n" + # logger_str += f"F1: {f1}\n\n" + logger_str += f"MSE: {mse}\n\n" + logger.info(logger_str) + return results_dict # Return results dictionary + else: + inference_class = DefaultInference(api, prompt_class, train_data, task) + + hyp_list = [] + + for idx, hyp in enumerate(hyp_bank): + pred_list, label_list = inference_class.run_inference_final( + test_data, + {hyp: hyp_bank[hyp]}, + cache_seed=cache_seed, + max_concurrent=64, + temperature=temperature, + max_tokens=max_tokens, + ) + results_dict = get_results_regression(pred_list, label_list) + # hyp_list.append((hyp, results_dict["accuracy"], results_dict["f1"])) + hyp_list.append((hyp, results_dict["mse"])) + + # hyp_list = sorted(hyp_list, key=lambda x: x[2], reverse=True) + hyp_list = sorted(hyp_list, key=lambda x: x[1]) + logger_str = "Results:\n" + # for idx, (hyp, acc, f1) in enumerate(hyp_list): + # logger_str += f"{idx + 1}. {hyp}\n" + # logger_str += f"Train Accuracy: {hyp_bank[hyp].acc}\n" + # logger_str += f"Test Accuracy: {acc}\n" + # logger_str += f"Test F1: {f1}\n\n" + for idx, (hyp, mse) in enumerate(hyp_list): + logger_str += f"{idx + 1}. {hyp}\n" + logger_str += f"Train Exploitation (=-MSE): {hyp_bank[hyp].acc}\n" + logger_str += f"Test MSE: {mse}\n\n" + logger.info(logger_str) + + # Format results in a more structured way + # formatted_results = { + # "hypotheses": [ + # { + # "hypothesis": hyp, + # "train_accuracy": hyp_bank[hyp].acc, + # "test_accuracy": acc, + # "test_f1": f1 + # } for hyp, acc, f1 in hyp_list + # ], + # "best": { + # "hypothesis": hyp_list[0][0], + # "test_accuracy": hyp_list[0][1], + # "test_f1": hyp_list[0][2] + # } if hyp_list else {} + # } + formatted_results = { + "hypotheses": [ + { + "hypothesis": hyp, + "train_exploitation": hyp_bank[hyp].acc, + "test_mse": mse + } for hyp, mse in hyp_list + ], + "best": { + "hypothesis": hyp_list[0][0], + "test_mse": hyp_list[0][1] + } if hyp_list else {} + } + return formatted_results + + +def save_method_results(method_name, results, task_name, model_name, seed=42, timestamp=None, use_ood=False): + """Save results for a specific evaluation method to a JSON file.""" + # Create results directory if it doesn't exist + results_dir = f"./results/{task_name}/{model_name}/evaluation_results" + os.makedirs(results_dir, exist_ok=True) + + # Add OOD suffix if using out-of-distribution data + data_type = "OOD" if use_ood else "IND" + filename = f"{results_dir}/{method_name}_{data_type}_seed_{seed}.json" + current_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + + result_data = { + "method": method_name, + "task": task_name, + "model": model_name, + "data_type": data_type, + "seed": seed, + "timestamp": current_timestamp, + "results": results, + "regression": True # Hardcoded for now + } + + with open(filename, 'w') as f: + json.dump(result_data, f, indent=2) + + return filename + + +def combine_results(task_name, model_name, methods_run=None, seed=42, use_ood=False): + """Combine all evaluation results into a single summary file.""" + results_dir = f"./results/{task_name}/{model_name}/evaluation_results" + if not os.path.exists(results_dir): + return None + + data_type = "OOD" if use_ood else "IND" + result_files = [f for f in os.listdir(results_dir) if f.endswith('.json') and data_type in f] + + if methods_run is not None: + method_files = [] + for method in methods_run: + method_file = f"{method}_{data_type}_seed_{seed}.json" + if method_file in result_files: + method_files.append(method_file) + result_files = method_files + else: + # Filter by seed and data type + result_files = [f for f in result_files if f'seed_{seed}' in f and data_type in f] + + if not result_files: + return None + + # Combine results + combined_results = { + "task": task_name, + "model": model_name, + "data_type": data_type, + "seed": seed, + "timestamp": datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), + "methods": {}, + "regression": True # Hardcoded for now + } + + for filename in result_files: + with open(os.path.join(results_dir, filename), 'r') as f: + data = json.load(f) + method_name = data["method"] + combined_results["methods"][method_name] = data["results"] + + # Save combined results + combined_file = f"./results/{task_name}/{model_name}/combined_results_{data_type}_seed_{seed}.json" + with open(combined_file, 'w') as f: + json.dump(combined_results, f, indent=2) + + return combined_file + + +def log_arguments(logger, args): + """Log all configuration parameters in an organized way.""" + sections = { + "Run Configuration": [ + ("Model Type", args.model_type), + ("Model Name", args.model_name), + ("Model Path", args.model_path), + ("Task Name", args.task_name), + ("Do Train", args.do_train), + ("Use OOD", args.use_ood), + ("Regression", args.regression) + ], + "Method Configuration": [ + ("Run Zero Shot", args.run_zero_shot), + ("Run Few Shot", args.run_few_shot), + ("Run Zero Shot Gen", args.run_zero_shot_gen), + ("Run Only Paper", args.run_only_paper), + ("Run HyperWrite", args.run_hyperwrite), + ("Run NotebookLM", args.run_notebooklm), + ("Run HypoGeniC", args.run_hypogenic), + ("Run HypoRefine", args.run_hyporefine), + ("Run Union HypoGeniC", args.run_union_hypo), + ("Run Union HypoRefine", args.run_union_refine), + ("Run Cross Model", args.run_cross_model), + ("Run IO Refine", args.run_io_refine) + ], + "Algorithm Configuration": [ + ("Cross Model Postfix", cross_model_postfix), + ("Multi Hypothesis", multihyp), + ("Use Validation", use_val), + ("Max Num Hypotheses", max_num_hypotheses), + ("Num Init", num_init), + ("Num Train", num_train), + ("Num Test", num_test), + ("Num Val", num_val), + ("K", k), + ("Alpha", alpha), + ("Update Batch Size", update_batch_size), + ("Num Hypotheses to Update", num_hypotheses_to_update), + ("Update Hypotheses Per Batch", update_hypotheses_per_batch), + ("Save Every N Examples", save_every_10_examples), + ("Init Batch Size", init_batch_size), + ("Init Hypotheses Per Batch", init_hypotheses_per_batch), + ("Cache Seed", cache_seed), + ("Temperature", temperature), + ("Max Tokens", max_tokens), + ("Use Refine", use_refine), + ("Max Refine", max_refine), + ("Seed", seed) + ] + } + + for section, params in sections.items(): + logger.info(f"\n=== {section} ===") + for name, value in params: + logger.info(f"{name}: {value}") + logger.info("=====================\n") + + +def baseline(few_shot_k, task_name, api, model_name, seed=42, use_val=False): + def few_shot( + api: LLMWrapper, + train_data, + test_data, + prompt_class: BasePrompt, + task, + few_shot_k, + cache_seed, + ): + """ + Given one hyothesis and a dataset, return the accuracy of the hypothesis on the dataset. + """ + results = [] + prompt_inputs = [ + prompt_class.few_shot_baseline( + train_data.reset_index(drop=True), few_shot_k, test_data, i + ) + for i in range(len(test_data)) + ] + + # print( + # yaml.dump( + # prompt_inputs[0], + # default_flow_style=False, + # sort_keys=False, + # allow_unicode=True, + # Dumper=yaml.SafeDumper, + # ) + # ) + + responses = api.batched_generate( + prompt_inputs, cache_seed=cache_seed, max_concurrent=64, max_tokens=4000 + ) + for i in range(len(test_data)): + pred = task.extract_label(responses[i]) + label = test_data[task.label_name][i] + + results.append( + { + "prompt": prompt_inputs[i], + "response": responses[i], + "label": label, + "pred": pred, + } + ) + + return results + + def preprocess(train_data, k): + data = [] + + label_nunique = train_data[task.label_name].nunique() + label_unique = train_data[task.label_name].unique() + for i in range(k): + data.append( + train_data[train_data[task.label_name] == label_unique[i % label_nunique]].iloc[ + i // label_nunique + ] + ) + + return pd.DataFrame(data) + + set_seed(seed) + + task = BaseTask( + config_path=f"./data/{task_name}/config.yaml", + from_register=extract_label_register, + use_ood=use_ood, + regression=regression + ) + prompt_class = BasePrompt(task) + results_list = [] + train_data, test_data, val_data = task.get_data(num_train, num_test, num_val, seed) + if use_val: + test_data = val_data + + if few_shot_k > 0: + train_data = preprocess(train_data, few_shot_k) + + results = few_shot( + api, train_data, test_data, prompt_class, task, few_shot_k, cache_seed + ) + + labels = [result["label"] for result in results] + preds = [result["pred"] for result in results] + + results_dict = get_results_regression(preds, labels) + # results_list.append((None, results_dict["accuracy"], results_dict["f1"])) + results_list.append((None, results_dict["mse"])) + + # Format results in a structured way + formatted_results = { + "few_shot_k": few_shot_k, + # "accuracy": results_dict["accuracy"], + # "f1": results_dict["f1"], + # "precision": results_dict.get("precision", None), + # "recall": results_dict.get("recall", None) + "mse": results_dict["mse"], + } + + return formatted_results + + +if __name__ == "__main__": + model_name = args.model_name + model_path = args.model_path + model_type = args.model_type + task_name = args.task_name + DO_TRAIN = args.do_train + + task_folder = ( + f"./results/{task_name}/" + ) + model_folder = ( + f"./results/{task_name}/{model_name}/" + ) + + os.makedirs(task_folder, exist_ok=True) + os.makedirs(model_folder, exist_ok=True) + + LoggerConfig.setup_logger( + logging.INFO, + f"results/{task_name}/{model_name}_seed_{seed}_{datetime.datetime.now().strftime('%Y-%m-%d,%H-%M-%S')}.log", + ) + + logger = LoggerConfig.get_logger("Agent") + log_arguments(logger, args) + + api = llm_wrapper_register.build(model_type)(model=model_name, path_name=model_path) + methods_run = [] # Track which methods were executed + + logger.info(f"=-=-=-=-=-=-=-=-=-=-=-={model_name}=-=-=-=-=-=-=-=-=-=-=-=") + logger.info(f"=-=-=-=-=-=-=-=-=-=-=-={task_name}=-=-=-=-=-=-=-=-=-=-=-=") + + # Modify the execution flow to use toggle arguments and save results + if args.run_zero_shot: + method_name = "zero_shot_baseline" + methods_run.append(method_name) + logger.info(f"=-=-=-=-=-=-=-=-=-=-=-=Zero-shot baseline, seed {seed}=-=-=-=-=-=-=-=-=-=-=-=") + results = baseline( + 0, + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + seed=seed, + ) + logger.info(results) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + if args.run_few_shot: + method_name = "few_shot_baseline" + methods_run.append(method_name) + logger.info(f"=-=-=-=-=-=-=-=-=-=-=-=Few-shot baseline, seed {seed}=-=-=-=-=-=-=-=-=-=-=-=") + results = baseline( + 3, + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + seed=seed, + ) + logger.info(results) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + if args.run_zero_shot_gen: + method_name = "zero_shot_gen" + methods_run.append(method_name) + logger.info("=-=-=-=-=-=-=-=-=-=-=-=Zero-shot generation=-=-=-=-=-=-=-=-=-=-=-=") + if DO_TRAIN: + zero_shot_hyp(task_name=task_name, api=api, model_name=model_name) + results = get_res( + f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_zero_shot/hypotheses_training_sample_0_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=multihyp, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + if args.run_only_paper: + method_name = "literature_only" + methods_run.append(method_name) + logger.info("=-=-=-=-=-=-=-=-=-=-=-=Literature-only=-=-=-=-=-=-=-=-=-=-=-=") + if DO_TRAIN: + only_paper(task_name=task_name, api=api, model_name=model_name) + results = get_res( + f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_only_paper/hypotheses_training_sample_0_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=multihyp, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + if args.run_hypogenic: + logger.info("=-=-=-=-=-=-=-=-=-=-=-=Original HypoGeniC=-=-=-=-=-=-=-=-=-=-=-=") + if DO_TRAIN: + original_hypogenic(task_name=task_name, api=api, model_name=model_name) + + method_name = "hypogenic_no_update" + methods_run.append(method_name) + logger.info("=-=-=-=-=-=-=-=-=-=-=-=No Update=-=-=-=-=-=-=-=-=-=-=-=") + results = get_res( + f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}/hypotheses_training_sample_10_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=multihyp, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + method_name = "hypogenic" + methods_run.append(method_name) + logger.info("=-=-=-=-=-=-=-=-=-=-=-=With Update=-=-=-=-=-=-=-=-=-=-=-=") + results = get_res( + f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=multihyp, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + if args.run_hyporefine: + logger.info("=-=-=-=-=-=-=-=-=-=-=-=HypoRefine=-=-=-=-=-=-=-=-=-=-=-=") + if DO_TRAIN: + with_paper(task_name=task_name, api=api, model_name=model_name) + + method_name = "hyporefine_no_update" + methods_run.append(method_name) + logger.info("=-=-=-=-=-=-=-=-=-=-=-=No Update=-=-=-=-=-=-=-=-=-=-=-=") + results = get_res( + f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_with_paper/hypotheses_training_sample_10_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=multihyp, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + method_name = "hyporefine" + methods_run.append(method_name) + logger.info("=-=-=-=-=-=-=-=-=-=-=-=With Update=-=-=-=-=-=-=-=-=-=-=-=") + results = get_res( + f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_with_paper/hypotheses_training_sample_final_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=multihyp, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + if args.run_union_hypo: + method_name = "union_hypogenic_paper" + methods_run.append(method_name) + logger.info("=-=-=-=-=-=-=-=-=-=-=-=Union HypoGeniC and Paper=-=-=-=-=-=-=-=-=-=-=-=") + if DO_TRAIN: + union_hypotheses(task_name=task_name, api=api, model_name=model_name, use_refine=False, prioritize='balanced') + union_postfix = "hypogenic_and_paper" + results = get_res( + f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_{union_postfix}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=multihyp, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + if args.run_union_refine: + method_name = "union_hyporefine_paper" + methods_run.append(method_name) + logger.info("=-=-=-=-=-=-=-=-=-=-=-=Union HypoRefine and Paper=-=-=-=-=-=-=-=-=-=-=-=") + if DO_TRAIN: + union_hypotheses(task_name=task_name, api=api, model_name=model_name, use_refine=True, prioritize='balanced') + union_postfix = "refine_and_paper" + results = get_res( + f"results/{task_name}/{model_name}/hyp_{max_num_hypotheses}_{union_postfix}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=multihyp, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + if args.run_cross_model and "gpt" in model_type: + method_name = "cross_model_llama" + methods_run.append(method_name) + cross_model_name = "meta-llama/Meta-Llama-3.1-70B-Instruct" + logger.info(f"=-=-=-=-=-=-=-=-=-=-=-=Cross model {task_name}=-=-=-=-=-=-=-=-=-=-=-=") + results = get_res( + f"results/{task_name}/{cross_model_name}/hyp_{max_num_hypotheses}_{cross_model_postfix}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=multihyp, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + elif args.run_cross_model: + method_name = "cross_model_gpt" + methods_run.append(method_name) + cross_model_name = "gpt-4o-mini" + logger.info(f"=-=-=-=-=-=-=-=-=-=-=-=Cross model {task_name}=-=-=-=-=-=-=-=-=-=-=-=") + results = get_res( + f"results/{task_name}/{cross_model_name}/hyp_{max_num_hypotheses}_{cross_model_postfix}/hypotheses_training_sample_final_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=multihyp, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + if args.run_io_refine: + logger.info("=-=-=-=-=-=-=-=-=-=-=-=IO Iterative Refinement=-=-=-=-=-=-=-=-=-=-=-=") + if DO_TRAIN: + IO_iterative_refinement(task_name=task_name, api=api, model_name=model_name) + + method_name = "io_prompting" + methods_run.append(method_name) + logger.info("=-=-=-=-=-=-=-=-=-=-=-=IO Prompting=-=-=-=-=-=-=-=-=-=-=-=") + results = get_res( + f"results/{task_name}/{model_name}/IO_refinement/hypotheses_training_sample_init_seed_{seed}_epoch_0.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=False, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + method_name = "io_refinement" + methods_run.append(method_name) + logger.info("=-=-=-=-=-=-=-=-=-=-=-=IO Iterative refinement=-=-=-=-=-=-=-=-=-=-=-=") + results = get_res( + f"results/{task_name}/{model_name}/IO_refinement/hypotheses_training_sample_final_seed_{seed}_epoch_2.json", + task_name=task_name, + api=api, + model_name=model_name, + use_val=use_val, + multihyp=False, + ) + save_method_results(method_name, results, task_name, model_name, seed, use_ood=use_ood) + + # Combine all results into a single summary file + if methods_run: + combined_file = combine_results(task_name, model_name, methods_run, seed, use_ood=use_ood) + if combined_file: + logger.info(f"Combined results saved to: {combined_file}") + + # Log total cost of the run + if model_type == 'gpt': + total_cost = api.get_cost() + logger.info(f"Total cost: {total_cost}") + + # Save cost information to a separate file, using seed and data type + data_type = "OOD" if use_ood else "IND" + cost_file = f"./results/{task_name}/{model_name}/cost_{data_type}_seed_{seed}.json" + with open(cost_file, 'w') as f: + json.dump({ + "model": model_name, + "task": task_name, + "data_type": data_type, + "seed": seed, + "timestamp": datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), + "total_cost_usd": total_cost + }, f, indent=2) + diff --git a/run_pipeline_regression.sh b/run_pipeline_regression.sh new file mode 100755 index 0000000..5f556b9 --- /dev/null +++ b/run_pipeline_regression.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# Model settings +MODEL_TYPE="gpt" +MODEL_NAME="gpt-4o-mini" + +# MODEL_TYPE="vllm" +# MODEL_NAME="meta-llama/Meta-Llama-3.1-70B-Instruct" +# MODEL_PATH="/net/scratch/llama/Meta-Llama-3.1-70B-Instruct" # only needed for local models + +# MODEL_TYPE="vllm" +# MODEL_NAME="Qwen/Qwen2.5-72B-Instruct" +# MODEL_PATH="/net/projects/chai-lab/shared_models/Qwen2.5-72B-Instruct" # only needed for local models + +# MODEL_TYPE="vllm-dpskr1" +# MODEL_NAME="DeepSeek/DeepSeek-R1-Distill-Llama-70B-local" +# MODEL_PATH="/net/projects/chai-lab/shared_models/DeepSeek-R1-Distill-Llama-70B-local" # only needed for local models + +# Define list of tasks to run +TASKS=( + "regression/db-synth/adventure-travel_0_0" +) + +# Define methods to run + +METHODS=( + "zero_shot" + "few_shot" + # "zero_shot_gen" + # "only_paper" + "hypogenic" + # "hyporefine" + # "union_hypo" + # "union_refine" + # "io_refine" +) + +# Algorithm settings +MAX_NUM_HYPOTHESES=5 #20 +NUM_TRAIN=20 #200 +NUM_TEST=30 #300 +SEED=42 + +# Iterate through each task +for TASK_NAME in "${TASKS[@]}"; do + echo "Running pipeline for task: $TASK_NAME" + + # Create command with base required arguments + CMD="python regression_pipeline.py \ + --model_type ${MODEL_TYPE} \ + --model_name ${MODEL_NAME} \ + --task_name ${TASK_NAME} \ + --seed ${SEED} \ + --max_num_hypotheses ${MAX_NUM_HYPOTHESES} \ + --num_train ${NUM_TRAIN} \ + --num_test ${NUM_TEST} \ + --regression" + + if [ "${MODEL_TYPE}" = "vllm" ]; then + CMD="${CMD} --model_path ${MODEL_PATH}" + fi + + # Default methods to run in pipeline + # Original commented version kept for reference + # CMD="${CMD} \ + # --run_zero_shot \ + # --run_few_shot \ + # --run_zero_shot_gen \ + # --run_hypogenic \ + # --do_train" + + # Add methods dynamically + for METHOD in "${METHODS[@]}"; do + CMD="${CMD} --run_${METHOD}" + done + # IND setup + CMD="${CMD} + --do_train + " + + # OOD setup + # CMD="${CMD} + # --use_ood + # " + + # Additional methods + # Uncomment if needed + # --run_only_paper \ + # --run_hyperwrite \ + # --run_notebooklm \ + # --run_hyporefine \ + # --run_union_hypo \ + # --run_union_refine \ + # --run_io_refine \ + # --run_cross_model \ + # --use_val \ + # --multihyp \ + # --use_refine + + echo "Executing command: $CMD" + eval $CMD + + echo "Completed task: $TASK_NAME" + echo "----------------------------------------" +done