-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgai_test.py
More file actions
169 lines (126 loc) · 5.52 KB
/
gai_test.py
File metadata and controls
169 lines (126 loc) · 5.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import logging
import sys
import time
import pandas as pd
import pytest
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import OllamaLLM
from arena.arena import parse_stimulus_matrix, SheetInvocation, Sheet, run_sheets, collect_actuation_sheets
from arena.engine.adaptation import SingleFunctionAdaptationStrategy
from arena.engine.artifacts import write_modules_and_import_cuts
from arena.engine.ssntestdriver import InvocationListener
from arena.provider.gai import prompt_code_units
# Route root-logger output to stdout at DEBUG level so pytest runs show the
# full trace of LLM prompting and sheet execution.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
_stdout_handler = logging.StreamHandler(sys.stdout)
_stdout_handler.setLevel(logging.DEBUG)
_stdout_handler.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
)
logger.addHandler(_stdout_handler)
@pytest.mark.skip(reason="depends on external Ollama service")
def test_prompt_code():
    """
    Obtain code solutions from an LLM via Ollama, execute them against a
    stimulus sheet, and check the shape of the resulting matrices.

    The prompt embeds the full target signature, so the generated solutions
    are expected to expose ``greatest_common_divisor`` directly and need no
    adaptation.  Requires a local Ollama server with the ``llama3.1`` model
    pulled, hence skipped by default.
    :return: None
    """
    # use this for OpenAI instead of Ollama
    # os.environ["OPENAI_API_KEY"] = "demo"  # FIXME your API KEY
    # llm = OpenAI(
    #     model="gpt-4o-mini"
    # )
    llm = OllamaLLM(model="llama3.1")  # localhost
    template = 'def greatest_common_divisor(a: int, b: int) -> int: """ Return a greatest common divisor of two integers a and b >>> greatest_common_divisor(3, 5) 1 >>> greatest_common_divisor(25, 15) 5 """'
    prompt = ChatPromptTemplate.from_template(template)
    code_solutions = prompt_code_units(llm, prompt, samples=3)  # let's obtain 3
    logger.debug("solutions %s", code_solutions)
    # millisecond timestamp gives each run its own scratch folder
    target_folder = f"/tmp/arena-python-{round(time.time() * 1000)}"
    # classes under test
    cuts = write_modules_and_import_cuts(target_folder, code_solutions)
    logger.debug("cuts %s", cuts)
    # lql (interface specification)
    lql = """GCD {
        greatest_common_divisor(int,int)->int
    }
    """
    # stimulus sheet — equivalent SSN JSONL notation for reference:
    # {"cells": {"A1": {}, "B1": "create", "C1": "GCD"}}
    # {"cells": {"A2": "5", "B2": "greatest_common_divisor", "C2": "A1", "D2": "25", "E2": "15"}}
    # stimulus sheet (as a data frame)
    ss = pd.DataFrame([
        {"A": {}, "B": "create", "C": "GCD", "D": None, "E": None},
        {"A": "5", "B": "greatest_common_divisor", "C": "A1", "D": "25", "E": "15"},
    ])
    # create stimulus matrix: one sheet x three code candidates
    sm = parse_stimulus_matrix([Sheet("test1()", ss, lql)], cuts, [SheetInvocation("test1", "")])
    logger.debug(sm.to_string())
    assert len(sm.columns) == 3  # one column per code solution
    assert len(sm.index) == 1  # one stimulus sheet
    # run stimulus matrix
    invocation_listener = InvocationListener()
    srm = run_sheets(sm, 1, invocation_listener)
    # results based on internal ExecutedInvocation
    logger.debug(srm.to_string())
    assert len(srm.columns) == 3
    assert len(srm.index) == 1
    # create actuation sheets, now we have the real stimulus response matrix (SRM)
    srm_actuations = collect_actuation_sheets(srm)
    logger.debug(srm_actuations.to_string())
    assert len(srm_actuations.columns) == 3
    assert len(srm_actuations.index) == 1
@pytest.mark.skip(reason="depends on external Ollama service")
def test_prompt_code_SingleFunctionAdaptationStrategy():
    """
    Obtain code solutions from an LLM via Ollama, execute them against a
    stimulus sheet, and check the shape of the resulting matrices.

    BUT: the prompt does not fix a function name, so generated solutions may
    use arbitrary names and must be ADAPTED to the LQL interface via
    SingleFunctionAdaptationStrategy().  Requires a local Ollama server with
    the ``llama3.1`` model pulled, hence skipped by default.
    :return: None
    """
    # use this for OpenAI instead of Ollama
    # os.environ["OPENAI_API_KEY"] = "demo"  # FIXME your API KEY
    # llm = OpenAI(
    #     model="gpt-4o-mini"
    # )
    llm = OllamaLLM(model="llama3.1")  # localhost
    template = 'Return a greatest common divisor of two integers a and b."""'  # does not assume any function name
    prompt = ChatPromptTemplate.from_template(template)
    code_solutions = prompt_code_units(llm, prompt, samples=3)  # let's obtain 3
    logger.debug("solutions %s", code_solutions)
    # millisecond timestamp gives each run its own scratch folder
    target_folder = f"/tmp/arena-python-{round(time.time() * 1000)}"
    # classes under test
    cuts = write_modules_and_import_cuts(target_folder, code_solutions)
    logger.debug("cuts %s", cuts)
    # lql (interface specification)
    lql = """GCD {
        greatest_common_divisor(int,int)->int
    }
    """
    # stimulus sheet — equivalent SSN JSONL notation for reference:
    # {"cells": {"A1": {}, "B1": "create", "C1": "GCD"}}
    # {"cells": {"A2": "5", "B2": "greatest_common_divisor", "C2": "A1", "D2": "25", "E2": "15"}}
    # stimulus sheet (as a data frame)
    ss = pd.DataFrame([
        {"A": {}, "B": "create", "C": "GCD", "D": None, "E": None},
        {"A": "5", "B": "greatest_common_divisor", "C": "A1", "D": "25", "E": "15"},
    ])
    # create stimulus matrix: one sheet x three code candidates
    sm = parse_stimulus_matrix([Sheet("test1()", ss, lql)], cuts, [SheetInvocation("test1", "")])
    logger.debug(sm.to_string())
    assert len(sm.columns) == 3  # one column per code solution
    assert len(sm.index) == 1  # one stimulus sheet
    # run stimulus matrix with explicit adaptation of mismatching names
    invocation_listener = InvocationListener()
    adaptation_strategy = SingleFunctionAdaptationStrategy()
    srm = run_sheets(sm, 1, invocation_listener, False, adaptation_strategy)
    # results based on internal ExecutedInvocation
    logger.debug(srm.to_string())
    assert len(srm.columns) == 3
    assert len(srm.index) == 1
    # create actuation sheets, now we have the real stimulus response matrix (SRM)
    srm_actuations = collect_actuation_sheets(srm)
    logger.debug(srm_actuations.to_string())
    assert len(srm_actuations.columns) == 3
    assert len(srm_actuations.index) == 1