Skip to content

Commit 49f199a

Browse files
patniko and Copilot committed
Strengthen tools scenario verifications
- tool-filtering: use word-boundary grep (-w) for blacklisted tools to avoid false positives on substrings like 'bashing'
- no-tools: change question to directly request bash tool usage; update verify grep to check for inability patterns (can't, cannot, unable)
- virtual-filesystem: require both 'Virtual filesystem contents' AND 'plan.md' in output; fix dead elif branch
- custom-agents: tighten grep to only match 'researcher' or 'Research' instead of also matching generic tool names
- skills: add lowercase 'skill' to grep pattern for broader matching
- mcp-servers: replace soft-pass (non-empty output) with meaningful content grep; add separate failure message for pattern mismatch

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent a00d0d7 commit 49f199a

10 files changed

Lines changed: 16 additions & 12 deletions

File tree

test/scenarios/tools/custom-agents/verify.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ run_with_timeout() {
6969

7070
# Check that the response mentions the researcher agent or its tools
7171
if [ "$code" -eq 0 ] && [ -n "$output" ]; then
72-
if echo "$output" | grep -qi "research\|researcher\|grep\|glob\|view"; then
72+
if echo "$output" | grep -qi "researcher\|Research"; then
7373
echo "$name passed (confirmed custom agent)"
7474
PASS=$((PASS + 1))
7575
else

test/scenarios/tools/mcp-servers/verify.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,17 @@ run_with_timeout() {
6767

6868
echo "$output"
6969

70-
if [ "$code" -eq 0 ] && [ -n "$output" ]; then
71-
echo "$name passed (got response)"
70+
if [ "$code" -eq 0 ] && [ -n "$output" ] && echo "$output" | grep -qi "MCP\|mcp\|capital\|France\|Paris\|configured"; then
71+
echo "$name passed (got meaningful response)"
7272
PASS=$((PASS + 1))
7373
elif [ "$code" -eq 124 ]; then
7474
echo "$name failed (timed out after ${TIMEOUT}s)"
7575
FAIL=$((FAIL + 1))
7676
ERRORS="$ERRORS\n - $name (timeout)"
77+
elif [ "$code" -eq 0 ]; then
78+
echo "$name failed (expected pattern not found)"
79+
FAIL=$((FAIL + 1))
80+
ERRORS="$ERRORS\n - $name"
7781
else
7882
echo "$name failed (exit code $code)"
7983
FAIL=$((FAIL + 1))

test/scenarios/tools/no-tools/csharp/Program.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ You can only respond with text based on your training data.
3030

3131
var response = await session.SendAndWaitAsync(new MessageOptions
3232
{
33-
Prompt = "What tools do you have available? List them.",
33+
Prompt = "Use the bash tool to run 'echo hello'.",
3434
});
3535

3636
if (response != null)

test/scenarios/tools/no-tools/go/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ func main() {
3939
defer session.Destroy()
4040

4141
response, err := session.SendAndWait(ctx, copilot.MessageOptions{
42-
Prompt: "What tools do you have available? List them.",
42+
Prompt: "Use the bash tool to run 'echo hello'.",
4343
})
4444
if err != nil {
4545
log.Fatal(err)

test/scenarios/tools/no-tools/python/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ async def main():
2424
)
2525

2626
response = await session.send_and_wait(
27-
{"prompt": "What tools do you have available? List them."}
27+
{"prompt": "Use the bash tool to run 'echo hello'."}
2828
)
2929

3030
if response:

test/scenarios/tools/no-tools/typescript/src/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ async function main() {
1919
});
2020

2121
const response = await session.sendAndWait({
22-
prompt: "What tools do you have available? List them.",
22+
prompt: "Use the bash tool to run 'echo hello'.",
2323
});
2424

2525
if (response) {

test/scenarios/tools/no-tools/verify.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ run_with_timeout() {
6969

7070
# Check that the response indicates no tools are available
7171
if [ "$code" -eq 0 ] && [ -n "$output" ]; then
72-
if echo "$output" | grep -qi "no tool\|not have\|don't have\|do not have\|no .* tools\|cannot\|not available\|none"; then
72+
if echo "$output" | grep -qi "no tool\|can't\|cannot\|unable\|don't have\|do not have\|not available"; then
7373
echo "$name passed (confirmed no tools)"
7474
PASS=$((PASS + 1))
7575
else

test/scenarios/tools/skills/verify.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ run_with_timeout() {
6868
echo "$output"
6969

7070
if [ "$code" -eq 0 ] && [ -n "$output" ]; then
71-
if echo "$output" | grep -qi "Skill directories configured\|Alice\|greeting"; then
71+
if echo "$output" | grep -qi "skill\|Skill\|greeting\|Alice"; then
7272
echo "$name passed (confirmed skill execution)"
7373
PASS=$((PASS + 1))
7474
else

test/scenarios/tools/tool-filtering/verify.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ run_with_timeout() {
7575
if echo "$output" | grep -qi "grep\|glob\|view"; then
7676
has_whitelisted=true
7777
fi
78-
if echo "$output" | grep -qi "bash\|edit\|create_file"; then
78+
if echo "$output" | grep -qiw "bash\|edit\|create_file"; then
7979
has_blacklisted=true
8080
fi
8181

test/scenarios/tools/virtual-filesystem/verify.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,10 @@ run_with_timeout() {
6868
echo "$output"
6969

7070
if [ "$code" -eq 0 ] && [ -n "$output" ]; then
71-
if echo "$output" | grep -qi "Virtual filesystem contents"; then
71+
if echo "$output" | grep -qi "Virtual filesystem contents" && echo "$output" | grep -qi "plan\.md"; then
7272
echo "$name passed (virtual FS operations confirmed)"
7373
PASS=$((PASS + 1))
74-
elif [ "$code" -eq 0 ] && [ -n "$output" ]; then
74+
else
7575
echo "$name failed (expected pattern not found)"
7676
FAIL=$((FAIL + 1))
7777
ERRORS="$ERRORS\n - $name"

0 commit comments

Comments
 (0)