# Source: elizaOS/eliza (develop branch), packages/benchmarks/bfcl/__init__.py
"""
BFCL Benchmark - Berkeley Function-Calling Leaderboard

This benchmark evaluates LLMs' function-calling (tool use) capabilities
across multiple dimensions including AST correctness, execution success,
and relevance detection.

Based on the BFCL benchmark from UC Berkeley's Sky Computing Lab.

Key Features:
- Multi-language support: Python, Java, JavaScript, SQL, REST API
- Multiple evaluation types: AST, Execution, Relevance Detection
- Parallel and sequential function calling
- Leaderboard-compatible scoring

Usage (run inside an async function, since `runner.run()` is awaited):
    from benchmarks.bfcl import BFCLRunner, BFCLConfig

    config = BFCLConfig()
    runner = BFCLRunner(config)
    results = await runner.run()

    print(f"Overall Score: {results.metrics.overall_score:.2%}")

CLI Usage:
    python -m benchmarks.bfcl --help
    python -m benchmarks.bfcl run --sample 50
    python -m benchmarks.bfcl run --full

Resources:
- Leaderboard: https://gorilla.cs.berkeley.edu/leaderboard
- GitHub: https://github.com/ShishirPatil/gorilla
- Dataset: https://huggingface.co/datasets/gorilla-llm/Berkeley-Function-Calling-Leaderboard
"""

from benchmarks.bfcl.types import (
    ArgumentValue,
    BFCLCategory,
    BFCLConfig,
    BFCLLanguage,
    BFCLMetrics,
    BFCLResult,
    BFCLTestCase,
    BFCLBenchmarkResults,
    BaselineScore,
    CategoryMetrics,
    EvaluationType,
    FunctionCall,
    FunctionDefinition,
    FunctionParameter,
    ResultDetails,
    LEADERBOARD_SCORES,
)
from benchmarks.bfcl.dataset import BFCLDataset
from benchmarks.bfcl.parser import FunctionCallParser
from benchmarks.bfcl.plugin import (
    BFCLPluginFactory,
    FunctionCallCapture,
    create_function_action,
    generate_function_schema,
    generate_openai_tools_format,
    get_call_capture,
)
from benchmarks.bfcl.agent import BFCLAgent, MockBFCLAgent
from benchmarks.bfcl.evaluators import (
    ASTEvaluator,
    ExecutionEvaluator,
    RelevanceEvaluator,
)
from benchmarks.bfcl.runner import BFCLRunner, run_bfcl_benchmark
from benchmarks.bfcl.metrics import MetricsCalculator
from benchmarks.bfcl.reporting import BFCLReporter, print_results

# Version string exposed as `benchmarks.bfcl.__version__`.
__version__ = "1.0.0"

# Public API of the package: the names bound by `from benchmarks.bfcl import *`.
# Every entry other than `__version__` is re-exported from the submodule named
# in its group comment (see the imports at the top of this file), so callers
# can import from `benchmarks.bfcl` directly instead of the individual
# submodules. Keep this a plain literal list and update it together with the
# import block when a re-export is added or removed.
__all__ = [
    # Version (defined in this module)
    "__version__",
    # Types (benchmarks.bfcl.types)
    "ArgumentValue",
    "BFCLCategory",
    "BFCLConfig",
    "BFCLLanguage",
    "BFCLMetrics",
    "BFCLResult",
    "BFCLTestCase",
    "BFCLBenchmarkResults",
    "BaselineScore",
    "CategoryMetrics",
    "EvaluationType",
    "FunctionCall",
    "FunctionDefinition",
    "FunctionParameter",
    "ResultDetails",
    "LEADERBOARD_SCORES",
    # Dataset (benchmarks.bfcl.dataset)
    "BFCLDataset",
    # Parser (benchmarks.bfcl.parser)
    "FunctionCallParser",
    # Plugin (benchmarks.bfcl.plugin)
    "BFCLPluginFactory",
    "FunctionCallCapture",
    "create_function_action",
    "generate_function_schema",
    "generate_openai_tools_format",
    "get_call_capture",
    # Agent (benchmarks.bfcl.agent)
    "BFCLAgent",
    "MockBFCLAgent",
    # Evaluators (benchmarks.bfcl.evaluators)
    "ASTEvaluator",
    "ExecutionEvaluator",
    "RelevanceEvaluator",
    # Runner (benchmarks.bfcl.runner)
    "BFCLRunner",
    "run_bfcl_benchmark",
    # Metrics (benchmarks.bfcl.metrics)
    "MetricsCalculator",
    # Reporting (benchmarks.bfcl.reporting)
    "BFCLReporter",
    "print_results",
]