-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Expand file tree
/
Copy path__init__.py
More file actions
122 lines (111 loc) · 3.02 KB
/
__init__.py
File metadata and controls
122 lines (111 loc) · 3.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
BFCL Benchmark - Berkeley Function-Calling Leaderboard
This benchmark evaluates LLMs' function-calling (tool use) capabilities
across multiple dimensions including AST correctness, execution success,
and relevance detection.
Based on the BFCL benchmark from UC Berkeley's Sky Computing Lab.
Key Features:
- Multi-language support: Python, Java, JavaScript, SQL, REST API
- Multiple evaluation types: AST, Execution, Relevance Detection
- Parallel and sequential function calling
- Leaderboard-compatible scoring
Usage:
from benchmarks.bfcl import BFCLRunner, BFCLConfig
config = BFCLConfig()
runner = BFCLRunner(config)
results = await runner.run()
print(f"Overall Score: {results.metrics.overall_score:.2%}")
CLI Usage:
python -m benchmarks.bfcl --help
python -m benchmarks.bfcl run --sample 50
python -m benchmarks.bfcl run --full
Resources:
- Leaderboard: https://gorilla.cs.berkeley.edu/leaderboard
- GitHub: https://github.com/ShishirPatil/gorilla
- Dataset: https://huggingface.co/datasets/gorilla-llm/Berkeley-Function-Calling-Leaderboard
"""
from benchmarks.bfcl.types import (
ArgumentValue,
BFCLCategory,
BFCLConfig,
BFCLLanguage,
BFCLMetrics,
BFCLResult,
BFCLTestCase,
BFCLBenchmarkResults,
BaselineScore,
CategoryMetrics,
EvaluationType,
FunctionCall,
FunctionDefinition,
FunctionParameter,
ResultDetails,
LEADERBOARD_SCORES,
)
from benchmarks.bfcl.dataset import BFCLDataset
from benchmarks.bfcl.parser import FunctionCallParser
from benchmarks.bfcl.plugin import (
BFCLPluginFactory,
FunctionCallCapture,
create_function_action,
generate_function_schema,
generate_openai_tools_format,
get_call_capture,
)
from benchmarks.bfcl.agent import BFCLAgent, MockBFCLAgent
from benchmarks.bfcl.evaluators import (
ASTEvaluator,
ExecutionEvaluator,
RelevanceEvaluator,
)
from benchmarks.bfcl.runner import BFCLRunner, run_bfcl_benchmark
from benchmarks.bfcl.metrics import MetricsCalculator
from benchmarks.bfcl.reporting import BFCLReporter, print_results
# Version string for this package; it is also re-exported through __all__ below.
__version__ = "1.0.0"
# Public API of the package: the names exported by `from benchmarks.bfcl import *`.
# Entries are grouped by the submodule they are imported from at the top of
# this file. Keep this list in sync with those imports when adding or
# removing a re-export.
__all__ = [
# Version
"__version__",
# Types (benchmarks.bfcl.types)
"ArgumentValue",
"BFCLCategory",
"BFCLConfig",
"BFCLLanguage",
"BFCLMetrics",
"BFCLResult",
"BFCLTestCase",
"BFCLBenchmarkResults",
"BaselineScore",
"CategoryMetrics",
"EvaluationType",
"FunctionCall",
"FunctionDefinition",
"FunctionParameter",
"ResultDetails",
"LEADERBOARD_SCORES",
# Dataset (benchmarks.bfcl.dataset)
"BFCLDataset",
# Parser (benchmarks.bfcl.parser)
"FunctionCallParser",
# Plugin (benchmarks.bfcl.plugin)
"BFCLPluginFactory",
"FunctionCallCapture",
"create_function_action",
"generate_function_schema",
"generate_openai_tools_format",
"get_call_capture",
# Agent (benchmarks.bfcl.agent)
"BFCLAgent",
"MockBFCLAgent",
# Evaluators (benchmarks.bfcl.evaluators)
"ASTEvaluator",
"ExecutionEvaluator",
"RelevanceEvaluator",
# Runner (benchmarks.bfcl.runner)
"BFCLRunner",
"run_bfcl_benchmark",
# Metrics (benchmarks.bfcl.metrics)
"MetricsCalculator",
# Reporting (benchmarks.bfcl.reporting)
"BFCLReporter",
"print_results",
]