-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_precise_tokens.py
More file actions
208 lines (159 loc) Β· 8.32 KB
/
test_precise_tokens.py
File metadata and controls
208 lines (159 loc) Β· 8.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/usr/bin/env python3
"""
Test script for precise token counting implementation.
This script tests the new precise token counting features including:
- Non-streaming API calls for accurate token usage
- Hybrid token counter with caching
- Real vs estimated token comparison
"""
import asyncio
import logging
import os
from dotenv import load_dotenv
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file
# before any test code reads them via os.getenv().
load_dotenv()
# Configure logging once at import time; timestamps + logger name help
# correlate output with the live API calls made by the tests below.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger, per stdlib convention.
logger = logging.getLogger(__name__)
async def test_precise_token_counting():
    """Exercise the precise token counting stack end to end.

    Runs six sub-tests, each issuing real model calls:
      1. Direct ``PreciseTokenCounter`` on a minimal two-message conversation.
      2. ``HybridTokenCounter`` caching — same cache key used twice.
      3. Precise counting on a longer multi-turn conversation.
      4. Provider-specific (OpenAI) token extraction on a one-word prompt.
      5. Cache hit vs. miss timing comparison.
      6. Precise counts vs. tiktoken estimates across message lengths.

    Returns:
        bool: True if every sub-test completed without raising,
        False otherwise (the traceback is printed to stdout).

    NOTE(review): requires OPENAI_API_KEY in the environment and network
    access; nothing here is mocked.
    """
    print("🎯 TESTING PRECISE TOKEN COUNTING")
    print("=" * 60)
    try:
        # Imports are deferred so a missing optional dependency surfaces as
        # a reported test failure rather than an import-time crash.
        # (Removed previously unused imports: MCPClient, MCPAgent, HttpConnector.)
        from langchain_openai import ChatOpenAI
        from mcp_use.token_counting.precise_counting import PreciseTokenCounter, HybridTokenCounter
        from langchain.schema import HumanMessage, SystemMessage, AIMessage

        # temperature=0 keeps responses (and therefore output token counts)
        # as deterministic as the API allows.
        llm = ChatOpenAI(
            model="gpt-4o",
            temperature=0,
            openai_api_key=os.getenv("OPENAI_API_KEY")
        )
        print(f"✅ LLM initialized: {llm.model_name}")

        # Test 1: Direct precise token counter
        print("\n📊 Test 1: Direct Precise Token Counter")
        print("-" * 40)
        precise_counter = PreciseTokenCounter(llm)
        test_messages = [
            SystemMessage(content="You are a helpful assistant."),
            HumanMessage(content="What is the capital of France?")
        ]
        print("🎯 Testing precise token counting with minimal messages...")
        precise_usage = await precise_counter.count_tokens_precise(test_messages)
        print("✅ Precise token count:")
        print(f"   • Input tokens: {precise_usage.input_tokens}")
        print(f"   • Output tokens: {precise_usage.output_tokens}")
        print(f"   • Tool tokens: {precise_usage.tool_tokens}")
        print(f"   • Total tokens: {precise_usage.total_tokens}")

        # Test 2: Hybrid token counter with caching — the second call with
        # the same cache key must return the cached result.
        print("\n📊 Test 2: Hybrid Token Counter with Caching")
        print("-" * 40)
        hybrid_counter = HybridTokenCounter(llm)
        print("🎯 First call (should make API request)...")
        usage1 = await hybrid_counter.count_with_caching(test_messages, cache_key="test1")
        print("🎯 Second call with same cache key (should use cache)...")
        usage2 = await hybrid_counter.count_with_caching(test_messages, cache_key="test1")
        print(f"✅ First call: {usage1.total_tokens} tokens")
        print(f"✅ Second call: {usage2.total_tokens} tokens")
        print(f"✅ Cache working: {usage1.total_tokens == usage2.total_tokens}")

        # Test 3: Advanced precise counting on a multi-turn conversation
        # mixing system, human, and AI messages.
        print("\n📊 Test 3: Advanced Precise Token Counter Features")
        print("-" * 40)
        complex_messages = [
            SystemMessage(content="You are a helpful AI assistant with access to various tools for data analysis and web search."),
            HumanMessage(content="Can you help me analyze some data?"),
            AIMessage(content="I'd be happy to help you analyze data. What type of data would you like to analyze?"),
            HumanMessage(content="I have sales data from the last quarter that I need to understand better.")
        ]
        print("🎯 Testing with complex conversation history...")
        complex_usage = await precise_counter.count_tokens_precise(complex_messages)
        print("✅ Complex conversation token count:")
        print(f"   • Input tokens: {complex_usage.input_tokens}")
        print(f"   • Output tokens: {complex_usage.output_tokens}")
        print(f"   • Tool tokens: {complex_usage.tool_tokens}")
        print(f"   • Total tokens: {complex_usage.total_tokens}")

        # Test 4: Provider-specific token extraction. A non-zero input count
        # implies real API usage data; zero means the fallback estimator ran.
        print("\n📊 Test 4: Provider-specific Token Extraction")
        print("-" * 40)
        print("🎯 Testing OpenAI token extraction methods...")
        simple_messages = [HumanMessage(content="Hello")]
        openai_usage = await precise_counter.count_tokens_precise(simple_messages)
        print("✅ OpenAI extraction result:")
        print(f"   • Input tokens: {openai_usage.input_tokens}")
        print(f"   • Output tokens: {openai_usage.output_tokens}")
        print(f"   • Method: {'Real API data' if openai_usage.input_tokens > 0 else 'Fallback estimation'}")

        # Test 5: Cache performance — wall-clock timing of a cold call vs.
        # a cached call with the same key.
        print("\n📊 Test 5: Hybrid Counter Cache Performance")
        print("-" * 40)
        import time
        hybrid_counter.clear_cache()
        start_time = time.time()
        usage1 = await hybrid_counter.count_with_caching(test_messages, cache_key="perf_test")
        time1 = time.time() - start_time
        start_time = time.time()
        usage2 = await hybrid_counter.count_with_caching(test_messages, cache_key="perf_test")
        time2 = time.time() - start_time
        print("✅ Cache performance:")
        print(f"   First call (no cache): {time1:.3f}s")
        print(f"   Second call (cached): {time2:.3f}s")
        # Guard against division by zero when the cached call is too fast
        # for time.time() resolution.
        if time2 > 0:
            print(f"   Speed improvement: {time1/time2:.1f}x faster")
        print(f"   Results match: {usage1.total_tokens == usage2.total_tokens}")

        # Test 6: Compare precise counts against local tiktoken estimates
        # for short, medium, and long inputs.
        print("\n📊 Test 6: Estimation vs Precise Counting Accuracy")
        print("-" * 40)
        test_cases = [
            "Short query",
            "This is a medium length query that contains more words and should result in more tokens being counted.",
            "This is a very long query that contains many words and should demonstrate the difference between estimation and precise counting methods. It includes various types of content and should help us understand how accurate our token counting mechanisms are when compared to the real API responses from OpenAI."
        ]
        for i, test_query in enumerate(test_cases, 1):
            test_msg = [HumanMessage(content=test_query)]
            precise = await precise_counter.count_tokens_precise(test_msg)
            try:
                import tiktoken
                encoding = tiktoken.encoding_for_model("gpt-4")
                estimated_tokens = len(encoding.encode(test_query))
                # min/max ratio expresses accuracy symmetrically, whichever
                # of the two counts is larger.
                accuracy = (min(precise.input_tokens, estimated_tokens) /
                            max(precise.input_tokens, estimated_tokens)) * 100
                print(f"   Test case {i}:")
                print(f"     • Precise tokens: {precise.input_tokens}")
                print(f"     • Estimated tokens: {estimated_tokens}")
                print(f"     • Accuracy: {accuracy:.1f}%")
            except ImportError:
                # tiktoken is optional; skip the comparison, not the test.
                print(f"   Test case {i}: tiktoken not available for comparison")

        print("\n🎉 ALL TESTS COMPLETED SUCCESSFULLY!")
        print("=" * 60)
        return True
    except Exception as e:
        # Broad catch is deliberate: this is a top-level test harness that
        # must report any failure and return False rather than crash.
        print(f"❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False
async def main():
    """Run the precise token counting test suite.

    Returns:
        bool: True if all tests passed, False otherwise.
    """
    print("🚀 Starting Precise Token Counting Tests")
    print("=" * 60)
    success = await test_precise_token_counting()
    if success:
        print("\n✅ All tests passed! Precise token counting is working correctly.")
    else:
        print("\n❌ Some tests failed. Check the output above for details.")
    return success


if __name__ == "__main__":
    # Script entry point: drive the async test suite to completion.
    asyncio.run(main())