mcp_server_code_extractor/test_semantic_search.py at master · ctoth/mcp_server_code_extractor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
#!/usr/bin/env python3
"""
Quick test script for semantic search functionality.
Run with: python test_semantic_search.py
"""

import sys
import os
sys.path.insert(0, os.path.dirname(__file__))

from code_extractor.search_engine import SearchEngine
from code_extractor.models import SearchParameters

def test_function_calls():
    """Run three single-file function-call searches and print the matches.

    Searches ``code_extractor/server.py`` for calls to ``get_file_content``,
    ``requests.get`` (the script expects zero hits) and ``get_symbols``.
    The path is relative, so results depend on the current working
    directory. Nothing is asserted; the output is meant to be read by eye.
    """
    # Single source of truth for the file under test (used as scope and as path).
    target_file = "code_extractor/server.py"

    print("Testing semantic search functionality...")
    print("=" * 50)

    # Test searching for get_file_content calls in our server.py
    print("\n1. Testing search for 'get_file_content' calls in server.py (single file):")
    params = SearchParameters(
        search_type="function-calls",
        target="get_file_content",
        scope=target_file
    )

    engine = SearchEngine()
    results = engine.search_file(target_file, params)

    print(f"Found {len(results)} results for 'get_file_content' calls:")
    for i, result in enumerate(results, 1):
        print(f"  {i}. Line {result.start_line}: {result.match_text}")
        if result.context_before:
            # The guard above guarantees a non-empty context, so the last
            # entry can be read directly without a fallback value.
            print(f"     Context before: {result.context_before[-1]}")

    # Test searching for requests calls (should find none in our codebase)
    print("\n2. Testing search for 'requests.get' calls (should find 0):")
    params.target = "requests.get"  # reuse the same parameters, new target
    results = engine.search_file(target_file, params)
    print(f"Found {len(results)} results for 'requests.get' calls (expected: 0)")

    # Test with a Python file that has function calls
    print("\n3. Testing search for 'get_symbols' calls in server.py:")
    params.target = "get_symbols"
    results = engine.search_file(target_file, params)
    print(f"Found {len(results)} results for 'get_symbols' calls:")
    for i, result in enumerate(results, 1):
        print(f"  {i}. Line {result.start_line}: {result.match_text}")

def test_directory_search():
    """Exercise ``SearchEngine.search_directory`` on the ``code_extractor`` directory.

    Four searches are run and their hits printed as ``path:line - text``:
    a plain ``*.py`` search, a second target reusing the same parameters,
    a search limited to ``search_engine.py``, and a search using
    exclusion patterns.
    """

    def show(matches):
        # Shared report format for every scenario below.
        for position, match in enumerate(matches, 1):
            print(f"  {position}. {match.file_path}:{match.start_line} - {match.match_text}")

    rule = "=" * 50
    search_root = "code_extractor"

    print("\n" + rule)
    print("Testing DIRECTORY SEARCH functionality...")
    print(rule)

    engine = SearchEngine()

    # Scenario 1: every Python file under the directory.
    print("\n1. Testing directory search for 'get_file_content' calls in code_extractor/:")
    params = SearchParameters(
        search_type="function-calls",
        target="get_file_content",
        scope="code_extractor",
        file_patterns=["*.py"],
        max_results=50,
    )
    found = engine.search_directory(search_root, params)
    print(f"Found {len(found)} results for 'get_file_content' calls across directory:")
    show(found)

    # Scenario 2: same parameters, different target.
    print("\n2. Testing directory search for 'SearchEngine' usage:")
    params.target = "SearchEngine"
    found = engine.search_directory(search_root, params)
    print(f"Found {len(found)} results for 'SearchEngine' usage:")
    show(found)

    # Scenario 3: restrict the file pattern to a single file name.
    print("\n3. Testing directory search with pattern filtering (only search_engine.py):")
    params = SearchParameters(
        search_type="function-calls",
        target="get_language_for_file",
        scope="code_extractor",
        file_patterns=["search_engine.py"],
        max_results=10,
    )
    found = engine.search_directory(search_root, params)
    print(f"Found {len(found)} results for 'get_language_for_file' in search_engine.py only:")
    show(found)

    # Scenario 4: exclusion patterns for caches, bytecode and test files.
    print("\n4. Testing exclusion patterns (exclude __pycache__ and .pyc files):")
    params = SearchParameters(
        search_type="function-calls",
        target="print",
        scope="code_extractor",
        file_patterns=["*.py"],
        exclude_patterns=["__pycache__/*", "*.pyc", "test_*"],
        max_results=20,
    )
    found = engine.search_directory(search_root, params)
    print(f"Found {len(found)} results for 'print' calls (excluding test files):")
    show(found)

def test_language_detection():
    """Print what ``get_language_for_file`` returns for a few project files.

    Three Python modules and ``README.md`` are looked up under the
    ``code_extractor/`` prefix; each result is printed next to the bare
    file name.
    """
    rule = "=" * 30

    print("\n" + rule)
    print("Testing language detection:")
    print(rule)

    # Imported here, after the banner, rather than at module level.
    from code_extractor.languages import get_language_for_file

    sample_names = ("server.py", "models.py", "search_engine.py", "README.md")
    for name in sample_names:
        detected = get_language_for_file(f"code_extractor/{name}")
        print(f"  {name}: {detected}")

def test_performance_benchmarks():
    """Time directory searches over 100 generated Python modules.

    Writes 100 small modules (each containing two ``get_file_content``
    calls) into a temporary directory, times a function-call search across
    them, and then times three further search targets. All figures are
    printed; nothing is asserted. The temporary directory is removed when
    the ``with`` block exits.
    """

    print("\n" + "=" * 50)
    print("Testing PERFORMANCE BENCHMARKS...")
    print("=" * 50)

    import time
    import tempfile
    from pathlib import Path

    engine = SearchEngine()

    # Create temporary directory with many files
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir)

        print("\n1. Creating test directory with 100 Python files...")

        # Create 100 files with function calls. Doubled braces in the
        # template are literal braces in the generated source.
        for i in range(100):
            file_content = f"""
def function_{i}():
    '''Function {i} for testing.'''
    get_file_content('data_{i}.json')
    process_data({{'{i}': 'value_{i}'}})
    print(f'Processing item {{i}}')
    return True

class TestClass_{i}:
    def method_{i}(self):
        get_file_content('config_{i}.json')
        return self.value
"""
            (tmp_path / f"module_{i:03d}.py").write_text(file_content)

        print(f"Created 100 files in {tmp_dir}")

        # Test directory search performance
        print("\n2. Testing directory search performance...")

        params = SearchParameters(
            search_type="function-calls",
            target="get_file_content",
            scope=str(tmp_path),
            file_patterns=["*.py"],
            max_results=500
        )

        # perf_counter is monotonic and high-resolution, unlike the wall
        # clock, so short intervals are measured reliably.
        start_time = time.perf_counter()
        results = engine.search_directory(str(tmp_path), params)
        end_time = time.perf_counter()

        duration = end_time - start_time
        print(f"  Search completed in {duration:.2f} seconds")
        print(f"  Found {len(results)} results across 100 files")
        if duration > 0:
            print(f"  Performance: {len(results)/duration:.1f} results/second")
        else:
            # Avoid ZeroDivisionError when the search finishes within one
            # timer tick.
            print("  Performance: n/a (elapsed time below timer resolution)")

        # Verify results quality
        file_count = len(set(r.file_path for r in results))
        print(f"  Results span {file_count} different files")

        # Test with different search targets
        print("\n3. Testing multiple search patterns...")

        search_targets = ["print", "process_data", "return"]
        for target in search_targets:
            params.target = target
            start_time = time.perf_counter()
            results = engine.search_directory(str(tmp_path), params)
            end_time = time.perf_counter()

            print(f"  '{target}': {len(results)} results in {end_time - start_time:.2f}s")

def test_memory_usage():
    """Report resident-memory growth caused by a search with many matches.

    Generates 20 files of 100 ``get_data`` calls each in a temporary
    directory and searches them. When ``psutil`` can be imported, the
    process RSS before and after the search is printed in MB; otherwise
    only the result count is printed.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Testing MEMORY USAGE...")
    print(rule)

    import tempfile
    import os
    from pathlib import Path

    # psutil is optional; None marks it as unavailable.
    try:
        import psutil
    except ImportError:
        psutil = None
        print("  psutil not available, skipping detailed memory analysis")

    engine = SearchEngine()

    process = None
    if psutil is not None:
        # Baseline RSS of this process, converted from bytes to MB.
        process = psutil.Process(os.getpid())
        initial_memory = process.memory_info().rss / 1024 / 1024
        print(f"\n1. Initial memory usage: {initial_memory:.1f} MB")

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir)

        print("\n2. Creating files with high match density...")
        for i in range(20):
            # One function header followed by 100 call lines.
            call_lines = [f"    get_data('item_{i}_{j}')" for j in range(100)]
            source = f"def process_file_{i}():\n" + "\n".join(call_lines)
            (tmp_path / f"dense_{i:02d}.py").write_text(source)

        print("Created 20 files with ~2000 total function calls")

        params = SearchParameters(
            search_type="function-calls",
            target="get_data",
            scope=str(tmp_path),
            max_results=2000,
        )

        if process is not None:
            before_search = process.memory_info().rss / 1024 / 1024

        results = engine.search_directory(str(tmp_path), params)

        if process is None:
            print(f"\n3. Search completed with {len(results)} results")
        else:
            after_search = process.memory_info().rss / 1024 / 1024
            memory_increase = after_search - before_search

            print("\n3. Memory usage after search:")
            print(f"  Before search: {before_search:.1f} MB")
            print(f"  After search: {after_search:.1f} MB")
            print(f"  Memory increase: {memory_increase:.1f} MB")
            print(f"  Results found: {len(results)}")
            if results:
                print(f"  Memory per result: {memory_increase * 1024 / len(results):.1f} KB")
            else:
                # No results means no per-result figure can be computed.
                print("N/A")

def test_error_resilience():
    """Search a directory containing awkward files and report what happens.

    The temporary directory holds two small valid Python files, a binary
    file, an empty ``.py`` file, a 10,000-line file and a file with
    non-ASCII text. A ``print`` function-call search is run with the file
    pattern ``*``; the number of hits from the binary file and from the
    valid/unicode files is printed. Any exception from the search is
    caught and printed rather than propagated, so the script keeps going.
    """

    print("\n" + "=" * 50)
    print("Testing ERROR RESILIENCE...")
    print("=" * 50)

    import tempfile
    from pathlib import Path

    engine = SearchEngine()

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir)

        # Create mix of valid and problematic files
        print("\n1. Creating mixed file types...")

        # Valid Python files
        (tmp_path / "valid1.py").write_text("print('valid')", encoding="utf-8")
        (tmp_path / "valid2.py").write_text("get_data('test')", encoding="utf-8")

        # Binary file
        (tmp_path / "binary.bin").write_bytes(b'\x00\x01\x02\x03Binary data\x00')

        # Empty file
        (tmp_path / "empty.py").touch()

        # Very large file (if memory allows)
        try:
            large_content = "print('line')\n" * 10000
            (tmp_path / "large.py").write_text(large_content, encoding="utf-8")
            print("  Created large file with 10,000 lines")
        except MemoryError:
            print("  Skipped large file creation (memory limited)")

        # File with unicode. The encoding must be explicit: without it
        # write_text uses the locale's default encoding, which cannot encode
        # these characters on some systems (e.g. cp1252) and would raise
        # UnicodeEncodeError before the search is ever exercised.
        (tmp_path / "unicode.py").write_text("print('Hello 世界 🌍')", encoding="utf-8")

        print("  Created mix of valid, binary, empty, and unicode files")

        # Test search resilience
        print("\n2. Testing search across problematic files...")

        params = SearchParameters(
            search_type="function-calls",
            target="print",
            scope=str(tmp_path),
            file_patterns=["*"]  # Include all files to test filtering
        )

        try:
            results = engine.search_directory(str(tmp_path), params)
            print(f"  Search completed successfully with {len(results)} results")

            # Check that binary files were excluded
            binary_results = [r for r in results if "binary" in r.file_path]
            print(f"  Binary file results (should be 0): {len(binary_results)}")

            # Check that valid files were processed
            valid_results = [r for r in results if "valid" in r.file_path or "unicode" in r.file_path]
            print(f"  Valid file results: {len(valid_results)}")

        except Exception as e:
            # Deliberately broad: the point of this check is to report any
            # failure mode of the search instead of aborting the script.
            print(f"  ERROR: Search failed with {e}")

def test_real_world_scenarios():
    """Run a series of directory searches against the project's own code.

    Every query uses the ``function-calls`` search type on the
    ``code_extractor`` directory. Targets are ``Exception``, ``from ``,
    ``class ``, ``SearchEngine`` and ``def ``; counts are printed, along
    with the file names containing ``SearchEngine`` hits and the set of
    languages reported for the final query.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Testing REAL-WORLD SCENARIOS...")
    print(rule)

    engine = SearchEngine()

    def run(query):
        # All scenarios search the same directory.
        return engine.search_directory("code_extractor", query)

    # Scenario 1: error handling.
    print("\n1. Searching for error handling patterns in actual codebase...")
    params = SearchParameters(
        search_type="function-calls",
        target="Exception",
        scope="code_extractor",
        file_patterns=["*.py"],
        exclude_patterns=["test_*", "__pycache__/*"],
    )
    matches = run(params)
    print(f"  Found {len(matches)} Exception-related patterns")

    # Scenario 2: imports (same parameters, new target).
    print("\n2. Searching for import patterns...")
    params.target = "from "
    matches = run(params)
    print(f"  Found {len(matches)} import statements")

    # Scenario 3: classes.
    print("\n3. Searching for class patterns...")
    params.target = "class "
    matches = run(params)
    print(f"  Found {len(matches)} class-related patterns")

    # Scenario 4: which files mention SearchEngine.
    print("\n4. Analyzing cross-file dependencies...")
    params.target = "SearchEngine"
    matches = run(params)
    usage_files = {os.path.basename(match.file_path) for match in matches}
    print(f"  SearchEngine used in {len(usage_files)} files: {', '.join(sorted(usage_files))}")

    # Scenario 5: fresh parameters limited to Python files and 20 results.
    print("\n5. Testing language-specific searches...")
    params = SearchParameters(
        search_type="function-calls",
        target="def ",
        scope="code_extractor",
        file_patterns=["*.py"],
        max_results=20,
    )
    matches = run(params)
    print(f"  Python function definitions: {len(matches)}")

    # Collect the distinct languages reported on the results.
    languages = set(match.language for match in matches)
    print(f"  Languages detected: {languages}")

if __name__ == "__main__":
    # Run every scenario in a fixed order; each one prints its own report.
    for scenario in (
        test_function_calls,
        test_directory_search,
        test_language_detection,
        test_performance_benchmarks,
        test_memory_usage,
        test_error_resilience,
        test_real_world_scenarios,
    ):
        scenario()

    print("\n" + "=" * 50)
    print("All comprehensive semantic search tests completed!")