Skip to content

Commit 42e8b5b

Browse files
authored
Merge pull request #150 from rostilos/1.5.2-rc
feat: Implement RAG pipeline with new API endpoints, services for duplication and semantic search, and updated core components.
2 parents eaee542 + 465b54f commit 42e8b5b

27 files changed

Lines changed: 3104 additions & 2683 deletions

File tree

java-ecosystem/libs/analysis-engine/src/main/java/org/rostilos/codecrow/analysisengine/service/PrFileEnrichmentService.java

Lines changed: 150 additions & 151 deletions
Large diffs are not rendered by default.

java-ecosystem/libs/rag-engine/src/main/java/org/rostilos/codecrow/ragengine/service/IncrementalRagUpdateService.java

Lines changed: 75 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,17 @@ public class IncrementalRagUpdateService {
2424
private final VcsClientProvider vcsClientProvider;
2525
private final RagPipelineClient ragPipelineClient;
2626
private final RagIndexTrackingService ragIndexTrackingService;
27-
27+
2828
@Value("${codecrow.rag.api.enabled:true}")
2929
private boolean ragApiEnabled;
30-
30+
3131
@Value("${codecrow.rag.parallel.requests:10}")
3232
private int parallelRequests;
3333

3434
public IncrementalRagUpdateService(
3535
VcsClientProvider vcsClientProvider,
3636
RagPipelineClient ragPipelineClient,
37-
RagIndexTrackingService ragIndexTrackingService
38-
) {
37+
RagIndexTrackingService ragIndexTrackingService) {
3938
this.vcsClientProvider = vcsClientProvider;
4039
this.ragPipelineClient = ragPipelineClient;
4140
this.ragIndexTrackingService = ragIndexTrackingService;
@@ -52,12 +51,12 @@ public boolean shouldPerformIncrementalUpdate(Project project) {
5251
log.info("shouldPerformIncrementalUpdate: config is null for project={}", project.getId());
5352
return false;
5453
}
55-
54+
5655
if (config.ragConfig() == null) {
5756
log.info("shouldPerformIncrementalUpdate: ragConfig is null for project={}", project.getId());
5857
return false;
5958
}
60-
59+
6160
if (!config.ragConfig().enabled()) {
6261
log.info("shouldPerformIncrementalUpdate: ragConfig.enabled=false for project={}", project.getId());
6362
return false;
@@ -76,8 +75,7 @@ public Map<String, Object> performIncrementalUpdate(
7675
String branch,
7776
String commitHash,
7877
Set<String> addedOrModifiedFiles,
79-
Set<String> deletedFiles
80-
) throws IOException {
78+
Set<String> deletedFiles) throws IOException {
8179
log.info("Starting incremental RAG update for project {} branch {}: {} files to update, {} to delete",
8280
project.getName(), branch, addedOrModifiedFiles.size(), deletedFiles.size());
8381

@@ -93,8 +91,7 @@ public Map<String, Object> performIncrementalUpdate(
9391
new ArrayList<>(deletedFiles),
9492
projectWorkspace,
9593
projectNamespace,
96-
branch
97-
);
94+
branch);
9895
result.put("deletedFiles", deletedFiles.size());
9996
log.info("Deleted {} files from RAG index", deletedFiles.size());
10097
}
@@ -108,22 +105,20 @@ public Map<String, Object> performIncrementalUpdate(
108105
repoSlug,
109106
branch,
110107
addedOrModifiedFiles,
111-
tempDir
112-
);
108+
tempDir);
113109

114110
Map<String, Object> updateResult = ragPipelineClient.updateFiles(
115111
new ArrayList<>(addedOrModifiedFiles),
116112
tempDir.toString(),
117113
projectWorkspace,
118114
projectNamespace,
119115
branch,
120-
commitHash
121-
);
122-
116+
commitHash);
117+
123118
result.put("updatedFiles", fetchedFiles);
124119
result.putAll(updateResult);
125120
log.info("Updated {} files in RAG index", fetchedFiles);
126-
121+
127122
} finally {
128123
deleteDirectory(tempDir.toFile());
129124
}
@@ -139,51 +134,55 @@ private int fetchFilesToTempDir(
139134
String repoSlug,
140135
String branch,
141136
Set<String> filePaths,
142-
Path tempDir
143-
) throws IOException {
137+
Path tempDir) throws IOException {
144138
VcsClient vcsClient = vcsClientProvider.getClient(vcsConnection);
145-
139+
146140
ExecutorService executor = Executors.newFixedThreadPool(Math.min(parallelRequests, filePaths.size()));
147-
List<CompletableFuture<Boolean>> futures = new ArrayList<>();
141+
try {
142+
List<CompletableFuture<Boolean>> futures = new ArrayList<>();
143+
144+
for (String filePath : filePaths) {
145+
CompletableFuture<Boolean> future = CompletableFuture.supplyAsync(() -> {
146+
try {
147+
String content = vcsClient.getFileContent(workspaceSlug, repoSlug, filePath, branch);
148+
if (content != null) {
149+
Path targetPath = tempDir.resolve(filePath);
150+
Files.createDirectories(targetPath.getParent());
151+
Files.writeString(targetPath, content);
152+
return true;
153+
}
154+
return false;
155+
} catch (IOException e) {
156+
log.warn("Failed to fetch file {}: {}", filePath, e.getMessage());
157+
return false;
158+
}
159+
}, executor);
160+
futures.add(future);
161+
}
148162

149-
for (String filePath : filePaths) {
150-
CompletableFuture<Boolean> future = CompletableFuture.supplyAsync(() -> {
163+
int successCount = 0;
164+
for (CompletableFuture<Boolean> future : futures) {
151165
try {
152-
String content = vcsClient.getFileContent(workspaceSlug, repoSlug, filePath, branch);
153-
if (content != null) {
154-
Path targetPath = tempDir.resolve(filePath);
155-
Files.createDirectories(targetPath.getParent());
156-
Files.writeString(targetPath, content);
157-
return true;
166+
if (future.get(30, TimeUnit.SECONDS)) {
167+
successCount++;
158168
}
159-
return false;
160-
} catch (IOException e) {
161-
log.warn("Failed to fetch file {}: {}", filePath, e.getMessage());
162-
return false;
169+
} catch (Exception e) {
170+
log.warn("File fetch task failed: {}", e.getMessage());
163171
}
164-
}, executor);
165-
futures.add(future);
166-
}
172+
}
167173

168-
int successCount = 0;
169-
for (CompletableFuture<Boolean> future : futures) {
174+
return successCount;
175+
} finally {
176+
executor.shutdownNow();
170177
try {
171-
if (future.get(30, TimeUnit.SECONDS)) {
172-
successCount++;
178+
if (!executor.awaitTermination(10, TimeUnit.SECONDS)) {
179+
log.warn("Some file fetch threads did not terminate within timeout");
173180
}
174-
} catch (Exception e) {
175-
log.warn("File fetch task failed: {}", e.getMessage());
181+
} catch (InterruptedException e) {
182+
Thread.currentThread().interrupt();
183+
log.warn("Interrupted while awaiting executor termination");
176184
}
177185
}
178-
179-
executor.shutdown();
180-
try {
181-
executor.awaitTermination(60, TimeUnit.SECONDS);
182-
} catch (InterruptedException e) {
183-
Thread.currentThread().interrupt();
184-
}
185-
186-
return successCount;
187186
}
188187

189188
public DiffResult parseDiffForRag(String rawDiff) {
@@ -198,17 +197,18 @@ public DiffResult parseDiffForRag(String rawDiff) {
198197
String currentFile = null;
199198
boolean isDelete = false;
200199
boolean fileProcessed = false;
200+
// Track rename operations: old path -> delete, new path -> add
201+
String renameFrom = null;
201202

202203
for (String line : lines) {
203204
if (line.startsWith("diff --git")) {
204205
// Process previous file if we haven't categorized it yet
205206
if (currentFile != null && !fileProcessed) {
206-
// Default to modified if we haven't seen explicit delete marker
207207
if (!isDelete) {
208208
addedOrModified.add(currentFile);
209209
}
210210
}
211-
211+
212212
// Parse new file
213213
String[] parts = line.split("\\s+");
214214
if (parts.length >= 4) {
@@ -219,6 +219,7 @@ public DiffResult parseDiffForRag(String rawDiff) {
219219
}
220220
isDelete = false;
221221
fileProcessed = false;
222+
renameFrom = null;
222223
} else if (line.startsWith("deleted file mode")) {
223224
isDelete = true;
224225
if (currentFile != null) {
@@ -230,26 +231,42 @@ public DiffResult parseDiffForRag(String rawDiff) {
230231
addedOrModified.add(currentFile);
231232
fileProcessed = true;
232233
}
234+
} else if (line.startsWith("rename from ") || line.startsWith("copy from ")) {
235+
// Git rename/copy: "rename from old/path.java"
236+
// The old path should be deleted from the index
237+
renameFrom = line.substring(line.indexOf(' ', line.indexOf(' ') + 1) + 1).trim();
238+
} else if (line.startsWith("rename to ") || line.startsWith("copy to ")) {
239+
// Git rename/copy: "rename to new/path.java"
240+
// The new path should be added/indexed
241+
String renameTo = line.substring(line.indexOf(' ', line.indexOf(' ') + 1) + 1).trim();
242+
if (renameFrom != null && !renameFrom.isEmpty()) {
243+
deleted.add(renameFrom);
244+
}
245+
if (!renameTo.isEmpty()) {
246+
addedOrModified.add(renameTo);
247+
}
248+
fileProcessed = true;
249+
renameFrom = null;
233250
}
234251
}
235-
252+
236253
// Don't forget to process the last file
237254
if (currentFile != null && !fileProcessed) {
238255
if (!isDelete) {
239256
addedOrModified.add(currentFile);
240257
}
241258
}
242-
243-
log.info("Parsed diff: {} added/modified files, {} deleted files",
259+
260+
log.info("Parsed diff: {} added/modified files, {} deleted files",
244261
addedOrModified.size(), deleted.size());
245-
262+
246263
return new DiffResult(addedOrModified, deleted);
247264
}
248265

249266
public record DiffResult(
250267
Set<String> addedOrModified,
251-
Set<String> deleted
252-
) {}
268+
Set<String> deleted) {
269+
}
253270

254271
private void deleteDirectory(java.io.File dir) {
255272
if (dir.exists()) {

python-ecosystem/inference-orchestrator/utils/context_builder.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ def smart_chunk(
332332
imports_section = '\n'.join(imports_lines) + '\n\n' if imports_lines else ""
333333

334334
# Find all logical boundaries
335-
boundaries = cls._find_boundaries(lines)
335+
boundaries = cls._find_boundaries(lines, file_path)
336336

337337
if not boundaries:
338338
# No boundaries found, fall back to line-based chunking
@@ -368,12 +368,14 @@ def smart_chunk(
368368
return chunks if chunks else [content[:max_chars]]
369369

370370
@classmethod
371-
def _find_boundaries(cls, lines: List[str]) -> List[tuple]:
372-
"""Find logical boundaries in code."""
371+
def _find_boundaries(cls, lines: List[str], file_path: str = "") -> List[tuple]:
372+
"""Find logical boundaries in code using language-aware patterns."""
373+
language = cls._detect_language(file_path)
374+
patterns = cls._get_patterns_for_language(language)
373375
boundaries = []
374376

375377
for i, line in enumerate(lines):
376-
for pattern, boundary_type in cls.BOUNDARY_PATTERNS.items():
378+
for pattern, boundary_type in patterns:
377379
if re.match(pattern, line):
378380
boundaries.append((i, boundary_type))
379381
break
@@ -490,7 +492,7 @@ def _make_key(
490492
# Sort files for consistent hashing
491493
sorted_files = sorted(changed_files) if changed_files else []
492494

493-
key_data = f"{workspace}:{project}:{branch}:{','.join(sorted_files)}:{pr_title}:{pr_description[:100]}"
495+
key_data = f"{workspace}:{project}:{branch}:{','.join(sorted_files)}:{pr_title}:{hashlib.md5(pr_description.encode()).hexdigest()}"
494496
return hashlib.md5(key_data.encode()).hexdigest()
495497

496498
def get(

python-ecosystem/inference-orchestrator/utils/file_classifier.py

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -38,57 +38,60 @@ class FileClassifier:
3838
- RAG: 10% of context budget - Additional relevant files
3939
"""
4040

41+
# All source code extensions we support (language-agnostic)
42+
_SRC = r'java|py|pyi|ts|tsx|js|jsx|go|rs|rb|php|phtml|kt|kts|cs|cpp|cc|c|h|hpp|swift|scala|m|mm|lua|pl|groovy'
43+
4144
# Patterns for HIGH priority files (core business logic)
4245
HIGH_PRIORITY_PATTERNS = [
4346
# Service/Business logic
44-
r'.*/service[s]?/.*\.(java|py|ts|js)$',
45-
r'.*/controller[s]?/.*\.(java|py|ts|js)$',
46-
r'.*/handler[s]?/.*\.(java|py|ts|js)$',
47-
r'.*/api/.*\.(java|py|ts|js)$',
48-
r'.*/core/.*\.(java|py|ts|js)$',
49-
r'.*/domain/.*\.(java|py|ts|js)$',
50-
r'.*/business/.*\.(java|py|ts|js)$',
47+
rf'.*/service[s]?/.*\.({_SRC})$',
48+
rf'.*/controller[s]?/.*\.({_SRC})$',
49+
rf'.*/handler[s]?/.*\.({_SRC})$',
50+
rf'.*/api/.*\.({_SRC})$',
51+
rf'.*/core/.*\.({_SRC})$',
52+
rf'.*/domain/.*\.({_SRC})$',
53+
rf'.*/business/.*\.({_SRC})$',
5154
# Security-critical
52-
r'.*[Aa]uth.*\.(java|py|ts|js)$',
53-
r'.*[Ss]ecurity.*\.(java|py|ts|js)$',
54-
r'.*[Pp]ermission.*\.(java|py|ts|js)$',
55-
r'.*[Aa]ccess.*[Cc]ontrol.*\.(java|py|ts|js)$',
55+
rf'.*[Aa]uth.*\.({_SRC})$',
56+
rf'.*[Ss]ecurity.*\.({_SRC})$',
57+
rf'.*[Pp]ermission.*\.({_SRC})$',
58+
rf'.*[Aa]ccess.*[Cc]ontrol.*\.({_SRC})$',
5659
# Database/Repository
57-
r'.*/repository/.*\.(java|py|ts|js)$',
58-
r'.*/dao/.*\.(java|py|ts|js)$',
59-
r'.*[Mm]igration.*\.(sql|py|java)$',
60+
rf'.*/repository/.*\.({_SRC})$',
61+
rf'.*/dao/.*\.({_SRC})$',
62+
rf'.*[Mm]igration.*\.(sql|{_SRC})$',
6063
]
6164

6265
# Patterns for MEDIUM priority files
6366
MEDIUM_PRIORITY_PATTERNS = [
6467
# Models/Entities
65-
r'.*/model[s]?/.*\.(java|py|ts|js)$',
66-
r'.*/entity/.*\.(java|py|ts|js)$',
67-
r'.*/dto/.*\.(java|py|ts|js)$',
68-
r'.*/schema/.*\.(java|py|ts|js)$',
68+
rf'.*/model[s]?/.*\.({_SRC})$',
69+
rf'.*/entity/.*\.({_SRC})$',
70+
rf'.*/dto/.*\.({_SRC})$',
71+
rf'.*/schema/.*\.({_SRC})$',
6972
# Utils/Helpers
70-
r'.*/util[s]?/.*\.(java|py|ts|js)$',
71-
r'.*/helper[s]?/.*\.(java|py|ts|js)$',
72-
r'.*/common/.*\.(java|py|ts|js)$',
73-
r'.*/shared/.*\.(java|py|ts|js)$',
73+
rf'.*/util[s]?/.*\.({_SRC})$',
74+
rf'.*/helper[s]?/.*\.({_SRC})$',
75+
rf'.*/common/.*\.({_SRC})$',
76+
rf'.*/shared/.*\.({_SRC})$',
7477
# Components (Frontend)
7578
r'.*/components?/.*\.(tsx?|jsx?)$',
7679
r'.*/hooks?/.*\.(tsx?|jsx?)$',
7780
# Client/Integration
78-
r'.*/client[s]?/.*\.(java|py|ts|js)$',
79-
r'.*/integration/.*\.(java|py|ts|js)$',
81+
rf'.*/client[s]?/.*\.({_SRC})$',
82+
rf'.*/integration/.*\.({_SRC})$',
8083
]
8184

8285
# Patterns for LOW priority files
8386
LOW_PRIORITY_PATTERNS = [
8487
# Tests
85-
r'.*[Tt]est.*\.(java|py|ts|js)$',
88+
rf'.*[Tt]est.*\.({_SRC})$',
8689
r'.*[Ss]pec.*\.(ts|js)$',
8790
r'.*/test[s]?/.*',
8891
r'.*/__tests__/.*',
8992
# Configs
9093
r'.*\.(json|yaml|yml|toml|ini|cfg)$',
91-
r'.*[Cc]onfig.*\.(java|py|ts|js)$',
94+
rf'.*[Cc]onfig.*\.({_SRC})$',
9295
r'.*/config/.*',
9396
# Documentation
9497
r'.*\.(md|txt|rst)$',

0 commit comments

Comments
 (0)