|
2 | 2 |
|
3 | 3 | import org.apache.commons.lang3.StringUtils; |
4 | 4 | import org.apache.commons.text.similarity.JaccardSimilarity; |
5 | | -import org.apache.commons.text.similarity.LevenshteinDistance; |
6 | 5 |
|
7 | 6 | import java.util.Arrays; |
8 | 7 | import java.util.List; |
|
12 | 11 | import java.util.TreeSet; |
13 | 12 | import java.util.concurrent.ConcurrentHashMap; |
14 | 13 | import java.util.function.UnaryOperator; |
15 | | -import java.util.regex.Matcher; |
16 | 14 | import java.util.regex.Pattern; |
17 | 15 | import java.util.stream.Collectors; |
18 | 16 |
|
@@ -345,136 +343,49 @@ public static List<String> getKeywordsMatching(String response, Set<String> prov |
345 | 343 | } |
346 | 344 |
|
347 | 345 | /** |
348 | | - * Normalize an error message to a structural form: |
349 | | - * - collapse \"...\" content to \"\" |
350 | | - * - replace variable-like tokens with placeholders |
351 | | - * - squash whitespace/noise |
| 346 | + * Detects the casing of a string based on its format. |
352 | 347 | * |
353 | | - * @param s the error message to normalize |
354 | | - * @return the error message normalized |
| 348 | + * @param sample The string to detect the casing of. |
| 349 | + * @return The detected casing as a string. |
355 | 350 | */ |
356 | | - public static String normalizeErrorMessage(String s) { |
357 | | - if (StringUtils.isBlank(s)) { |
358 | | - return ""; |
| 351 | + public static String detectCasingFromString(String sample) { |
| 352 | + if (sample.contains("_") && sample.equals(sample.toUpperCase(Locale.ROOT))) { |
| 353 | + return "UPPER_SNAKE_CASE"; |
| 354 | + } else if (sample.contains("_") && sample.equals(sample.toLowerCase(Locale.ROOT))) { |
| 355 | + return "lower_snake_case"; |
| 356 | + } else if (sample.contains("-")) { |
| 357 | + return "kebab-case"; |
| 358 | + } else if (Character.isLowerCase(sample.charAt(0)) && sample.matches(".*[A-Z].*")) { |
| 359 | + return "camelCase"; |
| 360 | + } else if (Character.isUpperCase(sample.charAt(0)) && sample.matches(".*[a-z].*")) { |
| 361 | + return "PascalCase"; |
| 362 | + } else if (sample.equals(sample.toLowerCase(Locale.ROOT))) { |
| 363 | + return "lowercase"; |
359 | 364 | } |
360 | | - |
361 | | - String r = s; |
362 | | - |
363 | | - // 1) Collapse escaped, inner quoted segments so only the quotes remain. |
364 | | - r = collapseEscapedQuotedSegments(r); |
365 | | - |
366 | | - // 2) Replace common highly-variable substrings with placeholders. |
367 | | - r = TS.matcher(r).replaceAll("TIMESTAMP"); |
368 | | - r = UUID.matcher(r).replaceAll("UUID"); |
369 | | - r = HASH.matcher(r).replaceAll("HASH"); |
370 | | - r = URL.matcher(r).replaceAll("URL"); |
371 | | - r = PATH.matcher(r).replaceAll("PATH"); |
372 | | - r = DIGITS.matcher(r).replaceAll("NUM"); |
373 | | - r = BASE64ISH.matcher(r).replaceAll("TOKEN"); |
374 | | - |
375 | | - // 3) Replace uppercase ID-like tokens (generic, handles DYX/XIIR... cases). |
376 | | - r = replaceUpperTokens(r); |
377 | | - |
378 | | - // 4) Remove zero-width and spacing noise, normalize whitespace. |
379 | | - r = ZCMS.matcher(r).replaceAll(" "); |
380 | | - r = MULTI_SPACE.matcher(r).replaceAll(" ").trim(); |
381 | | - |
382 | | - return r; |
383 | | - } |
384 | | - |
385 | | - // Helper: replace UPPER_TOKEN occurrences with TOKEN unless whitelisted |
386 | | - private static String replaceUpperTokens(String s) { |
387 | | - StringBuilder out = new StringBuilder(s.length()); |
388 | | - Matcher m = UPPER_TOKEN.matcher(s); |
389 | | - while (m.find()) { |
390 | | - String tok = m.group(); |
391 | | - if (UPPER_WHITELIST.contains(tok)) { |
392 | | - m.appendReplacement(out, tok); |
393 | | - } else { |
394 | | - m.appendReplacement(out, "TOKEN"); |
395 | | - } |
396 | | - } |
397 | | - m.appendTail(out); |
398 | | - return out.toString(); |
399 | | - } |
400 | | - |
401 | | - // Helper: collapse occurrences of \" ... \" to \"\" |
402 | | - // (works well for messages like: ... parsing \"👩🏾false\" : invalid syntax) |
403 | | - private static String collapseEscapedQuotedSegments(String s) { |
404 | | - int i = 0; |
405 | | - int n = s.length(); |
406 | | - StringBuilder sb = new StringBuilder(n); |
407 | | - while (i < n) { |
408 | | - int open = s.indexOf("\\\"", i); |
409 | | - if (open < 0) { |
410 | | - sb.append(s, i, n); |
411 | | - break; |
412 | | - } |
413 | | - // copy up to the start of the escaped quote |
414 | | - sb.append(s, i, open); |
415 | | - // write collapsed pair \"\" |
416 | | - sb.append("\\\"\\\""); |
417 | | - // find the next closing escaped quote |
418 | | - int j = open + 2; |
419 | | - int close = s.indexOf("\\\"", j); |
420 | | - if (close < 0) { |
421 | | - // no closing pair; append rest and finish |
422 | | - sb.append(s, j, n); |
423 | | - break; |
424 | | - } |
425 | | - // skip the content and the closing pair |
426 | | - i = close + 2; |
427 | | - } |
428 | | - return sb.toString(); |
| 365 | + return "UPPER_SNAKE_CASE"; // default |
429 | 366 | } |
430 | 367 |
|
431 | 368 | /** |
432 | | - * Main similarity predicate (stable and fast). |
433 | | - * <p> |
434 | | - * - Cheap Jaccard gate on normalized strings. |
435 | | - * - Thresholded Levenshtein (banded) based on what remains necessary. |
| 369 | + * Converts a string to the detected casing convention. |
436 | 370 | * |
437 | | - * @param a the first error message |
438 | | - * @param b the second error message |
439 | | - * @return true if the error messages are similar, false otherwise |
| 371 | + * @param name the string to convert |
| 372 | + * @param casingConvention the casing to convert to |
| 373 | + * @return the converted string |
440 | 374 | */ |
441 | | - public static boolean areErrorsSimilar(String a, String b) { |
442 | | - if (StringUtils.isBlank(a) || StringUtils.isBlank(b)) { |
443 | | - return false; |
444 | | - } |
445 | | - if (a.equals(b)) { |
446 | | - return true; |
447 | | - } |
448 | | - |
449 | | - // Normalize once with caching |
450 | | - final String na = NORMALIZED_CACHE.computeIfAbsent(a, WordUtils::normalizeErrorMessage); |
451 | | - final String nb = NORMALIZED_CACHE.computeIfAbsent(b, WordUtils::normalizeErrorMessage); |
452 | | - |
453 | | - // Fast structural equality |
454 | | - if (na.equals(nb)) { |
455 | | - return true; |
456 | | - } |
457 | | - |
458 | | - // Cheap token similarity gate |
459 | | - final double token = JS.apply(na, nb); |
460 | | - if (token < JACCARD_THRESHOLD) { |
461 | | - return false; |
462 | | - } |
463 | | - |
464 | | - // Compute minimal LD similarity still needed to reach combined threshold. |
465 | | - // combined = (ldSim + token) / 2 >= COMBINED_THRESHOLD |
466 | | - final double minLdSim = Math.max(0.0, 2 * COMBINED_THRESHOLD - token); |
467 | | - |
468 | | - // Convert to an edit-distance bound over the normalized strings: |
469 | | - final int maxLen = Math.max(na.length(), nb.length()); |
470 | | - final int maxEdits = (int) Math.ceil(maxLen * (1.0 - minLdSim)); |
471 | | - |
472 | | - final Integer dist = new LevenshteinDistance(maxEdits).apply(na, nb); |
473 | | - if (dist < 0) { |
474 | | - return false; // exceeded bound |
475 | | - } |
476 | | - final double ldSim = 1.0 - (dist.doubleValue() / maxLen); |
477 | | - |
478 | | - return (ldSim + token) / 2.0 >= COMBINED_THRESHOLD; |
| 375 | + public static String convertToDetectedCasing(String name, String casingConvention) { |
| 376 | + return switch (casingConvention) { |
| 377 | + case "lower_snake_case" -> name.replaceAll("([a-z])([A-Z])", "$1_$2") |
| 378 | + .replaceAll("([A-Z])([A-Z][a-z])", "$1_$2") |
| 379 | + .toLowerCase(Locale.ROOT); |
| 380 | + case "kebab-case" -> name.replaceAll("([a-z])([A-Z])", "$1-$2") |
| 381 | + .replaceAll("([A-Z])([A-Z][a-z])", "$1-$2") |
| 382 | + .toLowerCase(Locale.ROOT); |
| 383 | + case "camelCase" -> Character.toLowerCase(name.charAt(0)) + name.substring(1); |
| 384 | + case "PascalCase" -> name; |
| 385 | + case "lowercase" -> name.toLowerCase(Locale.ROOT); |
| 386 | + default -> name.replaceAll("([a-z])([A-Z])", "$1_$2") |
| 387 | + .replaceAll("([A-Z])([A-Z][a-z])", "$1_$2") |
| 388 | + .toUpperCase(Locale.ROOT); |
| 389 | + }; |
479 | 390 | } |
480 | 391 | } |
0 commit comments