Skip to content

Commit 6ebffd9

Browse files
Add utilities to detect and replace broken links. (#1366)
* Add utilities to detect and replace broken links. * style: run spotlessApply * Add utilities to detect and replace broken links V2 * Add utilities to detect and replace broken links V2 * Add utilities to detect and replace broken links V2 * Add utilities to detect and replace broken links V2 * Fixed link detection to handle 3xx redirects properly Updated isLinkBroken() to only treat 4xx/5xx status codes as broken. Previously 3xx redirects were incorrectly marked as broken links. Also improved javadoc clarity throughout LinkDetection class * Apply Spotless formatting and regenerate jOOQ sources * commit:Broken links resolve requested changes - Rename replaceDeadLinks to replaceBrokenLinks for consistency - Use Optional instead of null values in stream processing - Add convenience overload for extractLinks with default filters - Update javadocs to be more generic and future-proof - Move implementation details from javadoc to inline comments - Replace 'ignored' lambda params with '_' Resolves the review comments from @Zabuzard * refactor: apply review feedback from @Zabuzard * New fixes and changes * refactor: simplify link filtering with helper method Use streams throughout and extract replacement logic into a separate method for better readability * style(javadoc): remove gap between JavaDoc and method signature Remove a recurring line gap introduced between each method's JavaDoc and its corresponding method signatures to adhere to the overall style of the project. Signed-off-by: Chris Sdogkos <work@chris-sdogkos.com> --------- Signed-off-by: Chris Sdogkos <work@chris-sdogkos.com> Co-authored-by: Chris Sdogkos <work@chris-sdogkos.com>
1 parent 4367c2d commit 6ebffd9

File tree

1 file changed

+204
-15
lines changed

1 file changed

+204
-15
lines changed

application/src/main/java/org/togetherjava/tjbot/features/utils/LinkDetection.java

Lines changed: 204 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,61 @@
44
import com.linkedin.urls.detection.UrlDetector;
55
import com.linkedin.urls.detection.UrlDetectorOptions;
66

7+
import java.net.URI;
8+
import java.net.http.HttpClient;
9+
import java.net.http.HttpRequest;
10+
import java.net.http.HttpResponse;
711
import java.util.List;
12+
import java.util.Objects;
813
import java.util.Optional;
914
import java.util.Set;
15+
import java.util.concurrent.CompletableFuture;
1016

1117
/**
12-
* Utility class to detect links.
18+
* Utility methods for working with links inside arbitrary text.
19+
*
20+
* <p>
21+
* This class can:
22+
* <ul>
23+
* <li>Extract HTTP(S) links from text</li>
24+
* <li>Check whether a link is reachable via HTTP</li>
25+
* <li>Replace broken links asynchronously</li>
26+
* </ul>
27+
*
28+
* <p>
29+
* It is intentionally stateless and uses asynchronous HTTP requests to avoid blocking calling
30+
* threads.
1331
*/
1432
public class LinkDetection {
33+
private static final HttpClient HTTP_CLIENT = HttpClient.newHttpClient();
1534

1635
/**
17-
* Possible ways to filter a link.
36+
* Default filters applied when extracting links from text.
1837
*
19-
* @see LinkDetection
38+
* <p>
39+
* Links to intentionally ignore in order to reduce false positives when scanning chat messages
40+
* or source-code snippets.
41+
*/
42+
private static final Set<LinkFilter> DEFAULT_FILTERS =
43+
Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME);
44+
45+
/**
 * Selects which kinds of detected URLs {@link #extractLinks} leaves out of its result.
 */
public enum LinkFilter {
    /**
     * Drops URLs that are wrapped in angle brackets, such as {@code <https://example.com>}.
     *
     * <p>
     * Chat platforms commonly use this notation to suppress a link on purpose.
     */
    SUPPRESSED,
    /**
     * Drops URLs whose scheme is neither HTTP nor HTTPS.
     *
     * <p>
     * Guards against false positives like {@code ftp://}, {@code file://}, or matches that
     * carry no scheme at all.
     */
    NON_HTTP_SCHEME
}
@@ -34,11 +68,24 @@ private LinkDetection() {
3468
}
3569

3670
/**
37-
* Extracts all links from the given content.
71+
* Extracts links from the given text.
72+
*
73+
* <p>
74+
* The text is scanned using a URL detector, then filtered and normalized according to the
75+
* provided {@link LinkFilter}s.
76+
*
77+
* <p>
78+
* Example:
79+
*
80+
* <pre>{@code
81+
* Set<LinkFilter> filters = Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME);
82+
* extractLinks("Visit https://example.com and <ftp://skip.me>", filters)
83+
* // returns ["https://example.com"]
84+
* }</pre>
3885
*
39-
* @param content the content to search through
40-
* @param filter the filters applied to the urls
41-
* @return a list of all found links, can be empty
86+
* @param content the text to scan for links
87+
* @param filter a set of filters controlling which detected links are returned
88+
* @return a list of extracted links in the order they appear in the text
4289
*/
4390
public static List<String> extractLinks(String content, Set<LinkFilter> filter) {
4491
return new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH).detect()
@@ -49,22 +96,166 @@ public static List<String> extractLinks(String content, Set<LinkFilter> filter)
4996
}
5097

5198
/**
52-
* Checks whether the given content contains a link.
99+
* Extracts links from the given text using default filters.
53100
*
54-
* @param content the content to search through
55-
* @return true if the content contains at least one link
101+
* <p>
102+
* This is a convenience method that uses {@link #DEFAULT_FILTERS}.
103+
*
104+
* @param content the text to scan for links
105+
* @return a list of extracted links in the order they appear in the text
106+
* @see #extractLinks(String, Set)
107+
*/
108+
public static List<String> extractLinks(String content) {
109+
return extractLinks(content, DEFAULT_FILTERS);
110+
}
111+
112+
/**
113+
* Checks whether the given text contains at least one detectable URL.
114+
*
115+
* <p>
116+
* This method performs a lightweight detection only and does not apply any {@link LinkFilter}s.
117+
*
118+
* @param content the text to scan
119+
* @return {@code true} if at least one URL-like pattern is detected
56120
*/
57121
public static boolean containsLink(String content) {
58122
return !(new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH).detect().isEmpty());
59123
}
60124

125+
/**
 * Asynchronously checks whether a URL is considered broken.
 *
 * <p>
 * A link is considered broken if:
 * <ul>
 * <li>The URL is malformed or cannot be turned into an HTTP request</li>
 * <li>The HTTP request fails with an exception</li>
 * <li>The response status code is 4xx (client error) or 5xx (server error)</li>
 * </ul>
 *
 * <p>
 * Successful responses (2xx) and redirects (3xx) are considered valid links. The response body
 * is never inspected.
 *
 * <p>
 * A {@code HEAD} request is attempted first; only if it reports the link as broken is a
 * {@code GET} request sent to confirm. A malformed URL does not make this method throw; the
 * returned future completes with {@code true} instead.
 *
 * @param url the URL to check
 * @return a {@code CompletableFuture} completing with {@code true} if the link is broken,
 *         {@code false} otherwise
 */
public static CompletableFuture<Boolean> isLinkBroken(String url) {
    HttpRequest headRequest;
    HttpRequest fallbackGetRequest;
    try {
        // Parse once; both URI.create and HttpRequest.newBuilder reject bad input eagerly
        URI uri = URI.create(url);
        headRequest =
                HttpRequest.newBuilder(uri).method("HEAD", HttpRequest.BodyPublishers.noBody()).build();
        fallbackGetRequest = HttpRequest.newBuilder(uri).GET().build();
    } catch (IllegalArgumentException _) {
        // Malformed URL or unsupported scheme: report it as broken through the future rather
        // than throwing synchronously out of an asynchronous API
        return CompletableFuture.completedFuture(true);
    }

    // Try HEAD request first (cheap and fast)
    return HTTP_CLIENT.sendAsync(headRequest, HttpResponse.BodyHandlers.discarding())
        // 2xx and 3xx are success, 4xx and 5xx are errors
        .thenApply(response -> response.statusCode() >= 400)
        .exceptionally(_ -> true)
        .thenCompose(headReportsBroken -> {
            if (!Boolean.TRUE.equals(headReportsBroken)) {
                return CompletableFuture.completedFuture(false);
            }
            // If HEAD fails, fall back to GET request (some servers don't support HEAD)
            return HTTP_CLIENT
                .sendAsync(fallbackGetRequest, HttpResponse.BodyHandlers.discarding())
                .thenApply(response -> response.statusCode() >= 400)
                .exceptionally(_ -> true);
        });
}
170+
171+
/**
 * Substitutes every broken link found in the given text with a replacement string.
 *
 * <p>
 * Links are extracted with the default filters and each distinct link is checked exactly once
 * via {@link #isLinkBroken(String)}. When a link turns out to be broken, all of its occurrences
 * in the text are replaced; links that are not reported as broken are left untouched.
 *
 * <p>
 * The calling thread is not blocked: the individual checks run asynchronously and are merged
 * into one {@code CompletableFuture}.
 *
 * <p>
 * Example:
 *
 * <pre>{@code
 * replaceBrokenLinks("""
 *         Test
 *         http://deadlink/1
 *         http://workinglink/1
 *         """, "(broken link)")
 * }</pre>
 *
 * <p>
 * Results in:
 *
 * <pre>{@code
 * Test
 * (broken link)
 * http://workinglink/1
 * }</pre>
 *
 * @param text the input text containing URLs
 * @param replacement the string used to replace broken links
 * @return a {@code CompletableFuture} that completes with the modified text, or the original
 *         text if no broken links were found
 */
public static CompletableFuture<String> replaceBrokenLinks(String text, String replacement) {
    List<String> detectedLinks = extractLinks(text, DEFAULT_FILTERS);
    if (detectedLinks.isEmpty()) {
        return CompletableFuture.completedFuture(text);
    }

    // Which links are broken is only known once the checks finish, so each pending check
    // yields the link itself when it is broken and null when it is fine
    List<CompletableFuture<String>> pendingChecks = detectedLinks.stream()
        .distinct()
        .map(candidate -> isLinkBroken(candidate)
            .thenApply(broken -> Boolean.TRUE.equals(broken) ? candidate : null))
        .toList();

    CompletableFuture<?>[] allChecks = pendingChecks.toArray(CompletableFuture[]::new);
    return CompletableFuture.allOf(allChecks).thenApply(_ -> {
        // Every check has completed at this point, so join() returns immediately
        List<String> brokenLinks = pendingChecks.stream()
            .map(CompletableFuture::join)
            .filter(Objects::nonNull)
            .toList();
        return replaceLinks(brokenLinks, text, replacement);
    });
}
229+
230+
private static String replaceLinks(List<String> linksToReplace, String text,
231+
String replacement) {
232+
String result = text;
233+
for (String link : linksToReplace) {
234+
result = result.replace(link, replacement);
235+
}
236+
return result;
237+
}
238+
239+
/**
240+
* Converts a detected {@link Url} into a normalized link string.
241+
*
242+
* <p>
243+
* Applies the provided {@link LinkFilter}s. Additionally removes trailing punctuation such as
244+
* commas or periods from the detected URL.
245+
*
246+
* @param url the detected URL
247+
* @param filter active link filters to apply
248+
* @return an {@link Optional} containing the normalized link, or {@code Optional.empty()} if
249+
* the link should be filtered out
250+
*/
61251
private static Optional<String> toLink(Url url, Set<LinkFilter> filter) {
62252
String raw = url.getOriginalUrl();
63253
if (filter.contains(LinkFilter.SUPPRESSED) && raw.contains(">")) {
64254
// URL escapes, such as "<http://example.com>" should be skipped
65255
return Optional.empty();
66256
}
67-
// Not interested in other schemes, also to filter out matches without scheme.
257+
// Not interested in other schemes, also to filter out matches without scheme (Skip non-HTTP
258+
// schemes)
68259
// It detects a lot of such false-positives in Java snippets
69260
if (filter.contains(LinkFilter.NON_HTTP_SCHEME) && !raw.startsWith("http")) {
70261
return Optional.empty();
@@ -76,8 +267,6 @@ private static Optional<String> toLink(Url url, Set<LinkFilter> filter) {
76267
// Remove trailing punctuation
77268
link = link.substring(0, link.length() - 1);
78269
}
79-
80270
return Optional.of(link);
81271
}
82-
83272
}

0 commit comments

Comments
 (0)