Skip to content

Commit 9414ead

Browse files
committed
Fixed link detection to handle 3xx redirects properly
Updated isLinkBroken() to only treat 4xx/5xx status codes as broken. Previously 3xx redirects were incorrectly marked as broken links also improved javadoc clarity throughout LinkDetection class
1 parent d556a33 commit 9414ead

File tree

1 file changed

+146
-77
lines changed

1 file changed

+146
-77
lines changed

application/src/main/java/org/togetherjava/tjbot/features/utils/LinkDetection.java

Lines changed: 146 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -15,26 +15,54 @@
1515
import java.util.concurrent.CompletableFuture;
1616

1717
/**
18-
* Utility class to detect links.
18+
* Utility methods for working with links inside arbitrary text.
19+
*
20+
* <p>This class can:
21+
* <ul>
22+
* <li>Extract HTTP(S) links from text</li>
23+
* <li>Check whether a link is reachable via HTTP</li>
24+
* <li>Replace broken links asynchronously</li>
25+
* </ul>
26+
*
27+
* <p>It is intentionally stateless and uses asynchronous HTTP requests
28+
* to avoid blocking calling threads.
1929
*/
30+
2031
public class LinkDetection {
2132
private static final HttpClient HTTP_CLIENT = HttpClient.newHttpClient();
2233

34+
/**
35+
* Default filters applied when extracting links from text.
36+
*
37+
* <p>These filters intentionally ignore:
38+
* <ul>
39+
* <li>Suppressed links like {@code <https://example.com>}</li>
40+
* <li>Non-HTTP(S) schemes such as {@code ftp://} or {@code file://}</li>
41+
* </ul>
42+
*
43+
* <p>This reduces false positives when scanning chat messages
44+
* or source-code snippets.
45+
*/
46+
2347
private static final Set<LinkFilter> DEFAULT_FILTERS =
2448
Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME);
2549

2650
/**
27-
* Possible ways to filter a link.
28-
*
29-
* @see LinkDetection
51+
* Filters that control which detected URLs are returned by {@link #extractLinks}.
3052
*/
3153
public enum LinkFilter {
3254
/**
33-
* Filters links suppressed with {@literal <url>}.
55+
* Ignores URLs that are wrapped in angle brackets,
56+
* e.g. {@code <https://example.com>}.
57+
*
58+
* <p>Such links are often intentionally suppressed in chat platforms.
3459
*/
3560
SUPPRESSED,
3661
/**
37-
* Filters links that are not using http scheme.
62+
* Ignores URLs that do not use the HTTP or HTTPS scheme.
63+
*
64+
* <p>This helps avoid false positives such as {@code ftp://},
65+
* {@code file://}, or scheme-less matches.
3866
*/
3967
NON_HTTP_SCHEME
4068
}
@@ -44,102 +72,126 @@ private LinkDetection() {
4472
}
4573

4674
/**
47-
* Extracts all links from the given content.
75+
* Extracts HTTP(S) links from the given text.
76+
*
77+
* <p>The text is scanned using a URL detector, then filtered and normalized
78+
* according to the provided {@link LinkFilter}s.
79+
*
80+
* <p>Example:
81+
* <pre>{@code
82+
* Set<LinkFilter> filters = Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME);
83+
* extractLinks("Visit https://example.com and <ftp://skip.me>", filters)
84+
* // returns ["https://example.com"]
85+
* }</pre>
4886
*
49-
* @param content the content to search through
50-
* @param filter the filters applied to the urls
51-
* @return a list of all found links, can be empty
87+
* @param content the text to scan for links
88+
* @param filter a set of filters controlling which detected links are returned
89+
* @return a list of extracted links in the order they appear in the text
5290
*/
91+
5392
public static List<String> extractLinks(String content, Set<LinkFilter> filter) {
5493
return new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH).detect()
55-
.stream()
56-
.map(url -> toLink(url, filter))
57-
.flatMap(Optional::stream)
58-
.toList();
94+
.stream()
95+
.map(url -> toLink(url, filter))
96+
.flatMap(Optional::stream)
97+
.toList();
5998
}
6099

61100
/**
62-
* Checks whether the given content contains a link.
101+
* Checks whether the given text contains at least one detectable URL.
63102
*
64-
* @param content the content to search through
65-
* @return true if the content contains at least one link
103+
* <p>This method performs a lightweight detection only and does not
104+
* apply any {@link LinkFilter}s.
105+
*
106+
* @param content the text to scan
107+
* @return {@code true} if at least one URL-like pattern is detected
66108
*/
109+
67110
public static boolean containsLink(String content) {
68111
return !(new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH).detect().isEmpty());
69112
}
70113

71114
/**
72-
* Checks whether the given URL is considered broken.
115+
* Asynchronously checks whether a URL is considered broken.
73116
*
74-
* <p>
75-
* A link is considered broken if:
76-
* <ul>
77-
* <li>The URL is invalid or malformed</li>
78-
* <li>An HTTP request fails</li>
79-
* <li>The HTTP response status code is outside the 200–399 range</li>
80-
* </ul>
117+
* <p>The check is performed in two steps:
118+
* <ol>
119+
* <li>A {@code HEAD} request is sent first (cheap and fast)</li>
120+
* <li>If that fails or returns an error, a {@code GET} request is used as a fallback</li>
121+
* </ol>
81122
*
82-
* <p>
83-
* Notes:
123+
* <p>A link is considered broken if:
84124
* <ul>
85-
* <li>Status code {@code 200} is considered valid, even if the response body is empty</li>
86-
* <li>The response body content is not inspected</li>
125+
* <li>The URL is malformed or unreachable</li>
126+
* <li>The HTTP request fails with an exception</li>
127+
* <li>The response status code is 4xx (client error) or 5xx (server error)</li>
87128
* </ul>
88129
*
130+
* <p>Successful responses (2xx) and redirects (3xx) are considered valid links.
131+
* The response body is never inspected.
132+
*
89133
* @param url the URL to check
90-
* @return a future completing with {@code true} if the link is broken
134+
* @return a {@code CompletableFuture} completing with {@code true} if the link is broken,
135+
* {@code false} otherwise
91136
*/
92137

93138
public static CompletableFuture<Boolean> isLinkBroken(String url) {
94139
HttpRequest headRequest = HttpRequest.newBuilder(URI.create(url))
95-
.method("HEAD", HttpRequest.BodyPublishers.noBody())
96-
.build();
140+
.method("HEAD", HttpRequest.BodyPublishers.noBody())
141+
.build();
97142

98143
return HTTP_CLIENT.sendAsync(headRequest, HttpResponse.BodyHandlers.discarding())
99-
.thenApply(response -> {
100-
int status = response.statusCode();
101-
return status < 200 || status >= 400;
102-
})
103-
.exceptionally(ignored -> true)
104-
.thenCompose(result -> {
105-
if (!Boolean.TRUE.equals(result)) {
106-
return CompletableFuture.completedFuture(false);
107-
}
108-
HttpRequest getRequest = HttpRequest.newBuilder(URI.create(url)).GET().build();
109-
return HTTP_CLIENT.sendAsync(getRequest, HttpResponse.BodyHandlers.discarding())
110-
.thenApply(resp -> resp.statusCode() >= 400)
111-
.exceptionally(ignored -> true); // still never null
112-
});
144+
.thenApply(response -> {
145+
int status = response.statusCode();
146+
// 2xx and 3xx are success, 4xx and 5xx are errors
147+
return status >= 400;
148+
})
149+
.exceptionally(ignored -> true)
150+
.thenCompose(result -> {
151+
if (!Boolean.TRUE.equals(result)) {
152+
return CompletableFuture.completedFuture(false);
153+
}
154+
HttpRequest fallbackGetRequest = HttpRequest.newBuilder(URI.create(url)).GET().build();
155+
return HTTP_CLIENT.sendAsync(fallbackGetRequest, HttpResponse.BodyHandlers.discarding())
156+
.thenApply(resp -> resp.statusCode() >= 400)
157+
.exceptionally(ignored -> true);
158+
});
113159
}
114160

115161
/**
116-
* Replaces all broken links in the given text with the provided replacement string.
162+
* Replaces all broken HTTP(S) links in the given text.
163+
*
164+
* <p>Each detected link is checked asynchronously using
165+
* {@link #isLinkBroken(String)}. Only links confirmed as broken
166+
* are replaced. Duplicate URLs are checked only once and all occurrences
167+
* are replaced if found to be broken.
117168
*
118-
* <p>
119-
* Example:
120-
*
169+
* <p>This method does not block - all link checks are performed
170+
* asynchronously and combined into a single {@code CompletableFuture}.
171+
*
172+
* <p>Example:
121173
* <pre>{@code
122174
* replaceDeadLinks("""
123-
* Test
124-
* http://deadlink/1
125-
* http://workinglink/1
126-
* """, "broken")
175+
* Test
176+
* http://deadlink/1
177+
* http://workinglink/1
178+
* """, "(broken link)")
127179
* }</pre>
128180
*
129-
* <p>
130-
* Results in:
131-
*
181+
* <p>Results in:
132182
* <pre>{@code
133183
* Test
134-
* broken
184+
* (broken link)
135185
* http://workinglink/1
136186
* }</pre>
137187
*
138188
* @param text the input text containing URLs
139-
* @param replacement the string to replace broken links with
140-
* @return a future containing the modified text
189+
* @param replacement the string used to replace broken links
190+
* @return a {@code CompletableFuture} that completes with the modified text,
191+
* or the original text if no broken links were found
141192
*/
142193

194+
143195
public static CompletableFuture<String> replaceDeadLinks(String text, String replacement) {
144196
List<String> links = extractLinks(text, DEFAULT_FILTERS);
145197

@@ -148,26 +200,44 @@ public static CompletableFuture<String> replaceDeadLinks(String text, String rep
148200
}
149201

150202
List<CompletableFuture<String>> deadLinkFutures = links.stream()
151-
.distinct()
152-
.map(link -> isLinkBroken(link)
153-
.thenApply(isBroken -> Boolean.TRUE.equals(isBroken) ? link : null))
203+
.distinct()
204+
.map(link -> isLinkBroken(link)
205+
.thenApply(isBroken -> Boolean.TRUE.equals(isBroken) ? link : null))
154206

155-
.toList();
207+
.toList();
156208

157209
return CompletableFuture.allOf(deadLinkFutures.toArray(new CompletableFuture[0]))
158-
.thenApply(ignored -> deadLinkFutures.stream()
159-
.map(CompletableFuture::join)
160-
.filter(Objects::nonNull)
161-
.toList())
162-
.thenApply(deadLinks -> {
163-
String result = text;
164-
for (String deadLink : deadLinks) {
165-
result = result.replace(deadLink, replacement);
166-
}
167-
return result;
168-
});
210+
.thenApply(ignored -> deadLinkFutures.stream()
211+
.map(CompletableFuture::join)
212+
.filter(Objects::nonNull)
213+
.toList())
214+
.thenApply(deadLinks -> {
215+
String result = text;
216+
for (String deadLink : deadLinks) {
217+
result = result.replace(deadLink, replacement);
218+
}
219+
return result;
220+
});
169221
}
170222

223+
/**
224+
* Converts a detected {@link Url} into a normalized link string.
225+
*
226+
* <p>Applies the provided {@link LinkFilter}s:
227+
* <ul>
228+
* <li>{@link LinkFilter#SUPPRESSED} - filters URLs wrapped in angle brackets</li>
229+
* <li>{@link LinkFilter#NON_HTTP_SCHEME} - filters non-HTTP(S) schemes</li>
230+
* </ul>
231+
*
232+
* <p>Additionally removes trailing punctuation such as commas or periods
233+
* from the detected URL.
234+
*
235+
* @param url the detected URL
236+
* @param filter active link filters to apply
237+
* @return an {@link Optional} containing the normalized link,
238+
* or {@code Optional.empty()} if the link should be filtered out
239+
*/
240+
171241
private static Optional<String> toLink(Url url, Set<LinkFilter> filter) {
172242
String raw = url.getOriginalUrl();
173243
if (filter.contains(LinkFilter.SUPPRESSED) && raw.contains(">")) {
@@ -188,5 +258,4 @@ private static Optional<String> toLink(Url url, Set<LinkFilter> filter) {
188258
}
189259
return Optional.of(link);
190260
}
191-
192-
}
261+
}

0 commit comments

Comments
 (0)