1515import java .util .concurrent .CompletableFuture ;
1616
1717/**
18- * Utility class to detect links.
18+ * Utility methods for working with links inside arbitrary text.
19+ *
20+ * <p>This class can:
21+ * <ul>
22+ * <li>Extract HTTP(S) links from text</li>
23+ * <li>Check whether a link is reachable via HTTP</li>
24+ * <li>Replace broken links asynchronously</li>
25+ * </ul>
26+ *
27+ * <p>It is intentionally stateless and uses asynchronous HTTP requests
28+ * to avoid blocking calling threads.
1929 */
30+
2031public class LinkDetection {
2132 private static final HttpClient HTTP_CLIENT = HttpClient .newHttpClient ();
2233
34+ /**
35+ * Default filters applied when extracting links from text.
36+ *
37+ * <p>These filters intentionally ignore:
38+ * <ul>
39+ * <li>Suppressed links like {@code <https://example.com>}</li>
40+ * <li>Non-HTTP(S) schemes such as {@code ftp://} or {@code file://}</li>
41+ * </ul>
42+ *
43+ * <p>This reduces false positives when scanning chat messages
44+ * or source-code snippets.
45+ */
46+
2347 private static final Set <LinkFilter > DEFAULT_FILTERS =
2448 Set .of (LinkFilter .SUPPRESSED , LinkFilter .NON_HTTP_SCHEME );
2549
2650 /**
27- * Possible ways to filter a link.
28- *
29- * @see LinkDetection
51+ * Filters that control which detected URLs are returned by {@link #extractLinks}.
3052 */
3153 public enum LinkFilter {
3254 /**
33- * Filters links suppressed with {@literal <url>}.
55+ * Ignores URLs that are wrapped in angle brackets,
56+ * e.g. {@code <https://example.com>}.
57+ *
58+ * <p>Such links are often intentionally suppressed in chat platforms.
3459 */
3560 SUPPRESSED ,
3661 /**
37- * Filters links that are not using http scheme.
62+ * Ignores URLs that do not use the HTTP or HTTPS scheme.
63+ *
64+ * <p>This helps avoid false positives such as {@code ftp://},
65+ * {@code file://}, or scheme-less matches.
3866 */
3967 NON_HTTP_SCHEME
4068 }
@@ -44,102 +72,126 @@ private LinkDetection() {
4472 }
4573
4674 /**
47- * Extracts all links from the given content.
75+ * Extracts HTTP(S) links from the given text.
76+ *
77+ * <p>The text is scanned using a URL detector, then filtered and normalized
78+ * according to the provided {@link LinkFilter}s.
79+ *
80+ * <p>Example:
81+ * <pre>{@code
82+ * Set<LinkFilter> filters = Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME);
83+ * extractLinks("Visit https://example.com and <ftp://skip.me>", filters)
84+ * // returns ["https://example.com"]
85+ * }</pre>
4886 *
49- * @param content the content to search through
50- * @param filter the filters applied to the urls
51- * @return a list of all found links, can be empty
87+ * @param content the text to scan for links
88+ * @param filter a set of filters controlling which detected links are returned
89+ * @return a list of extracted links in the order they appear in the text
5290 */
91+
5392 public static List <String > extractLinks (String content , Set <LinkFilter > filter ) {
5493 return new UrlDetector (content , UrlDetectorOptions .BRACKET_MATCH ).detect ()
55- .stream ()
56- .map (url -> toLink (url , filter ))
57- .flatMap (Optional ::stream )
58- .toList ();
94+ .stream ()
95+ .map (url -> toLink (url , filter ))
96+ .flatMap (Optional ::stream )
97+ .toList ();
5998 }
6099
61100 /**
62- * Checks whether the given content contains a link .
101+ * Checks whether the given text contains at least one detectable URL .
63102 *
64- * @param content the content to search through
65- * @return true if the content contains at least one link
103+ * <p>This method performs a lightweight detection only and does not
104+ * apply any {@link LinkFilter}s.
105+ *
106+ * @param content the text to scan
107+ * @return {@code true} if at least one URL-like pattern is detected
66108 */
109+
67110 public static boolean containsLink (String content ) {
68111 return !(new UrlDetector (content , UrlDetectorOptions .BRACKET_MATCH ).detect ().isEmpty ());
69112 }
70113
71114 /**
72- * Checks whether the given URL is considered broken.
115+ * Asynchronously checks whether a URL is considered broken.
73116 *
74- * <p>
75- * A link is considered broken if:
76- * <ul>
77- * <li>The URL is invalid or malformed</li>
78- * <li>An HTTP request fails</li>
79- * <li>The HTTP response status code is outside the 200–399 range</li>
80- * </ul>
117+ * <p>The check is performed in two steps:
118+ * <ol>
119+ * <li>A {@code HEAD} request is sent first (cheap and fast)</li>
120+ * <li>If that fails or returns an error, a {@code GET} request is used as a fallback</li>
121+ * </ol>
81122 *
82- * <p>
83- * Notes:
123+ * <p>A link is considered broken if:
84124 * <ul>
85- * <li>Status code {@code 200} is considered valid, even if the response body is empty</li>
86- * <li>The response body content is not inspected</li>
125+ * <li>The URL is malformed or unreachable</li>
126+ * <li>The HTTP request fails with an exception</li>
127+ * <li>The response status code is 4xx (client error) or 5xx (server error)</li>
87128 * </ul>
88129 *
130+ * <p>Successful responses (2xx) and redirects (3xx) are considered valid links.
131+ * The response body is never inspected.
132+ *
89133 * @param url the URL to check
90- * @return a future completing with {@code true} if the link is broken
134+ * @return a {@code CompletableFuture} completing with {@code true} if the link is broken,
135+ * {@code false} otherwise
91136 */
92137
93138 public static CompletableFuture <Boolean > isLinkBroken (String url ) {
94139 HttpRequest headRequest = HttpRequest .newBuilder (URI .create (url ))
95- .method ("HEAD" , HttpRequest .BodyPublishers .noBody ())
96- .build ();
140+ .method ("HEAD" , HttpRequest .BodyPublishers .noBody ())
141+ .build ();
97142
98143 return HTTP_CLIENT .sendAsync (headRequest , HttpResponse .BodyHandlers .discarding ())
99- .thenApply (response -> {
100- int status = response .statusCode ();
101- return status < 200 || status >= 400 ;
102- })
103- .exceptionally (ignored -> true )
104- .thenCompose (result -> {
105- if (!Boolean .TRUE .equals (result )) {
106- return CompletableFuture .completedFuture (false );
107- }
108- HttpRequest getRequest = HttpRequest .newBuilder (URI .create (url )).GET ().build ();
109- return HTTP_CLIENT .sendAsync (getRequest , HttpResponse .BodyHandlers .discarding ())
110- .thenApply (resp -> resp .statusCode () >= 400 )
111- .exceptionally (ignored -> true ); // still never null
112- });
144+ .thenApply (response -> {
145+ int status = response .statusCode ();
146+ // 2xx and 3xx are success, 4xx and 5xx are errors
147+ return status >= 400 ;
148+ })
149+ .exceptionally (ignored -> true )
150+ .thenCompose (result -> {
151+ if (!Boolean .TRUE .equals (result )) {
152+ return CompletableFuture .completedFuture (false );
153+ }
154+ HttpRequest fallbackGetRequest = HttpRequest .newBuilder (URI .create (url )).GET ().build ();
155+ return HTTP_CLIENT .sendAsync (fallbackGetRequest , HttpResponse .BodyHandlers .discarding ())
156+ .thenApply (resp -> resp .statusCode () >= 400 )
157+ .exceptionally (ignored -> true );
158+ });
113159 }
114160
115161 /**
116- * Replaces all broken links in the given text with the provided replacement string.
162+ * Replaces all broken HTTP(S) links in the given text.
163+ *
164+ * <p>Each detected link is checked asynchronously using
165+ * {@link #isLinkBroken(String)}. Only links confirmed as broken
166+ * are replaced. Duplicate URLs are checked only once and all occurrences
167+ * are replaced if found to be broken.
117168 *
118- * <p>
119- * Example:
120- *
169+ * <p>This method does not block - all link checks are performed
170+ * asynchronously and combined into a single {@code CompletableFuture}.
171+ *
172+ * <p>Example:
121173 * <pre>{@code
122174 * replaceDeadLinks("""
123- * Test
124- * http://deadlink/1
125- * http://workinglink/1
126- * """, "broken")
175+ * Test
176+ * http://deadlink/1
177+ * http://workinglink/1
178+ * """, "( broken link) ")
127179 * }</pre>
128180 *
129- * <p>
130- * Results in:
131- *
181+ * <p>Results in:
132182 * <pre>{@code
133183 * Test
134- * broken
184+ * ( broken link)
135185 * http://workinglink/1
136186 * }</pre>
137187 *
138188 * @param text the input text containing URLs
139- * @param replacement the string to replace broken links with
140- * @return a future containing the modified text
189+ * @param replacement the string used to replace broken links
190+ * @return a {@code CompletableFuture} that completes with the modified text,
191+ * or the original text if no broken links were found
141192 */
142193
194+
143195 public static CompletableFuture <String > replaceDeadLinks (String text , String replacement ) {
144196 List <String > links = extractLinks (text , DEFAULT_FILTERS );
145197
@@ -148,26 +200,44 @@ public static CompletableFuture<String> replaceDeadLinks(String text, String rep
148200 }
149201
150202 List <CompletableFuture <String >> deadLinkFutures = links .stream ()
151- .distinct ()
152- .map (link -> isLinkBroken (link )
153- .thenApply (isBroken -> Boolean .TRUE .equals (isBroken ) ? link : null ))
203+ .distinct ()
204+ .map (link -> isLinkBroken (link )
205+ .thenApply (isBroken -> Boolean .TRUE .equals (isBroken ) ? link : null ))
154206
155- .toList ();
207+ .toList ();
156208
157209 return CompletableFuture .allOf (deadLinkFutures .toArray (new CompletableFuture [0 ]))
158- .thenApply (ignored -> deadLinkFutures .stream ()
159- .map (CompletableFuture ::join )
160- .filter (Objects ::nonNull )
161- .toList ())
162- .thenApply (deadLinks -> {
163- String result = text ;
164- for (String deadLink : deadLinks ) {
165- result = result .replace (deadLink , replacement );
166- }
167- return result ;
168- });
210+ .thenApply (ignored -> deadLinkFutures .stream ()
211+ .map (CompletableFuture ::join )
212+ .filter (Objects ::nonNull )
213+ .toList ())
214+ .thenApply (deadLinks -> {
215+ String result = text ;
216+ for (String deadLink : deadLinks ) {
217+ result = result .replace (deadLink , replacement );
218+ }
219+ return result ;
220+ });
169221 }
170222
223+ /**
224+ * Converts a detected {@link Url} into a normalized link string.
225+ *
226+ * <p>Applies the provided {@link LinkFilter}s:
227+ * <ul>
228+ * <li>{@link LinkFilter#SUPPRESSED} - filters URLs wrapped in angle brackets</li>
229+ * <li>{@link LinkFilter#NON_HTTP_SCHEME} - filters non-HTTP(S) schemes</li>
230+ * </ul>
231+ *
232+ * <p>Additionally removes trailing punctuation such as commas or periods
233+ * from the detected URL.
234+ *
235+ * @param url the detected URL
236+ * @param filter active link filters to apply
237+ * @return an {@link Optional} containing the normalized link,
238+ * or {@code Optional.empty()} if the link should be filtered out
239+ */
240+
171241 private static Optional <String > toLink (Url url , Set <LinkFilter > filter ) {
172242 String raw = url .getOriginalUrl ();
173243 if (filter .contains (LinkFilter .SUPPRESSED ) && raw .contains (">" )) {
@@ -188,5 +258,4 @@ private static Optional<String> toLink(Url url, Set<LinkFilter> filter) {
188258 }
189259 return Optional .of (link );
190260 }
191-
192- }
261+ }
0 commit comments