44import com .linkedin .urls .detection .UrlDetector ;
55import com .linkedin .urls .detection .UrlDetectorOptions ;
66
7+ import java .net .URI ;
8+ import java .net .http .HttpClient ;
9+ import java .net .http .HttpRequest ;
10+ import java .net .http .HttpResponse ;
711import java .util .List ;
12+ import java .util .Objects ;
813import java .util .Optional ;
914import java .util .Set ;
15+ import java .util .concurrent .CompletableFuture ;
1016
1117/**
12- * Utility class to detect links.
18+ * Utility methods for working with links inside arbitrary text.
19+ *
20+ * <p>
21+ * This class can:
22+ * <ul>
23+ * <li>Extract HTTP(S) links from text</li>
24+ * <li>Check whether a link is reachable via HTTP</li>
25+ * <li>Replace broken links asynchronously</li>
26+ * </ul>
27+ *
28+ * <p>
29+ * It is intentionally stateless and uses asynchronous HTTP requests to avoid blocking calling
30+ * threads.
1331 */
1432public class LinkDetection {
33+ private static final HttpClient HTTP_CLIENT = HttpClient .newHttpClient ();
1534
1635 /**
17- * Possible ways to filter a link .
36+ * Default filters applied when extracting links from text .
1837 *
19- * @see LinkDetection
38+ * <p>
39+ * Links to intentionally ignore in order to reduce false positives when scanning chat messages
40+ * or source-code snippets.
41+ */
42+ private static final Set <LinkFilter > DEFAULT_FILTERS =
43+ Set .of (LinkFilter .SUPPRESSED , LinkFilter .NON_HTTP_SCHEME );
44+
/**
 * Selects which detected URLs {@link #extractLinks(String, Set)} leaves out of its result.
 */
public enum LinkFilter {
    /**
     * Drops URLs that are wrapped in angle brackets, for example
     * {@code <https://example.com>}.
     *
     * <p>
     * Chat platforms commonly use this notation to suppress a link on purpose.
     */
    SUPPRESSED,
    /**
     * Drops URLs that do not use the HTTP or HTTPS scheme.
     *
     * <p>
     * This avoids false positives such as {@code ftp://}, {@code file://}, or matches that
     * carry no scheme at all.
     */
    NON_HTTP_SCHEME
}
@@ -34,11 +68,24 @@ private LinkDetection() {
3468 }
3569
3670 /**
37- * Extracts all links from the given content.
71+ * Extracts links from the given text.
72+ *
73+ * <p>
74+ * The text is scanned using a URL detector, then filtered and normalized according to the
75+ * provided {@link LinkFilter}s.
76+ *
77+ * <p>
78+ * Example:
79+ *
80+ * <pre>{@code
81+ * Set<LinkFilter> filters = Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME);
82+ * extractLinks("Visit https://example.com and <ftp://skip.me>", filters)
83+ * // returns ["https://example.com"]
84+ * }</pre>
3885 *
39- * @param content the content to search through
40- * @param filter the filters applied to the urls
41- * @return a list of all found links, can be empty
86+ * @param content the text to scan for links
87+ * @param filter a set of filters controlling which detected links are returned
88+ * @return a list of extracted links in the order they appear in the text
4289 */
4390 public static List <String > extractLinks (String content , Set <LinkFilter > filter ) {
4491 return new UrlDetector (content , UrlDetectorOptions .BRACKET_MATCH ).detect ()
@@ -49,22 +96,166 @@ public static List<String> extractLinks(String content, Set<LinkFilter> filter)
4996 }
5097
5198 /**
52- * Checks whether the given content contains a link .
99+ * Extracts links from the given text using default filters .
53100 *
54- * @param content the content to search through
55- * @return true if the content contains at least one link
101+ * <p>
102+ * This is a convenience method that uses {@link #DEFAULT_FILTERS}.
103+ *
104+ * @param content the text to scan for links
105+ * @return a list of extracted links in the order they appear in the text
106+ * @see #extractLinks(String, Set)
107+ */
108+ public static List <String > extractLinks (String content ) {
109+ return extractLinks (content , DEFAULT_FILTERS );
110+ }
111+
112+ /**
113+ * Checks whether the given text contains at least one detectable URL.
114+ *
115+ * <p>
116+ * This method performs a lightweight detection only and does not apply any {@link LinkFilter}s.
117+ *
118+ * @param content the text to scan
119+ * @return {@code true} if at least one URL-like pattern is detected
56120 */
57121 public static boolean containsLink (String content ) {
58122 return !(new UrlDetector (content , UrlDetectorOptions .BRACKET_MATCH ).detect ().isEmpty ());
59123 }
60124
/**
 * Asynchronously checks whether a URL is considered broken.
 *
 * <p>
 * A link is considered broken if:
 * <ul>
 * <li>the URL cannot be turned into an HTTP request (malformed syntax, unsupported scheme or
 * missing host)</li>
 * <li>the HTTP request fails with an exception</li>
 * <li>the response status code is 400 or higher (client or server error)</li>
 * </ul>
 *
 * <p>
 * A {@code HEAD} request is sent first because it is cheap. Only if that attempt reports the
 * link as broken is a {@code GET} request sent as a fallback, since some servers do not support
 * {@code HEAD}. Any status code below 400, which includes 2xx and 3xx responses, counts as a
 * valid link. Response bodies are discarded and never inspected.
 *
 * @param url the URL to check
 * @return a {@code CompletableFuture} completing with {@code true} if the link is broken,
 *         {@code false} otherwise
 * @throws NullPointerException if {@code url} is {@code null}
 */
public static CompletableFuture<Boolean> isLinkBroken(String url) {
    final HttpRequest headRequest;
    final HttpRequest fallbackGetRequest;
    try {
        URI uri = URI.create(url);
        headRequest = HttpRequest.newBuilder(uri)
            .method("HEAD", HttpRequest.BodyPublishers.noBody())
            .build();
        fallbackGetRequest = HttpRequest.newBuilder(uri).GET().build();
    } catch (IllegalArgumentException _) {
        // URI.create and HttpRequest.newBuilder reject malformed or non-HTTP URLs by throwing
        // synchronously; report such links as broken through the future instead.
        return CompletableFuture.completedFuture(true);
    }

    return sendAndCheckBroken(headRequest).thenCompose(headBroken -> {
        if (!Boolean.TRUE.equals(headBroken)) {
            return CompletableFuture.completedFuture(false);
        }
        // HEAD reported a failure; confirm with GET before declaring the link broken
        return sendAndCheckBroken(fallbackGetRequest);
    });
}

/**
 * Sends the request and maps the outcome to a "broken" flag: {@code true} for a status code of
 * 400 or higher or for any failure while sending, {@code false} otherwise.
 */
private static CompletableFuture<Boolean> sendAndCheckBroken(HttpRequest request) {
    return HTTP_CLIENT.sendAsync(request, HttpResponse.BodyHandlers.discarding())
        .thenApply(response -> response.statusCode() >= 400)
        .exceptionally(_ -> true);
}
170+
/**
 * Replaces every broken link in the given text with a replacement string.
 *
 * <p>
 * Links are extracted with the default filters and each distinct link is checked once via
 * {@link #isLinkBroken(String)}. Once all checks have completed, every occurrence of a link
 * that was reported as broken is replaced; all other text is left untouched.
 *
 * <p>
 * The checks are combined into a single {@code CompletableFuture}; this method itself does not
 * wait for them.
 *
 * <p>
 * Example:
 *
 * <pre>{@code
 * replaceBrokenLinks("""
 *         Test
 *         http://deadlink/1
 *         http://workinglink/1
 *         """, "(broken link)")
 * }</pre>
 *
 * <p>
 * Results in:
 *
 * <pre>{@code
 * Test
 * (broken link)
 * http://workinglink/1
 * }</pre>
 *
 * @param text the input text containing URLs
 * @param replacement the string used to replace broken links
 * @return a {@code CompletableFuture} that completes with the modified text, or with the
 *         original text if no broken links were found
 */
public static CompletableFuture<String> replaceBrokenLinks(String text, String replacement) {
    List<String> uniqueLinks = extractLinks(text, DEFAULT_FILTERS).stream().distinct().toList();
    if (uniqueLinks.isEmpty()) {
        return CompletableFuture.completedFuture(text);
    }

    // One pending check per distinct link; each yields the link itself only if it is broken
    List<CompletableFuture<Optional<String>>> pendingChecks = uniqueLinks.stream()
        .map(link -> isLinkBroken(link).thenApply(
                broken -> Boolean.TRUE.equals(broken) ? Optional.of(link)
                        : Optional.<String>empty()))
        .toList();

    CompletableFuture<Void> allChecked =
            CompletableFuture.allOf(pendingChecks.toArray(CompletableFuture[]::new));

    return allChecked.thenApply(_ -> {
        // Every future is complete at this point, so join() returns immediately
        List<String> brokenLinks = pendingChecks.stream()
            .map(CompletableFuture::join)
            .flatMap(Optional::stream)
            .toList();
        return replaceLinks(brokenLinks, text, replacement);
    });
}
229+
230+ private static String replaceLinks (List <String > linksToReplace , String text ,
231+ String replacement ) {
232+ String result = text ;
233+ for (String link : linksToReplace ) {
234+ result = result .replace (link , replacement );
235+ }
236+ return result ;
237+ }
238+
239+ /**
240+ * Converts a detected {@link Url} into a normalized link string.
241+ *
242+ * <p>
243+ * Applies the provided {@link LinkFilter}s. Additionally removes trailing punctuation such as
244+ * commas or periods from the detected URL.
245+ *
246+ * @param url the detected URL
247+ * @param filter active link filters to apply
248+ * @return an {@link Optional} containing the normalized link, or {@code Optional.empty()} if
249+ * the link should be filtered out
250+ */
61251 private static Optional <String > toLink (Url url , Set <LinkFilter > filter ) {
62252 String raw = url .getOriginalUrl ();
63253 if (filter .contains (LinkFilter .SUPPRESSED ) && raw .contains (">" )) {
64254 // URL escapes, such as "<http://example.com>" should be skipped
65255 return Optional .empty ();
66256 }
67- // Not interested in other schemes, also to filter out matches without scheme.
257+ // Not interested in other schemes, also to filter out matches without scheme (Skip non-HTTP
258+ // schemes)
68259 // It detects a lot of such false-positives in Java snippets
69260 if (filter .contains (LinkFilter .NON_HTTP_SCHEME ) && !raw .startsWith ("http" )) {
70261 return Optional .empty ();
@@ -76,8 +267,6 @@ private static Optional<String> toLink(Url url, Set<LinkFilter> filter) {
76267 // Remove trailing punctuation
77268 link = link .substring (0 , link .length () - 1 );
78269 }
79-
80270 return Optional .of (link );
81271 }
82-
83272}
0 commit comments