Skip to content
Merged
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,69 @@
import com.linkedin.urls.detection.UrlDetector;
import com.linkedin.urls.detection.UrlDetectorOptions;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
* Utility methods for detecting and working with links inside arbitrary text.
*
* <p>
* This class can:
* <ul>
* <li>Extract HTTP(S) links from text</li>
* <li>Check whether a link is reachable via HTTP</li>
* <li>Replace broken links asynchronously</li>
* </ul>
*
* <p>
* It is intentionally stateless and uses asynchronous HTTP requests to avoid blocking calling
* threads.
*/

public class LinkDetection {
/** Shared HTTP client used to send the requests issued by {@link #isLinkBroken(String)}. */
private static final HttpClient HTTP_CLIENT = HttpClient.newHttpClient();

/**
* Default filters applied when extracting links from text.
*
* <p>
* These filters intentionally ignore:
* <ul>
* <li>Suppressed links like {@code <https://example.com>}</li>
* <li>Non-HTTP(S) schemes such as {@code ftp://} or {@code file://}</li>
* </ul>
*
* <p>
* This reduces false positives when scanning chat messages or source-code snippets.
*
* @see LinkFilter
*/

private static final Set<LinkFilter> DEFAULT_FILTERS =
Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME);
Comment thread
barsh404error marked this conversation as resolved.
Comment thread
Zabuzard marked this conversation as resolved.

/**
* Filters that control which detected URLs are returned by {@link #extractLinks}.
*/
public enum LinkFilter {
/**
* Ignores URLs that are wrapped in angle brackets, e.g. {@code <https://example.com>}.
*
* <p>
* Such links are often intentionally suppressed in chat platforms.
*/
SUPPRESSED,
/**
* Ignores URLs that do not use the HTTP or HTTPS scheme.
*
* <p>
* This helps avoid false positives such as {@code ftp://}, {@code file://}, or scheme-less
* matches.
*/
NON_HTTP_SCHEME
}
Expand All @@ -34,12 +76,26 @@ private LinkDetection() {
}

/**
* Extracts links from the given text.
*
* <p>
* The text is scanned using a URL detector, then filtered and normalized according to the
* provided {@link LinkFilter}s.
*
* <p>
* Example:
*
* <pre>{@code
* Set<LinkFilter> filters = Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME);
* extractLinks("Visit https://example.com and <ftp://skip.me>", filters)
* // returns ["https://example.com"]
* }</pre>
*
* @param content the text to scan for links
* @param filter a set of filters controlling which detected links are returned
* @return a list of extracted links in the order they appear in the text
*/

public static List<String> extractLinks(String content, Set<LinkFilter> filter) {
return new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH).detect()
.stream()
Comment thread
Zabuzard marked this conversation as resolved.
Expand All @@ -49,15 +105,157 @@ public static List<String> extractLinks(String content, Set<LinkFilter> filter)
}

/**
 * Tells whether at least one URL can be detected in the given text.
 *
 * <p>
 * Only the raw detection is performed; no {@link LinkFilter} is applied to the detected URLs.
 *
 * @param content the text to scan
 * @return {@code true} if the detector reports at least one URL, {@code false} otherwise
 */
public static boolean containsLink(String content) {
    UrlDetector detector = new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH);
    boolean nothingDetected = detector.detect().isEmpty();
    return !nothingDetected;
}

/**
* Asynchronously checks whether a URL is considered broken.
*
* <p>
* The check is performed in two steps:
* <ol>
* <li>A {@code HEAD} request is sent first (cheap and fast)</li>
* <li>If that fails or returns an error, a {@code GET} request is used as a fallback</li>
* </ol>
Comment thread
Zabuzard marked this conversation as resolved.
Outdated
*
* <p>
* A link is considered broken if:
* <ul>
* <li>The URL is malformed or unreachable</li>
* <li>The HTTP request fails with an exception</li>
* <li>The response status code is 4xx (client error) or 5xx (server error)</li>
* </ul>
*
* <p>
* Successful responses (2xx) and redirects (3xx) are considered valid links. The response body
* is never inspected.
*
* @param url the URL to check
* @return a {@code CompletableFuture} completing with {@code true} if the link is broken,
* {@code false} otherwise
*/

public static CompletableFuture<Boolean> isLinkBroken(String url) {
HttpRequest headRequest = HttpRequest.newBuilder(URI.create(url))
Comment thread
barsh404error marked this conversation as resolved.
Comment thread
Zabuzard marked this conversation as resolved.
.method("HEAD", HttpRequest.BodyPublishers.noBody())
.build();

return HTTP_CLIENT.sendAsync(headRequest, HttpResponse.BodyHandlers.discarding())
.thenApply(response -> {
int status = response.statusCode();
// 2xx and 3xx are success, 4xx and 5xx are errors
return status >= 400;
})
.exceptionally(ignored -> true)
Comment thread
Zabuzard marked this conversation as resolved.
Outdated
.thenCompose(result -> {
if (!Boolean.TRUE.equals(result)) {
return CompletableFuture.completedFuture(false);
}
HttpRequest fallbackGetRequest =
HttpRequest.newBuilder(URI.create(url)).GET().build();
return HTTP_CLIENT
.sendAsync(fallbackGetRequest, HttpResponse.BodyHandlers.discarding())
.thenApply(resp -> resp.statusCode() >= 400)
.exceptionally(ignored -> true);
});
Comment thread
barsh404error marked this conversation as resolved.
}

/**
* Replaces all broken HTTP(S) links in the given text.
*
* <p>
* Each detected link is checked asynchronously using {@link #isLinkBroken(String)}. Only links
* confirmed as broken are replaced. Duplicate URLs are checked only once and all occurrences
* are replaced if found to be broken.
*
* <p>
* This method does not block - all link checks are performed asynchronously and combined into a
* single {@code CompletableFuture}.
*
* <p>
* Example:
*
* <pre>{@code
* replaceDeadLinks("""
* Test
* http://deadlink/1
* http://workinglink/1
* """, "(broken link)")
* }</pre>
*
* <p>
* Results in:
*
* <pre>{@code
* Test
* (broken link)
* http://workinglink/1
* }</pre>
*
* @param text the input text containing URLs
* @param replacement the string used to replace broken links
* @return a {@code CompletableFuture} that completes with the modified text, or the original
* text if no broken links were found
*/


public static CompletableFuture<String> replaceDeadLinks(String text, String replacement) {
Comment thread
tj-wazei marked this conversation as resolved.
Outdated
Comment thread
Zabuzard marked this conversation as resolved.
Outdated
List<String> links = extractLinks(text, DEFAULT_FILTERS);

if (links.isEmpty()) {
return CompletableFuture.completedFuture(text);
}

List<CompletableFuture<String>> deadLinkFutures = links.stream()
.distinct()
.map(link -> isLinkBroken(link)
.thenApply(isBroken -> Boolean.TRUE.equals(isBroken) ? link : null))
Comment thread
Zabuzard marked this conversation as resolved.
Comment thread
Zabuzard marked this conversation as resolved.
Outdated

Comment thread
barsh404error marked this conversation as resolved.
.toList();

return CompletableFuture.allOf(deadLinkFutures.toArray(new CompletableFuture[0]))
Comment thread
barsh404error marked this conversation as resolved.
Outdated
.thenApply(ignored -> deadLinkFutures.stream()
Comment thread
Zabuzard marked this conversation as resolved.
Outdated
.map(CompletableFuture::join)
.filter(Objects::nonNull)
Comment thread
Zabuzard marked this conversation as resolved.
.toList())
.thenApply(deadLinks -> {
String result = text;
for (String deadLink : deadLinks) {
result = result.replace(deadLink, replacement);
Comment thread
Zabuzard marked this conversation as resolved.
Outdated
}
return result;
});
}

/**
* Converts a detected {@link Url} into a normalized link string.
*
* <p>
* Applies the provided {@link LinkFilter}s:
* <ul>
* <li>{@link LinkFilter#SUPPRESSED} - filters URLs wrapped in angle brackets</li>
* <li>{@link LinkFilter#NON_HTTP_SCHEME} - filters non-HTTP(S) schemes</li>
* </ul>
Comment thread
Zabuzard marked this conversation as resolved.
Outdated
*
* <p>
* Additionally removes trailing punctuation such as commas or periods from the detected URL.
*
* @param url the detected URL
* @param filter active link filters to apply
* @return an {@link Optional} containing the normalized link, or {@code Optional.empty()} if
* the link should be filtered out
*/

private static Optional<String> toLink(Url url, Set<LinkFilter> filter) {
String raw = url.getOriginalUrl();
if (filter.contains(LinkFilter.SUPPRESSED) && raw.contains(">")) {
Expand All @@ -76,8 +274,6 @@ private static Optional<String> toLink(Url url, Set<LinkFilter> filter) {
// Remove trailing punctuation
link = link.substring(0, link.length() - 1);
}

return Optional.of(link);
}

}
Loading