diff --git a/.github/workflows/ci-release.yml b/.github/workflows/ci-release.yml index 5b6f9de372..3afe6856e7 100644 --- a/.github/workflows/ci-release.yml +++ b/.github/workflows/ci-release.yml @@ -114,7 +114,7 @@ jobs: # Thus, we add `continue-on-error: true` here, but we should remove it # when either the issue is fixed (see: https://github.com/aquasecurity/trivy-action/issues/389) # or we self-host trivy database. - uses: aquasecurity/trivy-action@0.32.0 + uses: aquasecurity/trivy-action@v0.35.0 continue-on-error: true with: image-ref: 'ghcr.io/janusgraph/janusgraph:${{ env.JG_VER }}${{ matrix.tag_suffix }}' diff --git a/docs/changelog.md b/docs/changelog.md index 50682bcae9..298d39f208 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -105,6 +105,23 @@ For more information on features and bug fixes in 1.2.0, see the GitHub mileston Starting from version 1.2.0 JanusGraph supports ElasticSearch 9. +##### Zombie instances auto-close during index status update operations + +Starting from version 1.2.0 JanusGraph can automatically force-close JanusGraph instances that are unreachable +during index status update operations. + +To enable this behavior, users can set the following configuration: +``` +graph.management-auto-close-stale-instances=true +``` + +By default, an instance is considered stale if it is not reachable for more than 2 minutes. However, this +threshold can be adjusted using the following configuration: +``` +graph.management-ack-timeout=240000 ms +``` +This is a breaking change for users who use the `JanusGraphIndexStatusUpdate` interface. + ### Version 1.1.0 (Release Date: November 7, 2024) /// tab | Maven diff --git a/docs/configs/janusgraph-cfg.md b/docs/configs/janusgraph-cfg.md index 283604f4ce..2b6125cadf 100644 --- a/docs/configs/janusgraph-cfg.md +++ b/docs/configs/janusgraph-cfg.md @@ -52,6 +52,8 @@ General configuration options | graph.allow-upgrade | Setting this to true will allow certain fixed values to be updated such as storage-version. This should only be used for upgrading. | Boolean | false | MASKABLE | | graph.assign-timestamp | Whether to use JanusGraph generated client-side timestamp in mutations if the backend supports it. When enabled, JanusGraph assigns one timestamp to all insertions and another slightly earlier timestamp to all deletions in the same batch. When this is disabled, mutation behavior depends on the backend. Some might use server-side timestamp (e.g. HBase) while others might use client-side timestamp generated by driver (CQL). | Boolean | true | LOCAL | | graph.graphname | This config option is an optional configuration setting that you may supply when opening a graph. The String value you provide will be the name of your graph. If you use the ConfigurationManagement APIs, then you will be able to access your graph by this String representation using the ConfiguredGraphFactory APIs. | String | (no default value) | LOCAL | +| graph.management-ack-timeout | Maximum time to wait for an instance to acknowledge a schema change before treating it as stale. Only effective when graph.management-auto-close-stale-instances is enabled. | Duration | 120000 ms | MASKABLE | +| graph.management-auto-close-stale-instances | Whether to automatically force-close instances that fail to acknowledge schema changes within the configured timeout (graph.management-ack-timeout). When enabled, unresponsive instances are removed from the registration store so they no longer block schema status transitions. | Boolean | false | MASKABLE | | graph.replace-instance-if-exists | If a JanusGraph instance with the same instance identifier already exists, the usage of this configuration option results in the opening of this graph anyway. | Boolean | false | LOCAL | | graph.set-vertex-id | Whether user provided vertex ids should be enabled and JanusGraph's automatic vertex id allocation be disabled. Useful when operating JanusGraph in concert with another storage system that assigns long ids but disables some of JanusGraph's advanced features which can lead to inconsistent data. For example, users must ensure the vertex ids are unique to avoid duplication. Must use `graph.getIDManager().toVertexId(long)` to convert your id first. Once this is enabled, you have to provide vertex id when creating new vertices. EXPERT FEATURE - USE WITH GREAT CARE. | Boolean | false | GLOBAL_OFFLINE | | graph.storage-version | The version of JanusGraph storage schema with which this database was created. Automatically set on first start of graph. Should only ever be changed if upgrading to a new major release version of JanusGraph that contains schema changes | String | (no default value) | FIXED | diff --git a/janusgraph-core/src/main/java/org/janusgraph/graphdb/configuration/GraphDatabaseConfiguration.java b/janusgraph-core/src/main/java/org/janusgraph/graphdb/configuration/GraphDatabaseConfiguration.java old mode 100644 new mode 100755 index 7c51ad087a..29f96555f7 --- a/janusgraph-core/src/main/java/org/janusgraph/graphdb/configuration/GraphDatabaseConfiguration.java +++ b/janusgraph-core/src/main/java/org/janusgraph/graphdb/configuration/GraphDatabaseConfiguration.java @@ -1124,6 +1124,19 @@ public boolean apply(@Nullable String s) { public static final ConfigNamespace LOG_NS = new ConfigNamespace(GraphDatabaseConfiguration.ROOT_NS,"log","Configuration options for JanusGraph's logging system",true); + public static final ConfigOption MANAGEMENT_AUTO_CLOSE_STALE_INSTANCES = new ConfigOption<>(GRAPH_NS, + "management-auto-close-stale-instances", + "Whether to automatically force-close instances that fail to acknowledge schema changes within the " + + "configured timeout (graph.management-ack-timeout). When enabled, unresponsive instances are removed " + + "from the registration store so they no longer block schema status transitions.", + ConfigOption.Type.MASKABLE, false); + + public static final ConfigOption MANAGEMENT_ACK_TIMEOUT = new ConfigOption<>(GRAPH_NS, + "management-ack-timeout", + "Maximum time to wait for an instance to acknowledge a schema change before treating it as stale. " + + "Only effective when graph.management-auto-close-stale-instances is enabled.", + ConfigOption.Type.MASKABLE, Duration.ofSeconds(120)); + public static final String MANAGEMENT_LOG = "janusgraph"; public static final String TRANSACTION_LOG = "tx"; public static final String USER_LOG = "user"; diff --git a/janusgraph-core/src/main/java/org/janusgraph/graphdb/database/StandardJanusGraph.java b/janusgraph-core/src/main/java/org/janusgraph/graphdb/database/StandardJanusGraph.java old mode 100644 new mode 100755 index 05692129c7..cc04a46dda --- a/janusgraph-core/src/main/java/org/janusgraph/graphdb/database/StandardJanusGraph.java +++ b/janusgraph-core/src/main/java/org/janusgraph/graphdb/database/StandardJanusGraph.java @@ -115,6 +115,7 @@ import java.io.Closeable; import java.io.IOException; +import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.Collection; @@ -134,6 +135,8 @@ import javax.script.Bindings; import javax.script.ScriptException; +import static org.janusgraph.graphdb.configuration.GraphDatabaseConfiguration.MANAGEMENT_ACK_TIMEOUT; +import static org.janusgraph.graphdb.configuration.GraphDatabaseConfiguration.MANAGEMENT_AUTO_CLOSE_STALE_INSTANCES; import static org.janusgraph.graphdb.configuration.GraphDatabaseConfiguration.REGISTRATION_TIME; import static org.janusgraph.graphdb.configuration.GraphDatabaseConfiguration.REPLACE_INSTANCE_IF_EXISTS; import static org.janusgraph.graphdb.configuration.GraphDatabaseConfiguration.SCRIPT_EVAL_ENABLED; @@ -254,7 +257,9 @@ public StandardJanusGraph(GraphDatabaseConfiguration configuration) { globalConfig.set(REGISTRATION_TIME, times.getTime(), uniqueInstanceId); Log managementLog = backend.getSystemMgmtLog(); - managementLogger = new ManagementLogger(this, managementLog, schemaCache, this.times); + Duration ackTimeout = configuration.getConfiguration().get(MANAGEMENT_ACK_TIMEOUT); + boolean autoCloseStaleInstances = configuration.getConfiguration().get(MANAGEMENT_AUTO_CLOSE_STALE_INSTANCES); + managementLogger = new ManagementLogger(this, managementLog, schemaCache, this.times, ackTimeout, autoCloseStaleInstances); managementLog.registerReader(ReadMarker.fromNow(), managementLogger); shutdownHook = new ShutdownThread(this); diff --git a/janusgraph-core/src/main/java/org/janusgraph/graphdb/database/management/ManagementLogger.java b/janusgraph-core/src/main/java/org/janusgraph/graphdb/database/management/ManagementLogger.java old mode 100644 new mode 100755 index 1ccc0363a1..fb0cf5693a --- a/janusgraph-core/src/main/java/org/janusgraph/graphdb/database/management/ManagementLogger.java +++ b/janusgraph-core/src/main/java/org/janusgraph/graphdb/database/management/ManagementLogger.java @@ -37,6 +37,7 @@ import org.slf4j.LoggerFactory; import java.time.Duration; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; @@ -71,11 +72,21 @@ public class ManagementLogger implements MessageReader { private final AtomicInteger evictionTriggerCounter = new AtomicInteger(0); private final ConcurrentMap evictionTriggerMap = new ConcurrentHashMap<>(); + private final Duration ackTimeout; + private final boolean autoCloseStaleInstances; + public ManagementLogger(StandardJanusGraph graph, Log sysLog, SchemaCache schemaCache, TimestampProvider times) { + this(graph, sysLog, schemaCache, times, Duration.ofSeconds(120), true); + } + + public ManagementLogger(StandardJanusGraph graph, Log sysLog, SchemaCache schemaCache, TimestampProvider times, + Duration ackTimeout, boolean autoCloseStaleInstances) { this.graph = graph; this.schemaCache = schemaCache; this.sysLog = sysLog; this.times = times; + this.ackTimeout = ackTimeout; + this.autoCloseStaleInstances = autoCloseStaleInstances; Preconditions.checkNotNull(times); } @@ -163,11 +174,13 @@ private class EvictionTrigger { final List> updatedTypeTriggers; final StandardJanusGraph graph; final Set instancesToBeAcknowledged; + final long createdAtMillis; private EvictionTrigger(long evictionId, List> updatedTypeTriggers, StandardJanusGraph graph) { this.graph = graph; this.evictionId = evictionId; this.updatedTypeTriggers = updatedTypeTriggers; + this.createdAtMillis = System.currentTimeMillis(); final JanusGraphManagement mgmt = graph.openManagement(); this.instancesToBeAcknowledged = ConcurrentHashMap.newKeySet(); instancesToBeAcknowledged.addAll(((ManagementSystem) mgmt).getOpenInstancesInternal()); @@ -204,8 +217,34 @@ int removeDroppedInstances() { final String instanceRemovedMsg = "Instance [{}] was removed list of open instances and therefore dropped from list of instances to be acknowledged."; instancesToBeAcknowledged.stream().filter(it -> !updatedInstances.contains(it)).filter(instancesToBeAcknowledged::remove).forEach(it -> log.debug(instanceRemovedMsg, it)); mgmt.rollback(); + + if (autoCloseStaleInstances && !instancesToBeAcknowledged.isEmpty() + && (System.currentTimeMillis() - createdAtMillis) > ackTimeout.toMillis()) { + log.warn("ACK timeout ({}s) exceeded for eviction [{}]. Force-closing unresponsive instances: {}", + ackTimeout.getSeconds(), evictionId, instancesToBeAcknowledged); + forceCloseStaleInstances(new HashSet<>(instancesToBeAcknowledged)); + instancesToBeAcknowledged.clear(); + } + return instancesToBeAcknowledged.size(); } + + private void forceCloseStaleInstances(Set staleInstances) { + try { + final JanusGraphManagement mgmt = graph.openManagement(); + for (String instanceId : staleInstances) { + try { + mgmt.forceCloseInstance(instanceId); + log.warn("Force-closed stale instance [{}]", instanceId); + } catch (IllegalArgumentException e) { + log.debug("Could not force-close instance [{}]: {}", instanceId, e.getMessage()); + } + } + mgmt.commit(); + } catch (Exception e) { + log.error("Failed to force-close stale instances", e); + } + } } private class SendAckOnTxClose implements Runnable { diff --git a/janusgraph-core/src/main/java/org/janusgraph/graphdb/olap/job/IndexRepairJob.java b/janusgraph-core/src/main/java/org/janusgraph/graphdb/olap/job/IndexRepairJob.java old mode 100644 new mode 100755 index 43144a066c..c337b0a9bf --- a/janusgraph-core/src/main/java/org/janusgraph/graphdb/olap/job/IndexRepairJob.java +++ b/janusgraph-core/src/main/java/org/janusgraph/graphdb/olap/job/IndexRepairJob.java @@ -229,7 +229,11 @@ public void process(JanusGraphVertex vertex, ScanMetrics metrics) { @Override public void workerIterationEnd(final ScanMetrics metrics) { try { - if (index instanceof JanusGraphIndex) { + // If writeTx is `null` it means that `workerIterationStart` failed previously and write transaction was + // never assigned. In this case, instead of raising NPE here we skip this block which will + // properly propagate root cause exception instead of NPE exception. + // See the first `catch` block in `StandardScannerExecutor.run`. + if (index instanceof JanusGraphIndex && writeTx != null) { BackendTransaction mutator = writeTx.getTxHandle(); IndexType indexType = managementSystem.getSchemaVertex(index).asIndexType(); if (indexType.isMixedIndex() && documentsPerStore.size() > 0) {