cloudstack/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java at 9cebe7223b9bbb8d7c4dfe1afd1f7a75663e7c64 · apache/cloudstack · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
package com.cloud.ha;

import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Cluster;

import com.cloud.deploy.DeploymentPlanner;
import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.component.Manager;
import com.cloud.vm.VMInstanceVO;
import org.apache.cloudstack.framework.config.ConfigKey;

import java.util.List;

/**
 * HighAvailabilityManager checks to make sure the VMs are running fine.
 */
public interface HighAvailabilityManager extends Manager {

    List<StoragePoolType> LIBVIRT_STORAGE_POOL_TYPES_WITH_HA_SUPPORT = List.of(StoragePoolType.NetworkFilesystem, StoragePoolType.SharedMountPoint);

    ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
        "Force High-Availability to happen even if the VM says no.", true, Cluster);

    ConfigKey<Integer> HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5",
        "The number of High-Availability worker threads.", true, Cluster);

    ConfigKey<Integer> InvestigateRetryInterval = new ConfigKey<>("Advanced", Integer.class, "investigate.retry.interval",
        "60", "The time (in seconds) between VM pings when the agent is disconnected.", true, Cluster);

    ConfigKey<Integer> MigrateRetryInterval = new ConfigKey<>("Advanced", Integer.class, "migrate.retry.interval",
        "120", "The time (in seconds) between migration retries.", true, Cluster);

    ConfigKey<Integer> RestartRetryInterval = new ConfigKey<>("Advanced", Integer.class, "restart.retry.interval",
        "600", "The time (in seconds) between retries to restart a VM.", true, Cluster);

    ConfigKey<Integer> StopRetryInterval = new ConfigKey<>("Advanced", Integer.class, "stop.retry.interval",
        "600", "The time in seconds between retries to stop or destroy a VM.", true, Cluster);

    ConfigKey<Long> TimeBetweenCleanup = new ConfigKey<>("Advanced", Long.class,
        "time.between.cleanup", "86400", "The time in seconds to wait before the"
        + " cleanup thread runs for the different HA-Worker-Threads. The cleanup thread finds all the work items "
        + "that were successful and is now ready to be purged from the database (table: op_ha_work).",
        true, Cluster);

    ConfigKey<Integer> MaxRetries = new ConfigKey<>("Advanced", Integer.class, "max.retries",
        "5", "The number of times to try a restart for the different Work-Types: "
        + "Migrating - VMs off of a host, Destroy - a VM, Stop - a VM for storage pool migration purposes,"
        + " CheckStop - checks if a VM has been stopped, ForceStop - force a VM to stop even if the "
        + "states don't allow it, Destroy - a VM and HA - restart a VM.", true, Cluster);

    ConfigKey<Long> TimeToSleep = new ConfigKey<>("Advanced", Long.class, "time.to.sleep",
        "60", "The time in seconds to sleep before checking the database (table: op_ha_work) "
        + "for new working types (Migration, Stop, CheckStop, ForceStop, Destroy and HA), if no work items are found.",
        true, Cluster);

    ConfigKey<Long> TimeBetweenFailures = new ConfigKey<>("Advanced", Long.class,
        "time.between.failures", "3600", "The time in seconds before try to cleanup all the VMs"
        + " which are registered for the HA event that were successful and are now ready to be purged.",
        true, Cluster);

    public static final ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
            "Proceed fencing the host even the heartbeat failed for only one storage pool", false, ConfigKey.Scope.Zone);

    public enum WorkType {
        Migration,  // Migrating VMs off of a host.
        Stop,       // Stops a VM for storage pool migration purposes.  This should be obsolete now.
        CheckStop,  // Checks if a VM has been stopped.
        ForceStop,  // Force a VM to stop even if the states don't allow it.  Use this only if you know the VM is stopped on the physical hypervisor.
        Destroy,    // Destroy a VM.
        HA;         // Restart a VM.
    }

    enum ReasonType {
        Unknown,
        HostMaintenance,
        HostDown,
        HostDegraded;
    }

    enum Step {
        Scheduled, Investigating, Fencing, Stopping, Restarting, Migrating, Cancelled, Done, Error,
    }

    /**
     * Investigate why a host has disconnected and migrate the VMs on it
     * if necessary.
     *
     * @param hostId - the id of the host that has disconnected.
     */
    Status investigate(long hostId);

    /**
     * Restart a vm that has gone away due to various reasons.  Whether a
     * VM is restarted depends on various reasons.
     *   1. Is the VM really dead.  This method will try to find out.
     *   2. Is the VM HA enabled?  If not, the VM is simply stopped.
     *
     * All VMs that enter HA mode is not allowed to be operated on until it
     * has been determined that the VM is dead.
     *
     * @param vm the vm that has gone away.
     * @param investigate must be investigated before we do anything with this vm.
     */
    void scheduleRestart(VMInstanceVO vm, boolean investigate);
    void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType);

    void cancelDestroy(VMInstanceVO vm, Long hostId);

    boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType);

    /**
     * Schedule restarts for all vms running on the host.
     * @param host host.
     * @param investigate whether to investigate
     * @param reasonType reason for HA work
     */
    void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType);

    /**
     * Schedule the vm for migration.
     *
     * @param vm
     * @return true if schedule worked.
     */
    boolean scheduleMigration(VMInstanceVO vm);
    boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType);

    List<VMInstanceVO> findTakenMigrationWork();

    /**
     * Schedules a work item to stop a VM.  This method schedules a work
     * item to do one of three things.
     *
     * 1. Perform a regular stop of a VM: WorkType.Stop
     * 2. Perform a force stop of a VM: WorkType.ForceStop
     * 3. Check if a VM has been stopped: WorkType.CheckStop
     *
     * @param vm virtual machine to stop.
     * @param hostId the id of the host the virtual machine is on.
     * @param type which type of stop is requested.
     */
    boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
    boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType);

    void cancelScheduledMigrations(HostVO host);

    boolean hasPendingHaWork(long vmId);

    boolean hasPendingMigrationsWork(long vmId);
    /**
     * @return
     */
    String getHaTag();

    DeploymentPlanner getHAPlanner();
    int expungeWorkItemsByVmList(List<Long> vmIds, Long batchSize);
}