# --- Service and cluster status ---
# List systemd service units and keep the lines mentioning Ceph.
systemctl --type=service | grep Ceph
# State of the manager, monitor and OSD systemd targets on this host.
systemctl status ceph-mgr.target
systemctl status ceph-mon.target
systemctl status ceph-osd.target
# One-shot cluster status summary ("ceph -s" is the short form of "ceph status").
sudo ceph -s
# Like -s, but keeps running and streams cluster events until interrupted (Ctrl-C).
sudo ceph -w
sudo ceph status
# Cluster-wide and per-pool space usage.
sudo ceph df
# Monitor summary, then the full monitor map.
sudo ceph mon stat
sudo ceph mon dump
# Monitor quorum state, pretty-printed as JSON.
ceph quorum_status -f json-pretty
# Talk to a daemon through its local admin socket: "mon.pm1" is the monitor
# named pm1, so these are meant to be run on the host where that monitor runs.
ceph daemon mon.pm1 help
ceph daemon mon.pm1 mon_status
# Different way to see the status
# Check filename with
ls /var/run/ceph/ceph-*.asok   # or type "ls /var/run/ceph/ceph-" and press TAB to complete
# eg ceph-osd.0.asok
ceph daemon /var/run/ceph/ceph-osd.0.asok status
ceph osd stat
ceph osd dump
sudo ceph osd df
ceph pg dump
ceph pg map 128
ceph pg stat
# Checking remapping of PGs
ceph pg dump | grep remap
# Check performance of the disks
# On the ssd side:
iostat -xNmy 1 5
# On the ceph side (maybe this command does not exist):
# https://docs.ceph.com/en/latest/dev/perf_counters/
# Health
ceph health
ceph health detail
# Get the map of Placement Groups
ceph pg dump > /tmp/pg_dump.1
# Others
ceph osd tree
ceph pg map {pg-id}   # a PG id is required, in the form <pool>.<pg>
# Debug cephx authentication
ceph -s --debug_ms=1 --debug_monc=25 --debug_auth=25
# Adding lost keyring
In /etc/pve/ceph/ceph.conf (or /etc/ceph/ceph.conf), under [global], set:
auth_client_required = none
auth_cluster_required = none
auth_service_required = none
Reboot
Then at the prompt type
ceph auth import -i /etc/pve/priv/ceph.client.admin.keyring
Remove the three auth_*_required = none lines added to the config file above
Reboot
# Paste this as one command in the shell to get PGs per OSD
# Print a table of placement-group counts: one row per OSD, one column per
# pool, plus row and column sums. Counts are taken from the UP set of every
# PG row in "ceph pg dump".
ceph pg dump | awk '
BEGIN {
  IGNORECASE = 1                           # gawk: match the header in either case
  PROCINFO["sorted_in"] = "@ind_num_asc"   # gawk: list pools and OSDs in numeric order
                                           # (both settings are harmless no-ops in other awks)
}
# Header row: remember the position of the UP column. The scan is bounded by
# NF so a header without an UP column cannot loop forever.
/^PG_STAT/ {
  upcol = 0
  for (c = 1; c <= NF; c++) if ($c == "UP") { upcol = c; break }
}
# PG rows start with "<pool>.<pg>".
/^[0-9a-f]+\.[0-9a-f]+/ {
  if (!upcol) next                         # no usable header seen: do not guess
  # The UP set is printed as a bracketed list such as [1,2,3]. When a release
  # prints STATE_STAMP as two fields ("date time"), the data sits one field to
  # the right of the header position; otherwise it sits exactly under it.
  c = upcol; if ($c !~ /^\[/) c++
  pool = $1; sub(/\..*/, "", pool); poollist[pool] = 0
  # Reduce the list to space-separated OSD ids and count each one once.
  up = $c; gsub(/[^0-9]+/, " ", up); n = split(up, osds, " ")
  for (k = 1; k <= n; k++) { array[osds[k], pool]++; osdlist[osds[k]] }
}
END {
  if (!upcol) print "pg dump header has no UP column; no PGs were counted" > "/dev/stderr"
  printf("\n")
  printf("pool :\t"); for (p in poollist) printf("%s\t", p); printf("| SUM \n")
  for (p in poollist) printf("--------"); printf("----------------\n")
  for (o in osdlist) {
    printf("osd.%i\t", o); sum = 0
    for (p in poollist) { printf("%i\t", array[o, p]); sum += array[o, p]; sumpool[p] += array[o, p] }
    printf("| %i\n", sum)
  }
  for (p in poollist) printf("--------"); printf("----------------\n")
  printf("SUM :\t"); for (p in poollist) printf("%s\t", sumpool[p]); printf("|\n")
}'
It takes more than the steps below; this is a starting point: https://access.redhat.com/documentation/zh-tw/red_hat_ceph_storage/5/html/administration_guide/understanding-process-management-for-ceph#powering-down-and-rebooting-a-red-hat-ceph-storage-cluster_admin
If you use the Ceph File System (CephFS), bring down the CephFS cluster:
ceph fs set cephfs max_mds 1
ceph fs fail cephfs
ceph status
ceph fs set cephfs joinable false
# Freeze the cluster before powering nodes off, so the planned outage does not
# trigger data movement. Each flag stays set until it is removed again with
# "ceph osd unset <flag>".
# noout: do not mark stopped OSDs "out" (which would start re-replication).
ceph osd set noout
# norecover / norebalance / nobackfill: suspend recovery, rebalancing and backfill.
ceph osd set norecover
ceph osd set norebalance
ceph osd set nobackfill
# nodown: do not mark OSDs "down" while they are being stopped.
ceph osd set nodown
# pause: stop client reads and writes.
ceph osd set pause
# If the MDS and Ceph Object Gateway nodes are on their own dedicated nodes, power them off.
# Shut down the OSD nodes one by one
systemctl stop ceph-499829b4-832f-11eb-8d6d-001a4a000635@osd.2.service
# Shut down the monitor nodes one by one:
systemctl stop ceph-499829b4-832f-11eb-8d6d-001a4a000635@mon.host01.service
# Reboot
systemctl start ceph-499829b4-832f-11eb-8d6d-001a4a000635@mon.host01.service
systemctl start ceph-499829b4-832f-11eb-8d6d-001a4a000635@osd.2.service
sudo ceph -s
ceph osd unset noout
ceph osd unset norecover
ceph osd unset norebalance
ceph osd unset nobackfill
ceph osd unset nodown
ceph osd unset pause
# If you use the Ceph File System (CephFS), bring the CephFS cluster back up by setting the joinable flag to true
ceph fs set cephfs joinable true
# 1. If a restart of the server is needed
ceph osd set noout
ceph osd set norecover
ceph osd set norebalance
ceph osd set nobackfill
ceph osd set nodown
ceph osd set pause
# 2. Install it physically
# 3. If restarted
ceph osd unset noout
ceph osd unset norecover
ceph osd unset norebalance
ceph osd unset nobackfill
ceph osd unset nodown
ceph osd unset pause
# 4. Clear the disk (run commands in the shell)
ceph-volume lvm zap /dev/sd[X] --destroy
# 5. Create a new osd
pveceph osd create /dev/sd[X]
# 6. Set the weight (temporarily) to 0 for the old disks ([Y] = OSD number)
ceph osd crush reweight osd.[Y] 0
# 7. When all data is moved.
ceph osd out [Y]
# 8. When it is time to remove the disk. Stop the osd service
systemctl stop ceph-osd@[Y].service
# List profiles
ceph osd erasure-code-profile ls
# Display profile settings (replace {name} with your profile name)
ceph osd erasure-code-profile get {name}
# Display profile settings of the default profile
ceph osd erasure-code-profile get default
# configure (in the config file) the crush-failure-domain to impact how the data is spread
# Choose to survive from outage of: osd, host, rack. Example, survive 1 server/host outage:
crush-failure-domain=host
# K and M
# K: data chunks, M: coding chunks
# Sustain loss of 2 OSD (M = 2), Spread over 5 OSD => K = 3, as K+M=5 (the 5 OSD spread)
# Overhead
# The overhead factor (space amplification) of an erasure-coded pool is (k+m) / k
m=1 m=2 m=3 m=4 m=5 m=6 m=7 m=8 m=9 m=10 m=11
k=1 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 12.00
k=2 1.50 2.00 2.50 3.00 3.50 4.00 4.50 5.00 5.50 6.00 6.50
k=3 1.33 1.67 2.00 2.33 2.67 3.00 3.33 3.67 4.00 4.33 4.67
k=4 1.25 1.50 1.75 2.00 2.25 2.50 2.75 3.00 3.25 3.50 3.75
k=5 1.20 1.40 1.60 1.80 2.00 2.20 2.40 2.60 2.80 3.00 3.20
k=6 1.17 1.33 1.50 1.67 1.83 2.00 2.17 2.33 2.50 2.67 2.83
k=7 1.14 1.29 1.43 1.57 1.71 1.86 2.00 2.14 2.29 2.43 2.57
k=8 1.13 1.25 1.38 1.50 1.63 1.75 1.88 2.00 2.13 2.25 2.38
k=9 1.11 1.22 1.33 1.44 1.56 1.67 1.78 1.89 2.00 2.11 2.22
k=10 1.10 1.20 1.30 1.40 1.50 1.60 1.70 1.80 1.90 2.00 2.10
k=11 1.09 1.18 1.27 1.36 1.45 1.55 1.64 1.73 1.82 1.91 2.00