Skip to content

Commit a0dd999

Browse files
welkinhemonrog2
andauthored
APIC DB Size check (#266)
* APIC DB size check + pytests * run_cmd function + pytests --------- Co-authored-by: Gabriel <gmonroy@cisco.com>
1 parent 22104d7 commit a0dd999

7 files changed

Lines changed: 744 additions & 5 deletions

File tree

aci-preupgrade-validation-script.py

Lines changed: 94 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1260,6 +1260,28 @@ def icurl(apitype, query, page_size=100000):
12601260
return total_imdata
12611261

12621262

1263+
def run_cmd(cmd, splitlines=True):
    """
    Execute a command through the shell and hand back its standard output.

    :param cmd: Command line as a string, or a list of tokens which is joined
                with single spaces before being executed.
    :param splitlines: When True the decoded output is returned as a list of
                       lines; when False it is returned as one string.
    :return: The command's stdout decoded as UTF-8.
    :raises subprocess.CalledProcessError: logged and re-raised when the
            command exits with a non-zero status.
    """
    command = ' '.join(cmd) if isinstance(cmd, list) else cmd
    try:
        log.info('run_cmd = ' + command)
        output = subprocess.check_output(command, shell=True).decode('utf-8')
        log.debug('response: ' + str(output))
    except subprocess.CalledProcessError as e:
        # Record the failing command before handing the error to the caller.
        log.error("Command '%s' failed with error: %s", command, str(e))
        raise e
    return output.splitlines() if splitlines else output
1283+
1284+
12631285
def get_credentials():
12641286
prints('To use a non-default Login Domain, enter apic#DOMAIN\\\\USERNAME')
12651287
while True:
@@ -3166,6 +3188,7 @@ def cimc_compatibilty_check(tversion, **kwargs):
31663188
return Result(result=result, headers=headers, data=data, recommended_action=recommended_action, doc_url=doc_url)
31673189

31683190

3191+
# Subprocess Check - icurl
31693192
@check_wrapper(check_title="Intersight Device Connector upgrade status")
31703193
def intersight_upgrade_status_check(**kwargs):
31713194
result = FAIL_UF
@@ -3518,6 +3541,7 @@ def internal_vlanpool_check(tversion, **kwargs):
35183541
return Result(result=result, headers=headers, data=data, recommended_action=recommended_action, doc_url=doc_url)
35193542

35203543

3544+
# Subprocess check - openssl
35213545
@check_wrapper(check_title="APIC CA Cert Validation")
35223546
def apic_ca_cert_validation(**kwargs):
35233547
result = FAIL_O
@@ -3549,16 +3573,12 @@ def apic_ca_cert_validation(**kwargs):
35493573
'''
35503574
# Re-run cleanup for Issue #120
35513575
if os.path.exists(cert_gen_filename):
3552-
log.debug('CA CHECK file found and removed: ' + ''.join(cert_gen_filename))
35533576
os.remove(cert_gen_filename)
35543577
if os.path.exists(key_pem):
3555-
log.debug('CA CHECK file found and removed: ' + ''.join(key_pem))
35563578
os.remove(key_pem)
35573579
if os.path.exists(csr_pem):
3558-
log.debug('CA CHECK file found and removed: ' + ''.join(csr_pem))
35593580
os.remove(csr_pem)
35603581
if os.path.exists(sign):
3561-
log.debug('CA CHECK file found and removed: ' + ''.join(sign))
35623582
os.remove(sign)
35633583

35643584
with open(cert_gen_filename, 'w') as f:
@@ -5209,6 +5229,74 @@ def isis_database_byte_check(tversion, **kwargs):
52095229
return Result(result=result, headers=headers, data=data, recommended_action=recommended_action, doc_url=doc_url)
52105230

52115231

5232+
# Subprocess check - cat + acidiag
5233+
@check_wrapper(check_title='APIC Database Size')
def apic_database_size_check(cversion, **kwargs):
    """
    Flag unusually large APIC databases before an upgrade.

    Two collection methods are used depending on the running version:

    * Older than 6.1(3a): for each DME in a fixed list, read
      /debug/<apic>/<dme>/mitmocounters/mo on each selected APIC, take the
      first four lines of the count-sorted output and flag every class whose
      object count is above 1.5 million.
    * 6.1(3a) and later: run `acidiag dbsize --topshard` for each selected
      APIC and flag every reported shard whose size is 5 GiB or more.

    Only APICs whose infraWiNode `operSt` is 'available' are considered.

    :param cversion: Running version object; only `older_than()` is called.
    :param kwargs: Not used in the body of this check.
    :return: Result with FAIL_UF when anything was flagged, PASS otherwise,
             or MANUAL when the acidiag command fails.
    """
    result = PASS
    headers = ["APIC ID", "DME", "Class Name", "Object Count"]
    data = []
    recommended_action = 'Contact Cisco TAC to investigate all flagged high object counts'
    doc_url = 'https://datacenter.github.io/ACI-Pre-Upgrade-Validation-Script/validations/#apic-database-size'

    object_count_limit = 1000 * 1000 * 1.5  # objects per class
    shard_size_limit = 1073741824 * 5  # 5 GiB in bytes
    dme_svc_list = ['vmmmgr', 'policymgr', 'eventmgr', 'policydist']
    flagged = {}

    # The same APIC is reported once per observing node; keep the first name seen per id.
    apic_id_to_name = {}
    for apic in icurl('class', 'infraWiNode.json'):
        attrs = apic['infraWiNode']['attributes']
        if attrs['operSt'] == 'available':
            apic_id_to_name.setdefault(attrs['id'], attrs['nodeName'])

    # For 3 APIC cluster, only check APIC Id 2 due to static local shards (R0).
    # If id 2 is not among the available APICs, fall back to checking all of
    # them instead of failing with a KeyError.
    if len(apic_id_to_name) == 3 and "2" in apic_id_to_name:
        apic_id_to_name = {"2": apic_id_to_name["2"]}

    if cversion.older_than("6.1(3a)"):
        for dme in dme_svc_list:
            for node_id, apic_hostname in apic_id_to_name.items():
                collect_stats_cmd = 'cat /debug/' + apic_hostname + '/' + dme + '/mitmocounters/mo | grep -v ALL | sort -rn -k3'
                top_class_stats = run_cmd(collect_stats_cmd, splitlines=True)

                # Output is sorted by count (descending); only the top entries matter.
                for svc_stats in top_class_stats[:4]:
                    if ":" not in svc_stats:
                        continue
                    fields = svc_stats.split(":")
                    class_name = fields[0].strip()
                    mo_count = fields[1].strip()
                    if int(mo_count) > object_count_limit:
                        flagged[class_name] = {"id": node_id, "dme": dme, "checked_val": mo_count}
    else:
        headers = ["APIC ID", "DME", "Shard", "Size"]
        recommended_action = 'Contact Cisco TAC to investigate all flagged large DB sizes'
        for node_id in apic_id_to_name:
            collect_stats_cmd = "acidiag dbsize --topshard --apic " + node_id + " -f json"
            try:
                collect_shard_stats_data = run_cmd(collect_stats_cmd, splitlines=False)
            except subprocess.CalledProcessError:
                return Result(result=MANUAL, msg="acidiag command not available to current user")
            top_db_stats = json.loads(collect_shard_stats_data)

            for db_stats in top_db_stats['dbs']:
                if int(db_stats['size_b']) >= shard_size_limit:
                    flagged[db_stats['shard_replica']] = {
                        "id": node_id,
                        "dme": db_stats['dme'],
                        "checked_val": db_stats['size_h'],
                    }

    # `flagged` is keyed by class name / shard, so each one is reported once.
    for unique_key, details in flagged.items():
        data.append([details['id'], details['dme'], unique_key, details['checked_val']])

    if data:
        result = FAIL_UF
    return Result(result=result, headers=headers, data=data, recommended_action=recommended_action, doc_url=doc_url)
5299+
52125300
# ---- Script Execution ----
52135301

52145302
def parse_args(args):
@@ -5345,7 +5433,6 @@ def get_checks(api_only, debug_function):
53455433
telemetryStatsServerP_object_check,
53465434
llfc_susceptibility_check,
53475435
internal_vlanpool_check,
5348-
apic_ca_cert_validation,
53495436
fabricdomain_name_check,
53505437
sup_hwrev_check,
53515438
sup_a_high_memory_check,
@@ -5370,13 +5457,15 @@ def get_checks(api_only, debug_function):
53705457
conn_checks = [
53715458
# General
53725459
apic_version_md5_check,
5460+
apic_database_size_check,
53735461

53745462
# Faults
53755463
standby_apic_disk_space_check,
53765464
apic_ssd_check,
53775465

53785466
# Bugs
53795467
observer_db_size_check,
5468+
apic_ca_cert_validation,
53805469

53815470
]
53825471
if debug_function:

docs/docs/validations.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Items | This Script
3535
[Post Upgrade CallBack Integrity][g15] | :white_check_mark: | :no_entry_sign: | :no_entry_sign:
3636
[6.0(2)+ requires 32 and 64 bit switch images][g16] | :white_check_mark: | :no_entry_sign: | :no_entry_sign:
3737
[Fabric Link Redundancy][g17] | :white_check_mark: | :no_entry_sign: | :no_entry_sign:
38+
[APIC Database Size][g18] | :white_check_mark: | :no_entry_sign: | :no_entry_sign:
3839

3940
[g1]: #compatibility-target-aci-version
4041
[g2]: #compatibility-cimc-version
@@ -53,6 +54,7 @@ Items | This Script
5354
[g15]: #post-upgrade-callback-integrity
5455
[g16]: #602-requires-32-and-64-bit-switch-images
5556
[g17]: #fabric-link-redundancy
57+
[g18]: #apic-database-size
5658

5759
### Fault Checks
5860
Items | Faults | This Script | APIC built-in | Pre-Upgrade Validator (App)
@@ -471,6 +473,24 @@ When upgrading the switches, traffic traversing a Leaf Switch that is connected
471473

472474
To prevent this scenario, ensure that every leaf is connected to at least two Spine Switches (or tier-1 Leaf Switches). This check will alert if any Leaf Switches are found to only be connected to a single Spine Switch (or tier-1 Leaf Switch).
473475

476+
### APIC Database Size
477+
478+
APIC Database Shard sizing is generally expected to remain below 5G in steady-state conditions, even in the case of high scale setups. Database shard sizing directly influences the Upgrade Workflow timing, as the database conversion phase directly involves parsing through the entire DB contents. Large shards lead to longer upgrade timings, which in some cases have been seen to lead to upgrade failures. In most cases, a large shard size has been mapped to an underlying condition that needs to be addressed.
479+
480+
The script performs 2 different checks depending on the version you are running.
481+
482+
For current versions below 6.1(3):
483+
484+
- The script checks all APICs' class's object count for a subset of services (DMEs) via a file scan.
485+
- If the count is found to be above `1.5*1000*1000` (1.5 million), then that class will be flagged for further investigation.
486+
487+
For current versions 6.1(3f) and later:
488+
489+
- 6.1(3f) introduces a new `acidiag dbsize` command which displays the top largest DB sizes.
490+
- The script will utilize the new command and flag any DB shard which has surpassed 5G.
491+
492+
In either scenario, contact TAC to collect a database dump of the flagged DME(s) and shard(s) for further analysis.
493+
474494

475495
## Fault Check Details
476496

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
{
2+
"totalCount": "9",
3+
"imdata": [
4+
{
5+
"infraWiNode": {
6+
"attributes": {
7+
"dn": "topology/pod-1/node-1/av/node-1",
8+
"id": "1",
9+
"nodeName": "apic1",
10+
"operSt": "available"
11+
}
12+
}
13+
},
14+
{
15+
"infraWiNode": {
16+
"attributes": {
17+
18+
"dn": "topology/pod-1/node-1/av/node-2",
19+
"id": "2",
20+
"nodeName": "apic2",
21+
"operSt": "available"
22+
}
23+
}
24+
},
25+
{
26+
"infraWiNode": {
27+
"attributes": {
28+
"dn": "topology/pod-1/node-1/av/node-3",
29+
"id": "3",
30+
"nodeName": "apic3",
31+
"operSt": "available"
32+
}
33+
}
34+
},
35+
{
36+
"infraWiNode": {
37+
"attributes": {
38+
"dn": "topology/pod-1/node-2/av/node-1",
39+
"id": "1",
40+
"nodeName": "apic1",
41+
"operSt": "available"
42+
}
43+
}
44+
},
45+
{
46+
"infraWiNode": {
47+
"attributes": {
48+
"dn": "topology/pod-1/node-2/av/node-2",
49+
"id": "2",
50+
"nodeName": "apic2",
51+
"operSt": "available"
52+
}
53+
}
54+
},
55+
{
56+
"infraWiNode": {
57+
"attributes": {
58+
"dn": "topology/pod-1/node-2/av/node-3",
59+
"id": "3",
60+
"nodeName": "apic3",
61+
"operSt": "available"
62+
63+
}
64+
}
65+
},
66+
{
67+
"infraWiNode": {
68+
"attributes": {
69+
"dn": "topology/pod-1/node-3/av/node-3",
70+
"id": "3",
71+
"nodeName": "apic3",
72+
"operSt": "available"
73+
}
74+
}
75+
},
76+
{
77+
"infraWiNode": {
78+
"attributes": {
79+
"dn": "topology/pod-1/node-3/av/node-1",
80+
"id": "1",
81+
"nodeName": "apic1",
82+
"operSt": "available"
83+
}
84+
}
85+
},
86+
{
87+
"infraWiNode": {
88+
"attributes": {
89+
"dn": "topology/pod-1/node-3/av/node-2",
90+
"id": "2",
91+
"nodeName": "apic2",
92+
"operSt": "available"
93+
}
94+
}
95+
}
96+
]
97+
}

0 commit comments

Comments
 (0)