Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 33 additions & 13 deletions aci-preupgrade-validation-script.py
Original file line number Diff line number Diff line change
Expand Up @@ -2974,7 +2974,7 @@ def scalability_faults_check(**kwargs):


@check_wrapper(check_title="APIC Disk Space Usage (F1527, F1528, F1529 equipment-full)")
def apic_disk_space_faults_check(cversion, **kwargs):
def apic_disk_space_faults_check(cversion, tversion, **kwargs):
result = FAIL_UF
headers = ['Fault', 'Pod', 'Node', 'Mount Point', 'Current Usage %', 'Recommended Action']
data = []
Expand All @@ -2983,29 +2983,49 @@ def apic_disk_space_faults_check(cversion, **kwargs):
doc_url = "https://datacenter.github.io/ACI-Pre-Upgrade-Validation-Script/validations/#apic-disk-space-usage"
recommended_action = {
'/firmware': 'Remove unneeded images',
'/techsupport': 'Remove unneeded techsupports/cores'
'/techsupport': 'Remove unneeded techsupports/cores',
'/tmp': 'Contact Cisco TAC for assistance. The /tmp directory may need cleanup or the upgrade may require special handling.'
}
default_action = 'Contact Cisco TAC.'
if cversion.same_as('4.0(1h)') or cversion.older_than('3.2(6i)'):
default_action += ' A typical issue is CSCvn13119.'

dn_regex = node_regex + r'/.+p-\[(?P<mountpoint>.+)\]-f'
desc_regex = r'is (?P<usage>\d{2}%) full'
desc_regex = r'is (?P<usage>\d{2,3}%) full'

tmp_faults_skipped = False # Track if we skip /tmp faults for tversion >= 6.1(4a)
faultInsts = icurl('class',
'faultInst.json?query-target-filter=or(eq(faultInst.code,"F1527"),eq(faultInst.code,"F1528"),eq(faultInst.code,"F1529"))')
for faultInst in faultInsts:
fc = faultInst['faultInst']['attributes']['code']
dn = re.search(dn_regex, faultInst['faultInst']['attributes']['dn'])
desc = re.search(desc_regex, faultInst['faultInst']['attributes']['descr'])
if dn and desc:
data.append([fc, dn.group('pod'), dn.group('node'), dn.group('mountpoint'),
desc.group('usage'),
recommended_action.get(dn.group('mountpoint'), default_action)])
else:
unformatted_data.append([fc, faultInst['faultInst']['attributes']['dn'], default_action])
lc = faultInst['faultInst']['attributes'].get('lc','')

# Only process raised faults
if lc == 'raised':
fc = faultInst['faultInst']['attributes']['code']
dn = re.search(dn_regex, faultInst['faultInst']['attributes']['dn'])
desc = re.search(desc_regex, faultInst['faultInst']['attributes']['descr'])

if dn:
mountpoint = dn.group('mountpoint')
# CSCwo96334: Skip /tmp faults if tversion >= 6.1(4a) (snapshots use /data instead)
if mountpoint == '/tmp' and tversion and not tversion.older_than("6.1(4a)"):
tmp_faults_skipped = True
continue

if desc:
data.append([fc, dn.group('pod'), dn.group('node'), mountpoint,
desc.group('usage'),
recommended_action.get(mountpoint, default_action)])
else:
unformatted_data.append([fc, faultInst['faultInst']['attributes']['dn'],
recommended_action.get(mountpoint, default_action)])

if not data and not unformatted_data:
result = PASS
# If we only found /tmp faults that were skipped (tversion >= 6.1(4a)), return NA
if tmp_faults_skipped:
result = NA
else:
result = PASS
return Result(
result=result,
headers=headers,
Expand Down
53 changes: 49 additions & 4 deletions docs/docs/validations.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ Items | This Script
### Fault Checks
Items | Faults | This Script | APIC built-in
----------------------------------------------|----------------|-------------------|-------------------------------
[APIC Disk Space Usage][f1] | F1527: 80% - 85%<br>F1528: 85% - 90%<br>F1529: 90% or more | :white_check_mark: | :white_check_mark: 4.2(1)
[APIC Disk Space Usage][f1] | F1527: 75% - 84%<br>F1528: 85% - 89%<br>F1529: 90% or more | :white_check_mark: | :white_check_mark: 4.2(1)
[Standby APIC Disk Space Usage][f2] | | :white_check_mark: | :white_check_mark: 5.2(3)
[Switch Node `/bootflash` usage][f3] | F1821: 90% or more | :white_check_mark: | :white_check_mark: 4.2(4)
[APIC SSD Health][f4] | F2730: less than 10% remaining<br>F2731: less than 5% remaining<br>F2732: less than 1% remaining | :white_check_mark: | :white_check_mark: 4.2(1)
Expand Down Expand Up @@ -190,7 +190,7 @@ Items | Defect | This Script
[Observer Database Size][d25] | CSCvw45531 | :white_check_mark: | :no_entry_sign:
[Stale pconsRA Object][d26] | CSCwp22212 | :warning:{title="Deprecated"} | :no_entry_sign:
[ISIS DTEPs Byte Size][d27] | CSCwp15375 | :white_check_mark: | :no_entry_sign:
[Policydist configpushShardCont Crash][d28] | CSCwp95515 | :white_check_mark: |
[Policydist configpushShardCont Crash][d28] | CSCwp95515 | :white_check_mark: | :no_entry_sign:

[d1]: #ep-announce-compatibility
[d2]: #eventmgr-db-size-defect-susceptibility
Expand Down Expand Up @@ -501,12 +501,54 @@ In either scenario, contact TAC to collect a database dump of the flagged DME(s)

If a Cisco APIC is running low on disk space for any reason, the Cisco APIC upgrade can fail. The Cisco APIC will raise three different faults depending on the amount of disk space remaining. If any of these faults are raised on the system, the issue should be resolved prior to performing the upgrade.

* **F1527**: A warning level fault for Cisco APIC disk space usage. This is raised when the utilization is between 80% and 85%.
* **F1527**: A warning level fault for Cisco APIC disk space usage. This is raised when the utilization is between 75% and 84%.

* **F1528**: A major level fault for Cisco APIC disk space usage. This is raised when the utilization is between 85% and 90%.
* **F1528**: A major level fault for Cisco APIC disk space usage. This is raised when the utilization is between 85% and 89%.

* **F1529**: A critical level fault for Cisco APIC disk space usage. This is raised when the utilization is 90% or above.

#### Special Handling for /tmp Directory (CSCwo96334)

On ACI versions prior to 6.1(4), the APIC uses the `/tmp` directory to store database snapshots during the upgrade process. If the `/tmp` directory has insufficient free space (typically indicated by disk space fault F1527, F1528, or F1529), the upgrade may fail because the required snapshot files cannot be created.

With the resolution of [CSCwo96334][62], starting from ACI version 6.1(4), snapshots are stored in the `/data` directory instead of `/tmp`, which provides more available disk space and resolves this issue.

**Version-Specific Behavior:**

This check has special handling for `/tmp` disk space faults based on the target version:

* **For upgrades to versions < 6.1(4)**:
- `/tmp` disk space faults (F1527/F1528/F1529) are reported and must be addressed
- Insufficient `/tmp` space can cause upgrade failure when creating database snapshots
- Check result: **FAIL** if `/tmp` faults exist

* **For upgrades to versions >= 6.1(4)**:
- `/tmp` disk space faults are ignored by this check
- Snapshots are stored in `/data` directory instead, which typically has more space
- Check result: **N/A** if only `/tmp` faults exist (not relevant for upgrade)
- Other mountpoint faults (/firmware, /techsupport, etc.) are still reported

**Impact of /tmp Disk Space Issues (for versions < 6.1(4)):**

If `/tmp` is at or above 75% utilization when upgrading to versions prior to 6.1(4), the upgrade may fail with:

- Upgrade workflow failure during snapshot creation
- Inability to complete APIC database conversion
- Potential need for manual cleanup and upgrade retry
- Extended downtime due to failed upgrade attempts

**Recommended Actions:**

For `/tmp` disk space issues when upgrading to versions < 6.1(4):

1. **Contact Cisco TAC** for assistance before proceeding with the upgrade
2. Work with TAC to identify and remove unnecessary files from `/tmp` safely
3. Consider upgrading to ACI 6.1(4) or later as an alternative, where this issue is resolved
4. Ensure at least 25-30% free space in `/tmp` before attempting upgrade
5. Do not manually delete files from `/tmp` without TAC guidance to avoid system instability

For other mountpoints (applies to all versions):

You can run the following `moquery` on the CLI of any Cisco APIC to check if these faults exist on the system. The faults are visible within the GUI as well. In the example below, with the faults against `/firmware`, you can simply remove unnecessary firmware images under `Admin > Firmware` in the Cisco APIC GUI. You should not perform the Linux command rm to remove an image directly from `/firmware`, as the firmware images are synchronized across Cisco APICs. If the fault is raised against a disk space that you are not aware of, contact Cisco TAC to resolve the issue prior to the upgrade.

!!! example "Fault Example (F1528: Major Fault for APIC disk space usage)"
Expand Down Expand Up @@ -2604,6 +2646,8 @@ Due to [CSCwp95515][59], upgrading to an affected version while having any `conf
If any instances of `configpushShardCont` are flagged by this script, Cisco TAC must be contacted to identify and resolve the underlying issue before performing the upgrade.




[0]: https://github.com/datacenter/ACI-Pre-Upgrade-Validation-Script
[1]: https://www.cisco.com/c/dam/en/us/td/docs/Website/datacenter/apicmatrix/index.html
[2]: https://www.cisco.com/c/en/us/support/switches/nexus-9000-series-switches/products-release-notes-list.html
Expand Down Expand Up @@ -2666,3 +2710,4 @@ If any instances of `configpushShardCont` are flagged by this script, Cisco TAC
[59]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwp95515
[60]: https://www.cisco.com/c/en/us/solutions/collateral/data-center-virtualization/application-centric-infrastructure/white-paper-c11-743951.html#Inter
[61]: https://www.cisco.com/c/en/us/solutions/collateral/data-center-virtualization/application-centric-infrastructure/white-paper-c11-743951.html#EnablePolicyCompression
[62]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwo96334
35 changes: 35 additions & 0 deletions tests/checks/apic_disk_space_faults_check/faultInst_mixed.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[
{
"faultInst": {
"attributes": {
"cause": "threshold-crossed",
"code": "F1529",
"lc": "raised",
"descr": "Storage unit /tmp on Node 1 of pod 1 is 92% full",
"dn": "topology/pod-1/node-1/sys/ch/p-[/tmp]-fault-F1529"
}
}
},
{
"faultInst": {
"attributes": {
"cause": "threshold-crossed",
"code": "F1527",
"lc": "raised",
"descr": "Storage unit /firmware on Node 2 of pod 1 is 76% full",
"dn": "topology/pod-1/node-2/sys/ch/p-[/firmware]-fault-F1527"
}
}
},
{
"faultInst": {
"attributes": {
"cause": "threshold-crossed",
"code": "F1528",
"lc": "raised",
"descr": "Storage unit /techsupport on Node 3 of pod 1 is 85% full",
"dn": "topology/pod-1/node-3/sys/ch/p-[/techsupport]-fault-F1528"
}
}
}
]
24 changes: 24 additions & 0 deletions tests/checks/apic_disk_space_faults_check/faultInst_non_tmp.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[
{
"faultInst": {
"attributes": {
"cause": "threshold-crossed",
"code": "F1527",
"lc": "raised",
"descr": "Storage unit /firmware on Node 1 of pod 1 is 78% full",
"dn": "topology/pod-1/node-1/sys/ch/p-[/firmware]-fault-F1527"
}
}
},
{
"faultInst": {
"attributes": {
"cause": "threshold-crossed",
"code": "F1528",
"lc": "raised",
"descr": "Storage unit /techsupport on Node 2 of pod 1 is 88% full",
"dn": "topology/pod-1/node-2/sys/ch/p-[/techsupport]-fault-F1528"
}
}
}
]
35 changes: 35 additions & 0 deletions tests/checks/apic_disk_space_faults_check/faultInst_tmp_pos.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[
{
"faultInst": {
"attributes": {
"cause": "threshold-crossed",
"code": "F1527",
"lc": "raised",
"descr": "Storage unit /tmp on Node 1 of pod 1 is 80% full",
"dn": "topology/pod-1/node-1/sys/ch/p-[/tmp]-fault-F1527"
}
}
},
{
"faultInst": {
"attributes": {
"cause": "threshold-crossed",
"code": "F1528",
"lc": "raised",
"descr": "Storage unit /tmp on Node 2 of pod 1 is 87% full",
"dn": "topology/pod-1/node-2/sys/ch/p-[/tmp]-fault-F1528"
}
}
},
{
"faultInst": {
"attributes": {
"cause": "threshold-crossed",
"code": "F1529",
"lc": "raised",
"descr": "Storage unit /tmp on Node 3 of pod 1 is 95% full",
"dn": "topology/pod-1/node-3/sys/ch/p-[/tmp]-fault-F1529"
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import os
import pytest
import logging
import importlib
from helpers.utils import read_data

# The script filename contains hyphens, so a normal "import" statement cannot
# load it; importlib.import_module is used instead.
script = importlib.import_module("aci-preupgrade-validation-script")

log = logging.getLogger(__name__)
# Directory containing this test module; used to locate the JSON fixtures.
# NOTE(review): "dir" shadows the builtin of the same name — kept as-is because
# the parametrize data in this file references it.
dir = os.path.dirname(os.path.abspath(__file__))

# Name of the check under test (consumed by the run_check fixture).
test_function = "apic_disk_space_faults_check"

# icurl queries
faultInst_api = 'faultInst.json?query-target-filter=or(eq(faultInst.code,"F1527"),eq(faultInst.code,"F1528"),eq(faultInst.code,"F1529"))'


@pytest.mark.parametrize(
    "icurl_outputs, tversion, expected_result",
    [
        # ===== AFFECTED VERSIONS (< 6.1(4a)) =====
        # Older 4.x version, no /tmp faults
        (
            {faultInst_api: []},
            "4.2(7f)",
            script.PASS,
        ),
        # 5.x version, no /tmp faults
        (
            {faultInst_api: []},
            "5.2(8f)",
            script.PASS,
        ),
        # 6.0.x version, no /tmp faults
        (
            {faultInst_api: []},
            "6.0(5a)",
            script.PASS,
        ),
        # Just before fix version 6.1(3z), no /tmp faults
        (
            {faultInst_api: []},
            "6.1(3z)",
            script.PASS,
        ),
        # 4.x version with /tmp faults
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "4.2(7t)",
            script.FAIL_UF,
        ),
        # 5.x version with /tmp faults
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "5.2(8f)",
            script.FAIL_UF,
        ),
        # 6.0.x version with /tmp faults
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "6.0(2h)",
            script.FAIL_UF,
        ),
        # Just before fix version 6.1(3z) with /tmp faults
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "6.1(3z)",
            script.FAIL_UF,
        ),
        # Affected version with only non-/tmp faults (should FAIL_UF)
        (
            {faultInst_api: read_data(dir, "faultInst_non_tmp.json")},
            "5.2(6a)",
            script.FAIL_UF,
        ),
        # Affected version with mixed /tmp and non-/tmp faults (should FAIL_UF)
        (
            {faultInst_api: read_data(dir, "faultInst_mixed.json")},
            "6.0(3a)",
            script.FAIL_UF,
        ),
        # 3.x version with /tmp faults
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "3.2(10e)",
            script.FAIL_UF,
        ),
        # 4.x version with only non-/tmp faults (should FAIL_UF)
        (
            {faultInst_api: read_data(dir, "faultInst_non_tmp.json")},
            "4.2(7f)",
            script.FAIL_UF,
        ),
        # 6.0.x version with mixed faults
        (
            {faultInst_api: read_data(dir, "faultInst_mixed.json")},
            "6.0(5h)",
            script.FAIL_UF,
        ),
        # ===== FIXED VERSIONS (>= 6.1(4a)) =====
        # Exact fix version 6.1(4a) with /tmp faults (should be NA - CSCwo96334 doesn't apply)
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "6.1(4a)",
            script.NA,
        ),
        # Exact fix version 6.1(4a) without faults (should PASS)
        (
            {faultInst_api: []},
            "6.1(4a)",
            script.PASS,
        ),
        # Later 6.1.x version with /tmp faults (should be NA - CSCwo96334 doesn't apply)
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "6.1(5a)",
            script.NA,
        ),
        # 6.2.x version with /tmp faults (should be NA - CSCwo96334 doesn't apply)
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "6.2(1a)",
            script.NA,
        ),
        # Future 7.x version with /tmp faults (should be NA - CSCwo96334 doesn't apply)
        (
            {faultInst_api: read_data(dir, "faultInst_tmp_pos.json")},
            "7.0(1a)",
            script.NA,
        ),
    ],
)
def test_logic(run_check, mock_icurl, tversion, expected_result):
    """Verify apic_disk_space_faults_check against mocked fault data.

    Each parametrized case mocks the faultInst icurl response and a target
    version, then asserts the check result: FAIL_UF when actionable disk-space
    faults exist, NA when only /tmp faults exist and tversion >= 6.1(4a)
    (CSCwo96334 no longer applies), and PASS when no faults are returned.
    The current version is fixed at 5.2(1a); only tversion varies here.
    """
    result = run_check(
        cversion=script.AciVersion("5.2(1a)"),
        # NOTE: every case above supplies a tversion string, so the None branch
        # is never exercised in this parametrization; kept for safety.
        tversion=script.AciVersion(tversion) if tversion else None,
    )
    assert result.result == expected_result