Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 97 additions & 1 deletion checkvsphere/vcmd/vsan.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def run():
args = parser.get_args()

check = Check()
check.set_threshold(warning=args.warning, critical=args.critical)

args._si = service_instance.connect(args)

Expand Down Expand Up @@ -115,6 +116,8 @@ def run():
check_objecthealth(check, clusters)
elif args.mode == "healthtest":
check_healthtest(check, clusters)
elif args.mode == "capacity":
check_capacity(check, clusters, vhs)
else:
raise Exception("WHAT?")

Expand Down Expand Up @@ -179,6 +182,97 @@ def check_objecthealth(check, clusters):
(status, message) = check.check_messages(separator='\n', separator_all='\n', **opts)
check.exit(status, message)

def check_capacity(check, clusters, vhs):
"""
Checks vSAN capacity, including slack and resync.
"""
try:
vcMos = vsu.GetVsanVcMos(
args._si._stub,
context=sslContext(args),
version=vsu.GetLatestVmodlVersion(args.host, int(args.port))
)
vsan_space_system = vcMos['vsan-cluster-space-report-system']
except KeyError:
check.exit(CRITICAL, "vsan-cluster-space-report-system API not avilable!")
except Exception as e:
check.exit(CRITICAL, f"vsan API error: {e}")

for cluster in clusters:
try:
if not cluster['configurationEx'].vsanConfigInfo.enabled:
continue
if isbanned(args, cluster['name'], 'exclude'):
continue
if not isallowed(args, cluster['name'], 'include'):
continue

if getattr(args, 'debug', False):
print(f"DEBUG: Cluster={cluster['name']}, MoRef={cluster['moref']}")
print("DEBUG: vsan_space_system methods:", dir(vsan_space_system))

# Try ManagedStorageSpaceUsage, fallback QuerySpaceUsage
try:
if getattr(args, 'debug', False):
print("DEBUG: Try QueryVsanManagedStorageSpaceUsage")
capacity = vsan_space_system.QueryVsanManagedStorageSpaceUsage(cluster['moref'])
except Exception as e1:
if getattr(args, 'debug', False):
print(f"DEBUG: QueryVsanManagedStorageSpaceUsage failed ({e1}), fallback QuerySpaceUsage")
capacity = vsan_space_system.QuerySpaceUsage(cluster['moref'])

if getattr(args, 'debug', False):
print("DEBUG: Capacity querried:", capacity)

# Correct Usage-Calculation
total = getattr(capacity, 'totalCapacityB', 0)
used = getattr(getattr(capacity, 'spaceOverview', None), 'usedB', 0)
free = getattr(capacity, 'freeCapacityB', 0)
slack = getattr(capacity, 'slackSpaceB', 0)
resync = getattr(capacity, 'resyncSpaceB', 0)

effective_free = max(0, free - slack - resync)
usage_pct = (used / total) * 100 if total > 0 else 0
effective_free_pct = (effective_free / total) * 100 if total > 0 else 0

# This checks the usage_pct against args.warning and args.critical
state = check.check_threshold(usage_pct)

check.add_perfdata(label=f"{cluster['name']}_usage",
value=round(usage_pct, 1),
uom='%',
threshold=check.threshold)

check.add_perfdata(label=f"{cluster['name']}_free_gb",
value=round(free / 1024**3, 1),
uom='GB')
check.add_perfdata(label=f"{cluster['name']}_slack_gb",
value=round(slack / 1024**3, 1),
uom='GB')
check.add_perfdata(label=f"{cluster['name']}_resync_gb",
value=round(resync / 1024**3, 1),
uom='GB')
check.add_perfdata(label=f"{cluster['name']}_effective_free_gb",
value=round(effective_free / 1024**3, 1),
uom='GB')

check.add_message(
state,
f"{cluster['name']}: usage={round(usage_pct,1)}% "
f"(free={round(free/1024**3,1)}GB, slack={round(slack/1024**3,1)}GB, "
f"resync={round(resync/1024**3,1)}GB, effective_free={round(effective_free_pct,1)}%)"
)

except Exception as e:
check.add_message(CRITICAL, f"{cluster['name']}: Error while querying: {e}")

opts = {}
if not getattr(args, 'verbose', False):
opts['allok'] = "everything is fine"

status, message = check.check_messages(separator='\n', separator_all='\n', **opts)
check.exit(status, message)

def sslContext(args):
context = ssl.create_default_context()
context.check_hostname = False
Expand Down Expand Up @@ -209,11 +303,13 @@ def get_argparser():
'choices': [
'objecthealth',
'healthtest',
'capacity'
],
'help': 'which runtime mode to check'
}
})

parser.add_optional_arguments(CheckArgument.WARNING_THRESHOLD)
parser.add_optional_arguments(CheckArgument.CRITICAL_THRESHOLD)
return parser

def import_vsan():
Expand Down
11 changes: 10 additions & 1 deletion docs/cmd/vsan.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ options:
|---|---|
| `--vihost HOSTNAME` | (optional) the name of the HostSystem to check, if omitted the first HostSystem found is checked, which is handy if you run this check directly against the host |
| `--maintenance-state STATE` | one of OK, WARNING, CRITICAL, UNKNOWN. The status to use when the host is in maintenance mode, this defaults to UNKNOWN |
| `--mode MODE` | one of objecthealth, healthtest |
| `--mode MODE` | one of objecthealth, healthtest, capacity |
| `--include REGEX` | (optional) REGEX is checked against the cluster name |
| `--exclude REGEX` | (optional) REGEX is checked against the cluster name |
| `--include-group REGEX` | (optional) only with `--mode healthtest`, REGEX is checked against the tests' group name |
Expand All @@ -25,6 +25,8 @@ options:
| `--exclude-test REGEX` | (optional) only with `--mode healthtest`, REGEX is checked against the test name |
| `--cache` | fetch cached data from the API when available and not outdated |
| `--verbose` | show also tests the where OK |
| `--warning` | warning threshold for capacity |
| `--critical` | critical threshold for capacity |

### `--mode healthtest`

Expand All @@ -45,6 +47,13 @@ REGEX of `--include`, `--exclude` is matched against cluster name.
This is an in depth check of the "vSAN object health" test. It's not very well
tested yet.

### `--mode capacity`

REGEX of `--include`, `--exclude` is matched against cluster name.

This Checks vSAN capacity, including slack and resync.
Provides performance data. Uses --warning and --critical for threshold

## Examples

```
Expand Down