diff --git a/checkvsphere/vcmd/vsan.py b/checkvsphere/vcmd/vsan.py index eb0c994..5d68359 100644 --- a/checkvsphere/vcmd/vsan.py +++ b/checkvsphere/vcmd/vsan.py @@ -65,6 +65,7 @@ def run(): args = parser.get_args() check = Check() + check.set_threshold(warning=args.warning, critical=args.critical) args._si = service_instance.connect(args) @@ -115,6 +116,8 @@ def run(): check_objecthealth(check, clusters) elif args.mode == "healthtest": check_healthtest(check, clusters) + elif args.mode == "capacity": + check_capacity(check, clusters, vhs) else: raise Exception("WHAT?") @@ -179,6 +182,97 @@ def check_objecthealth(check, clusters): (status, message) = check.check_messages(separator='\n', separator_all='\n', **opts) check.exit(status, message) +def check_capacity(check, clusters, vhs): + """ + Checks vSAN capacity, including slack and resync. + """ + try: + vcMos = vsu.GetVsanVcMos( + args._si._stub, + context=sslContext(args), + version=vsu.GetLatestVmodlVersion(args.host, int(args.port)) + ) + vsan_space_system = vcMos['vsan-cluster-space-report-system'] + except KeyError: + check.exit(CRITICAL, "vsan-cluster-space-report-system API not avilable!") + except Exception as e: + check.exit(CRITICAL, f"vsan API error: {e}") + + for cluster in clusters: + try: + if not cluster['configurationEx'].vsanConfigInfo.enabled: + continue + if isbanned(args, cluster['name'], 'exclude'): + continue + if not isallowed(args, cluster['name'], 'include'): + continue + + if getattr(args, 'debug', False): + print(f"DEBUG: Cluster={cluster['name']}, MoRef={cluster['moref']}") + print("DEBUG: vsan_space_system methods:", dir(vsan_space_system)) + + # Try ManagedStorageSpaceUsage, fallback QuerySpaceUsage + try: + if getattr(args, 'debug', False): + print("DEBUG: Try QueryVsanManagedStorageSpaceUsage") + capacity = vsan_space_system.QueryVsanManagedStorageSpaceUsage(cluster['moref']) + except Exception as e1: + if getattr(args, 'debug', False): + print(f"DEBUG: QueryVsanManagedStorageSpaceUsage failed ({e1}), fallback QuerySpaceUsage") + capacity = vsan_space_system.QuerySpaceUsage(cluster['moref']) + + if getattr(args, 'debug', False): + print("DEBUG: Capacity querried:", capacity) + + # Correct Usage-Calculation + total = getattr(capacity, 'totalCapacityB', 0) + used = getattr(getattr(capacity, 'spaceOverview', None), 'usedB', 0) + free = getattr(capacity, 'freeCapacityB', 0) + slack = getattr(capacity, 'slackSpaceB', 0) + resync = getattr(capacity, 'resyncSpaceB', 0) + + effective_free = max(0, free - slack - resync) + usage_pct = (used / total) * 100 if total > 0 else 0 + effective_free_pct = (effective_free / total) * 100 if total > 0 else 0 + + # This checks the usage_pct against args.warning and args.critical + state = check.check_threshold(usage_pct) + + check.add_perfdata(label=f"{cluster['name']}_usage", + value=round(usage_pct, 1), + uom='%', + threshold=check.threshold) + + check.add_perfdata(label=f"{cluster['name']}_free_gb", + value=round(free / 1024**3, 1), + uom='GB') + check.add_perfdata(label=f"{cluster['name']}_slack_gb", + value=round(slack / 1024**3, 1), + uom='GB') + check.add_perfdata(label=f"{cluster['name']}_resync_gb", + value=round(resync / 1024**3, 1), + uom='GB') + check.add_perfdata(label=f"{cluster['name']}_effective_free_gb", + value=round(effective_free / 1024**3, 1), + uom='GB') + + check.add_message( + state, + f"{cluster['name']}: usage={round(usage_pct,1)}% " + f"(free={round(free/1024**3,1)}GB, slack={round(slack/1024**3,1)}GB, " + f"resync={round(resync/1024**3,1)}GB, effective_free={round(effective_free_pct,1)}%)" + ) + + except Exception as e: + check.add_message(CRITICAL, f"{cluster['name']}: Error while querying: {e}") + + opts = {} + if not getattr(args, 'verbose', False): + opts['allok'] = "everything is fine" + + status, message = check.check_messages(separator='\n', separator_all='\n', **opts) + check.exit(status, message) + def sslContext(args): context = ssl.create_default_context() context.check_hostname = False @@ -209,11 +303,13 @@ def get_argparser(): 'choices': [ 'objecthealth', 'healthtest', + 'capacity' ], 'help': 'which runtime mode to check' } }) - + parser.add_optional_arguments(CheckArgument.WARNING_THRESHOLD) + parser.add_optional_arguments(CheckArgument.CRITICAL_THRESHOLD) return parser def import_vsan(): diff --git a/docs/cmd/vsan.md b/docs/cmd/vsan.md index 5c68130..3fe03af 100644 --- a/docs/cmd/vsan.md +++ b/docs/cmd/vsan.md @@ -16,7 +16,7 @@ options: |---|---| | `--vihost HOSTNAME` | (optional) the name of the HostSystem to check, if omitted the first HostSystem found is checked, which is handy if you run this check directly against the host | | `--maintenance-state STATE` | one of OK, WARNING, CRITICAL, UNKNOWN. The status to use when the host is in maintenance mode, this defaults to UNKNOWN | -| `--mode MODE` | one of objecthealth, healthtest | +| `--mode MODE` | one of objecthealth, healthtest, capacity | | `--include REGEX` | (optional) REGEX is checked against the cluster name | | `--exclude REGEX` | (optional) REGEX is checked against the cluster name | | `--include-group REGEX` | (optional) only with `--mode healthtest`, REGEX is checked against the tests' group name | @@ -25,6 +25,8 @@ options: | `--exclude-test REGEX` | (optional) only with `--mode healthtest`, REGEX is checked against the test name | | `--cache` | fetch cached data from the API when available and not outdated | | `--verbose` | show also tests the where OK | +| `--warning` | warning threshold for capacity | +| `--critical` | critical threshold for capacity | ### `--mode healthtest` @@ -45,6 +47,13 @@ REGEX of `--include`, `--exclude` is matched against cluster name. This is an in depth check of the "vSAN object health" test. It's not very well tested yet. +### `--mode capacity` + +REGEX of `--include`, `--exclude` is matched against cluster name. + +This Checks vSAN capacity, including slack and resync. +Provides performance data. Uses --warning and --critical for threshold + ## Examples ```