Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@ kubectl apply -f {{ k8s_client_mount_path }}/telemetry/deployments/telemetry_nam
{% if kafka_support %}
helm -n telemetry install strimzi-cluster-operator {{ k8s_client_mount_path }}/telemetry/{{ strimzi_kafka_pkg }}.tar.gz
{% endif %}
{% if 'victoria' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %}
# Install the VictoriaMetrics operator chart before the kustomize apply that
# follows, then give its deployment up to 300s to become Available.
helm -n telemetry install victoria-metrics-operator {{ k8s_client_mount_path }}/telemetry/{{ victoria_operator_pkg }}.tar.gz
echo "Waiting for victoria-metrics-operator to be ready..."
# Best effort: a failed or timed-out wait must not abort the deployment, but it
# is reported on stderr instead of being silently discarded.
kubectl wait --for=condition=available --timeout=300s deployment -l app.kubernetes.io/name=victoria-metrics-operator -n telemetry \
  || echo "WARNING: wait for victoria-metrics-operator failed or timed out; continuing anyway" >&2
{% endif %}
kubectl apply -k {{ k8s_client_mount_path }}/telemetry/deployments/.
{% if hostvars['localhost']['ldms_support'] %}
kubectl create secret generic nersc-ldms-ovis-auth --from-file=ldmsauth.conf={{ k8s_client_mount_path }}/telemetry/ldms/ldmsauth.conf --dry-run=client -o yaml | kubectl apply -f - -n telemetry
kubectl create secret generic nersc-munge-key --from-file=munge.key={{ k8s_client_mount_path }}/telemetry/ldms/munge.key --dry-run=client -o yaml | kubectl apply -f - -n telemetry
cd {{ k8s_client_mount_path }}/telemetry/ldms/nersc-ldms-aggr && helm install -n telemetry nersc-ldms-aggr nersc-ldms-aggr --values values.yaml
{% endif %}
{% endif %}
13 changes: 13 additions & 0 deletions discovery/roles/telemetry/tasks/generate_telemetry_deployments.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,19 @@
dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/{{ strimzi_kafka_pkg }}.tar.gz"
mode: "{{ hostvars['localhost']['file_permissions_644'] }}"

# Stage the VictoriaMetrics operator chart. Runs only when 'victoria' is one of
# the comma-separated entries of idrac_telemetry_collection_type.
- name: Victoria Metrics operator configuration
when: "'victoria' in hostvars['localhost']['idrac_telemetry_collection_type'].split(',')"
block:
# Select the tarball-type entries of k8s_packages_json['service_k8s']['cluster']
# whose package name matches 'victoria-metrics-operator' and join them into
# a single string.
- name: Extract and set facts for tarball URLs for victoria metrics operator
ansible.builtin.set_fact:
victoria_operator_pkg: "{{ k8s_packages_json['service_k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'victoria-metrics-operator') | map(attribute='package') | join }}" # noqa: yaml[line-length]

# NOTE(review): victoria_operator_tarball_url is not set by the set_fact above,
# even though that task's name mentions tarball URLs. Presumably it is defined
# elsewhere (not visible here) - TODO confirm.
# The tarball is saved under the telemetry share as <victoria_operator_pkg>.tar.gz.
- name: Download victoria metrics operator tarball
ansible.builtin.get_url:
url: "{{ victoria_operator_tarball_url }}"
dest: "{{ hostvars['localhost']['k8s_client_share_path'] }}/telemetry/{{ victoria_operator_pkg }}.tar.gz"
mode: "{{ hostvars['localhost']['file_permissions_644'] }}"

- name: Populate common telemetry deployment configs
ansible.builtin.template:
src: "{{ item.src }}"
Expand Down
4 changes: 3 additions & 1 deletion discovery/roles/telemetry/tasks/telemetry_prereq.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@
when: not cluster_id_present | default(false)

- name: Configure TLS certificate for VictoriaMetrics
when: "'victoria' in hostvars['localhost']['idrac_telemetry_collection_type']"
when:
- "'victoria' in hostvars['localhost']['idrac_telemetry_collection_type']"
- victoria_cluster.tls_enabled | default(false) | bool
block:
- name: Create VictoriaMetrics certificate directory
ansible.builtin.file:
Expand Down
124 changes: 92 additions & 32 deletions discovery/roles/telemetry/templates/telemetry/cleanup_telemetry.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
# Telemetry Stack Cleanup Script
# Removes Kafka, LDMS, iDRAC telemetry, and monitoring resources from the {{ telemetry_namespace }} namespace
#
# Usage: ./cleanup_telemetry.sh [kafka] [ldms] [idrac] [victoria] [all]
# kafka - Delete Kafka cluster, users, and bridge
# ldms - Delete LDMS aggregator and store
# idrac - Delete iDRAC telemetry
# victoria - Delete VictoriaMetrics monitoring
# all - Delete everything (default if no arguments)
# Usage: ./cleanup_telemetry.sh [kafka] [ldms] [idrac] [victoria] [powerscale] [all]
# kafka - Delete Kafka cluster, users, and bridge
# ldms - Delete LDMS aggregator and store
# idrac - Delete iDRAC telemetry
# victoria - Delete VictoriaMetrics monitoring
# powerscale - Delete PowerScale telemetry (CSM Metrics, OTEL Collector, vmagent-powerscale)
# all - Delete everything (default if no arguments)
#

set -e
Expand All @@ -35,6 +36,7 @@ CLEAN_KAFKA=false
CLEAN_LDMS=false
CLEAN_IDRAC=false
CLEAN_VICTORIA=false
CLEAN_POWERSCALE=false
CLEAN_ALL=false

if [ $# -eq 0 ]; then
Expand All @@ -54,6 +56,9 @@ else
victoria)
CLEAN_VICTORIA=true
;;
powerscale)
CLEAN_POWERSCALE=true
;;
all)
CLEAN_ALL=true
;;
Expand Down Expand Up @@ -89,6 +94,7 @@ if [ "$CLEAN_ALL" = true ]; then
CLEAN_LDMS=true
CLEAN_IDRAC=true
CLEAN_VICTORIA=true
CLEAN_POWERSCALE=true
fi

echo "=========================================="
Expand All @@ -101,6 +107,7 @@ echo " Kafka Cluster: $([ "$CLEAN_KAFKA" = true ] && echo "YES" || echo "NO")
echo " LDMS: $([ "$CLEAN_LDMS" = true ] && echo "YES" || echo "NO")"
echo " iDRAC Telemetry: $([ "$CLEAN_IDRAC" = true ] && echo "YES" || echo "NO")"
echo " Victoria Metrics:$([ "$CLEAN_VICTORIA" = true ] && echo "YES" || echo "NO")"
echo " PowerScale Tel.: $([ "$CLEAN_POWERSCALE" = true ] && echo "YES" || echo "NO")"
echo ""
read -p "Continue? (y/N): " -n 1 -r
echo
Expand Down Expand Up @@ -236,12 +243,16 @@ if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_LDMS" = true ] || [ "$CLEAN_IDRAC" = t
# Delete single-node PVCs
delete_all pvc "app=victoria-metric"
delete_resource pvc victoria-metrics-pvc-victoria-metric-0
# Delete cluster mode PVCs (vmstorage StatefulSet PVCs)
delete_all pvc "app=vmstorage"
# Delete cluster mode PVCs (operator-managed vmstorage StatefulSet PVCs)
delete_all pvc "app.kubernetes.io/instance=victoria-cluster"
for i in {0..9}; do
delete_resource pvc vmstorage-data-vmstorage-$i
delete_resource pvc vmstorage-data-vmstorage-victoria-cluster-$i
done
fi
if [ "$CLEAN_POWERSCALE" = true ]; then
# Delete OTEL Collector PVCs
delete_all pvc "app=otel-collector-powerscale"
fi
sleep 2
echo ""
fi
Expand Down Expand Up @@ -320,10 +331,11 @@ if [ "$CLEAN_KAFKA" = true ] || [ "$CLEAN_VICTORIA" = true ]; then
fi
if [ "$CLEAN_VICTORIA" = true ]; then
delete_resource service victoria-metric
delete_resource service vmselect
delete_resource service vminsert
delete_resource service vmstorage
delete_resource service vmagent
# Operator-managed cluster services
delete_resource service vmselect-victoria-cluster
delete_resource service vminsert-victoria-cluster
delete_resource service vmstorage-victoria-cluster
delete_resource service vmagent-victoria-cluster
fi
sleep 2
echo ""
Expand All @@ -333,17 +345,23 @@ if [ "$CLEAN_VICTORIA" = true ]; then
echo "Step 12: Delete Monitoring Resources"
echo "-------------------------------------"

# Delete VictoriaMetrics cluster components (if cluster mode is deployed)
# Delete VictoriaMetrics operator CRD resources (operator cascades deletion)
echo "Deleting VictoriaMetrics operator CRD resources..."
kubectl -n $NAMESPACE delete vmcluster victoria-cluster --ignore-not-found=true 2>/dev/null || true
kubectl -n $NAMESPACE delete vmagent --all --ignore-not-found=true 2>/dev/null || true
kubectl -n $NAMESPACE delete vmpodscrape --all --ignore-not-found=true 2>/dev/null || true
kubectl -n $NAMESPACE delete vmsingle --all --ignore-not-found=true 2>/dev/null || true
sleep 5

# Delete any remaining operator-managed cluster components
echo "Deleting VictoriaMetrics cluster components..."
delete_resource deployment vmselect
delete_resource deployment vminsert
delete_resource statefulset vmstorage
delete_resource service vmselect
delete_resource service vminsert
delete_resource service vmstorage
delete_all pod "app=vmselect"
delete_all pod "app=vminsert"
delete_all pod "app=vmstorage"
delete_resource deployment vmselect-victoria-cluster
delete_resource deployment vminsert-victoria-cluster
delete_resource statefulset vmstorage-victoria-cluster
delete_resource service vmselect-victoria-cluster
delete_resource service vminsert-victoria-cluster
delete_resource service vmstorage-victoria-cluster
delete_all pod "app.kubernetes.io/instance=victoria-cluster"

# Delete VictoriaMetrics single-node components (if single-node mode is deployed)
echo "Deleting VictoriaMetrics single-node components..."
Expand All @@ -357,6 +375,7 @@ if [ "$CLEAN_VICTORIA" = true ]; then
delete_resource deployment vmagent
delete_resource service vmagent
delete_all pod "app=vmagent"
delete_all pod "app.kubernetes.io/name=vmagent"

# Delete shared resources
echo "Deleting VictoriaMetrics shared resources..."
Expand All @@ -372,6 +391,39 @@ if [ "$CLEAN_VICTORIA" = true ]; then
echo ""
fi

if [ "$CLEAN_POWERSCALE" = true ]; then
  echo "Step: Delete PowerScale Telemetry"
  echo "----------------------------------"

  # Metrics pipeline, part 1: every labelled resource kind of the CSM Metrics
  # component, removed in the order deployment -> service -> configmap -> pod.
  echo "Deleting CSM Metrics PowerScale..."
  for ps_kind in deployment service configmap pod; do
    delete_all "$ps_kind" "app=csm-metrics-powerscale"
  done

  # Metrics pipeline, part 2: the same resource kinds for the OTEL Collector.
  echo "Deleting OTEL Collector PowerScale..."
  for ps_kind in deployment service configmap pod; do
    delete_all "$ps_kind" "app=otel-collector-powerscale"
  done

  # vmagent is intentionally left alone: it is shared with iDRAC telemetry.
  # PowerScale scrape targets are removed from vmagent config on next deployment.

  # Karavi Authorization objects (only present if Karavi was deployed)
  echo "Deleting Karavi Authorization resources..."
  delete_resource configmap karavi-authorization-config
  delete_resource secret karavi-authorization-ca-cert

  # Credentials secret used for PowerScale
  echo "Deleting PowerScale credentials..."
  delete_resource secret isilon-creds

  sleep 2
  echo ""
fi

echo ""
echo "Step 13: Force Delete Any Remaining Component Pods"
echo "---------------------------------------------------"
Expand All @@ -389,12 +441,15 @@ if [ "$CLEAN_IDRAC" = true ]; then
fi
if [ "$CLEAN_VICTORIA" = true ]; then
kubectl -n $NAMESPACE delete pod -l app=victoria-metric --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true
kubectl -n $NAMESPACE delete pod -l app=vmselect --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true
kubectl -n $NAMESPACE delete pod -l app=vminsert --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true
kubectl -n $NAMESPACE delete pod -l app=vmstorage --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true
kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/instance=victoria-cluster --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true
kubectl -n $NAMESPACE delete pod -l app.kubernetes.io/name=vmagent --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true
kubectl -n $NAMESPACE delete pod -l app=vmagent --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true
kubectl -n $NAMESPACE delete pod -l app=victoria-tls-test --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true
fi
if [ "$CLEAN_POWERSCALE" = true ]; then
  # Force-remove any PowerScale pods that are still around, one app label at a
  # time; failures are ignored so the cleanup keeps going.
  for ps_app in csm-metrics-powerscale otel-collector-powerscale; do
    kubectl -n $NAMESPACE delete pod -l app=$ps_app --grace-period=0 --force --ignore-not-found=true 2>/dev/null || true
  done
fi
sleep 5

echo ""
Expand All @@ -419,16 +474,21 @@ if [ "$CLEAN_VICTORIA" = true ]; then
echo "Remaining Victoria Metrics resources:"
echo " Single-node:"
kubectl -n $NAMESPACE get statefulset,deployment,pod,configmap -l app=victoria-metric 2>/dev/null || echo " None"
echo " Cluster (vmselect):"
kubectl -n $NAMESPACE get deployment,pod -l app=vmselect 2>/dev/null || echo " None"
echo " Cluster (vminsert):"
kubectl -n $NAMESPACE get deployment,pod -l app=vminsert 2>/dev/null || echo " None"
echo " Cluster (vmstorage):"
kubectl -n $NAMESPACE get statefulset,pod -l app=vmstorage 2>/dev/null || echo " None"
echo " Operator-managed cluster:"
kubectl -n $NAMESPACE get vmcluster,deployment,statefulset,pod -l app.kubernetes.io/instance=victoria-cluster 2>/dev/null || echo " None"
echo " vmagent:"
kubectl -n $NAMESPACE get deployment,pod -l app.kubernetes.io/name=vmagent 2>/dev/null || echo " None"
kubectl -n $NAMESPACE get deployment,pod -l app=vmagent 2>/dev/null || echo " None"
echo ""
fi
if [ "$CLEAN_POWERSCALE" = true ]; then
  # Summarise what is left of the PowerScale telemetry components.
  echo "Remaining PowerScale telemetry resources:"
  for ps_entry in "CSM Metrics:csm-metrics-powerscale" "OTEL Collector:otel-collector-powerscale"; do
    ps_title=${ps_entry%%:*}   # heading shown to the user
    ps_label=${ps_entry#*:}    # value of the app= label to query
    echo " ${ps_title}:"
    # kubectl get exits 0 and prints only a stderr notice when nothing matches,
    # so an "|| echo None" fallback would stay silent on an empty result.
    # Capture stdout and decide on its content instead; errors also yield "None".
    ps_remaining=$(kubectl -n $NAMESPACE get deployment,pod -l "app=${ps_label}" 2>/dev/null || true)
    if [ -n "$ps_remaining" ]; then
      echo "$ps_remaining"
    else
      echo " None"
    fi
  done
  echo ""
fi
echo "Remaining PVCs:"
kubectl -n $NAMESPACE get pvc 2>/dev/null || echo " None"
echo ""
Expand Down
29 changes: 15 additions & 14 deletions discovery/roles/telemetry/templates/telemetry/kustomization.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,24 @@ resources:
- telemetry_secret_creation.yaml
{% set types = hostvars['localhost']['idrac_telemetry_collection_type'].split(',') %}
{% if 'victoria' in types %}
# VictoriaMetrics Common Resources
- victoria-tls-secret.yaml
# VictoriaMetrics Common Resources (RBAC)
- victoria-vmagent-rbac.yaml
- vmagent-scrape-config.yaml
- victoria-agent-deployment.yaml
# VictoriaMetrics Deployment (mode: {{ hostvars['localhost']['victoria_configurations']['deployment_mode'] }})
{% if victoria_cluster.tls_enabled | default(false) %}
# TLS secret for VictoriaMetrics cluster components
- victoria-tls-secret.yaml
{% endif %}
# VictoriaMetrics Operator-based Deployment (mode: {{ hostvars['localhost']['victoria_configurations']['deployment_mode'] }})
{% if hostvars['localhost']['victoria_configurations']['deployment_mode'] == 'cluster' %}
# Cluster Mode: High-availability deployment
- victoria-cluster-vmstorage.yaml
- victoria-cluster-vminsert.yaml
- victoria-cluster-vmselect.yaml
# Cluster Mode: VMCluster CR (operator manages StatefulSets)
- victoria-operator-vmcluster.yaml
{% else %}
# Single-Node Mode: Simple deployment
- victoria-statefulset.yaml
# Single-Node Mode: VMSingle CR (operator manages StatefulSet)
- victoria-operator-vmsingle.yaml
{% endif %}
# Uncomment to deploy VictoriaMetrics TLS test job
# - test/victoria-tls-test-job.yaml
# VMAgent CR (operator-managed scraper)
- victoria-operator-vmagent.yaml
# VMPodScrape CR (native operator-based pod discovery)
- victoria-operator-vmpodscrape.yaml
{% endif %}
{% if kafka_support %}
- kafka.kafka.yaml
Expand All @@ -38,4 +39,4 @@ resources:
- idrac_telemetry_statefulset.yaml
- telemetry_cleaner_rbac.yaml
- telemetry_pod_cleanup.yaml
{% endif %}
{% endif %}
Original file line number Diff line number Diff line change
Expand Up @@ -52,26 +52,38 @@ DNS.5 = victoria-metric-0
DNS.6 = victoria-metric-0.{{ telemetry_namespace }}
DNS.7 = victoria-metric-0.{{ telemetry_namespace }}.svc
DNS.8 = victoria-metric-0.{{ telemetry_namespace }}.svc.cluster.local
# Cluster deployment names
DNS.9 = vminsert
DNS.10 = vminsert.{{ telemetry_namespace }}
DNS.11 = vminsert.{{ telemetry_namespace }}.svc
DNS.12 = vminsert.{{ telemetry_namespace }}.svc.cluster.local
DNS.13 = vmselect
DNS.14 = vmselect.{{ telemetry_namespace }}
DNS.15 = vmselect.{{ telemetry_namespace }}.svc
DNS.16 = vmselect.{{ telemetry_namespace }}.svc.cluster.local
DNS.17 = vmstorage
DNS.18 = vmstorage.{{ telemetry_namespace }}
DNS.19 = vmstorage.{{ telemetry_namespace }}.svc
DNS.20 = vmstorage.{{ telemetry_namespace }}.svc.cluster.local
# VMStorage StatefulSet pods
DNS.21 = vmstorage-0.vmstorage.{{ telemetry_namespace }}.svc.cluster.local
DNS.22 = vmstorage-1.vmstorage.{{ telemetry_namespace }}.svc.cluster.local
DNS.23 = vmstorage-2.vmstorage.{{ telemetry_namespace }}.svc.cluster.local
# Cluster deployment names (operator-managed)
DNS.9 = vminsert-victoria-cluster
DNS.10 = vminsert-victoria-cluster.{{ telemetry_namespace }}
DNS.11 = vminsert-victoria-cluster.{{ telemetry_namespace }}.svc
DNS.12 = vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local
DNS.13 = vmselect-victoria-cluster
DNS.14 = vmselect-victoria-cluster.{{ telemetry_namespace }}
DNS.15 = vmselect-victoria-cluster.{{ telemetry_namespace }}.svc
DNS.16 = vmselect-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local
DNS.17 = vmstorage-victoria-cluster
DNS.18 = vmstorage-victoria-cluster.{{ telemetry_namespace }}
DNS.19 = vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc
DNS.20 = vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local
# VMStorage StatefulSet pods (operator-managed)
DNS.21 = vmstorage-victoria-cluster-0.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local
DNS.22 = vmstorage-victoria-cluster-1.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local
DNS.23 = vmstorage-victoria-cluster-2.vmstorage-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local
IP.1 = 127.0.0.1
EOF

# Check if existing cert has the required operator-managed SANs
# If SANs are stale (missing operator-managed names), force server cert regeneration
# CA is preserved so external clients do not need to re-import it
if [ -f "$CERT_FILE" ]; then
  REQUIRED_SAN="vminsert-victoria-cluster.{{ telemetry_namespace }}.svc.cluster.local"
  # Dump the certificate once into a variable. An unreadable certificate gives
  # empty text and is therefore treated as missing the SAN.
  cert_text=$(openssl x509 -in "$CERT_FILE" -text -noout 2>/dev/null || true)
  # The quoted case pattern is matched literally, so the dots in the host name
  # are not wildcards, and no pipeline is involved whose status could be
  # affected by an early reader exit.
  case "$cert_text" in
    *"$REQUIRED_SAN"*)
      : # required SAN present - keep the existing server certificate
      ;;
    *)
      echo "Existing certificate missing required SAN: $REQUIRED_SAN"
      echo "Removing stale server cert/key/csr to force regeneration..."
      rm -f "$CERT_KEY" "$CSR_FILE" "$CERT_FILE"
      ;;
  esac
fi

# Generate CA key
if [ ! -f "$CA_KEY" ]; then
echo "Generating CA key..."
Expand Down
Loading