-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathcheck_splunk_cluster.py
More file actions
218 lines (194 loc) · 6.45 KB
/
check_splunk_cluster.py
File metadata and controls
218 lines (194 loc) · 6.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/usr/bin/env python
#
# Copyright 2014, Schuberg Philis B.V.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# check_splunk_cluster.py
#
# Nagios plugin to check the health of a Splunk cluster via the REST API
# cluster/master/info:
# Check maintenance mode: WARN if true
# Check indexing_ready_flag:
# 1 = OK
# not 1 = CRITICAL
#
#cluster/master/peers:
# Check peer status:
# Up = OK
# Down = CRITICAL
# Other states: WARNING
# Return stats: pending_job_count
#
#
#cluster/master/generation
# Check "search_factor_met"
# 1 = OK
# not 1 = CRITICAL
#
# Check "replication_factor_met"
# 1 = OK
# not 1 = CRITICAL
#
#messages
# List must be empty
# return messages to Nagios (and assert at least WARNING status)
#
#
#licenser/messages
# Check all messages:
# INFO/WARN => WARNING
# ERROR => CRITICAL
# Return messages (description) to Nagios
#
#licenser/pools
# per pool (calculate used_bytes/effective_quota)*100%
# if % > 90
# Usage warning
# return stats: license pool usage
#
# This check must be run against the splunkd port (default 8089) on the
# cluster master
import urllib2
import json
import sys
from nagios import Nagios
from ConfigParser import ConfigParser
class SplunkCluster(object):
    """Run a fixed series of health checks against a Splunk REST API.

    Every entry of ``endpoints`` names a REST endpoint (appended verbatim to
    the base URL) together with the names of the check methods that receive
    the decoded JSON response of that endpoint.  The checks report their
    findings through the ``nagios`` object handed to the constructor, using
    its ``AppendStatus``, ``SetExitCode`` and ``AddPerfData`` methods.
    """

    endpoints = [
        {
            "end_point": "messages",
            "checks": [
                "_cluster_messages",
            ],
        },
        {
            "end_point": "cluster/master/info",
            "checks": [
                "_check_maintenance_mode",
                "_check_indexing_ready",
            ],
        },
        {
            "end_point": "cluster/master/peers",
            "checks": [
                "_check_peer_status",
                "_get_pending_job_count",
            ],
        },
        {
            "end_point": "cluster/master/generation",
            "checks": [
                "_is_search_factor_met",
                "_is_replication_factor_met",
            ],
        },
        {
            "end_point": "licenser/messages",
            "checks": [
                "_check_licensing_messages",
            ],
        },
        {
            "end_point": "licenser/pools",
            "checks": [
                "_check_license_pool_usage",
            ],
        },
    ]

    def __init__(self, baseurl, username, password, nagios):
        """Prepare an HTTP client that sends basic-auth credentials.

        baseurl  -- URL prefix; endpoint paths are concatenated onto it as-is
        username -- user name registered for ``baseurl``
        password -- password registered for ``baseurl``
        nagios   -- reporter object the checks write their results to
        """
        self.baseurl = baseurl
        passwd_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        passwd_mgr.add_password(None, baseurl, username, password)
        self.http_client = urllib2.build_opener(
            urllib2.HTTPBasicAuthHandler(passwd_mgr)
        )
        self.nagios = nagios

    def _load_json(self, endpoint):
        """Fetch ``endpoint`` with ``output_mode=json`` and return the decoded body."""
        response = self.http_client.open(
            self.baseurl + endpoint + "?output_mode=json"
        )
        try:
            return json.loads(response.read())
        finally:
            # Release the connection even when reading or decoding fails.
            response.close()

    def run_checks(self):
        """Query every configured endpoint once and run its checks in order."""
        for checker in self.endpoints:
            # One request per endpoint; all of its checks share the response.
            payload = self._load_json(checker["end_point"])
            for check in checker["checks"]:
                getattr(self, check)(payload)

    # Check functions

    def _cluster_messages(self, json):
        """Report every system message and set WARNING for each one."""
        for message in json["entry"]:
            m = message["content"]
            self.nagios.AppendStatus(
                "Splunk system message %s: %s" % (m["severity"], m["message"])
            )
            self.nagios.SetExitCode("WARNING")

    def _check_maintenance_mode(self, json):
        """Set WARNING when the first entry reports a truthy maintenance_mode."""
        maint_mode = json["entry"][0]["content"]["maintenance_mode"]
        if maint_mode:
            self.nagios.AppendStatus("Cluster is in maintenance mode.")
            self.nagios.SetExitCode("WARNING")

    def _check_indexing_ready(self, json):
        """Set CRITICAL when the first entry's indexing_ready_flag is falsy."""
        idx_rdy = json["entry"][0]["content"]["indexing_ready_flag"]
        if not idx_rdy:
            self.nagios.AppendStatus("Cluster is not able to index data")
            self.nagios.SetExitCode("CRITICAL")

    def _check_peer_status(self, json):
        """Report peers that are not "Up": "Down" is CRITICAL, others WARNING."""
        for peer in json["entry"]:
            p = peer["content"]
            if p["status"] == "Up":
                continue  # all is well with this peer
            if p["status"] == "Down":
                self.nagios.AppendStatus("indexer %s is down" % (p["label"]))
                self.nagios.SetExitCode("CRITICAL")
            else:
                self.nagios.AppendStatus(
                    "indexer %s has status %s" % (p["label"], p["status"])
                )
                self.nagios.SetExitCode("WARNING")

    def _get_pending_job_count(self, json):
        """Emit each peer's pending_job_count as perf data ``jobcount_<label>``."""
        for peer in json["entry"]:
            p = peer["content"]
            self.nagios.AddPerfData(
                "jobcount_%s" % p["label"], p["pending_job_count"]
            )

    def _is_search_factor_met(self, json):
        """Set CRITICAL unless search_factor_met of the first entry equals 1."""
        if int(json["entry"][0]["content"]["search_factor_met"]) != 1:
            self.nagios.AppendStatus("Cluster search factor not met")
            self.nagios.SetExitCode("CRITICAL")

    def _is_replication_factor_met(self, json):
        """Set CRITICAL unless replication_factor_met of the first entry equals 1."""
        if int(json["entry"][0]["content"]["replication_factor_met"]) != 1:
            self.nagios.AppendStatus("Cluster replication factor not met")
            self.nagios.SetExitCode("CRITICAL")

    def _check_licensing_messages(self, json):
        """Report every licensing message with a severity-dependent exit code.

        As laid out in the header of this file: messages of severity ERROR
        set CRITICAL, every other severity (INFO/WARN) sets WARNING.
        """
        for message in json["entry"]:
            m = message["content"]
            self.nagios.AppendStatus(
                "Splunk licensing message %s: %s"
                % (m["severity"], m["description"])
            )
            if str(m["severity"]).upper() == "ERROR":
                self.nagios.SetExitCode("CRITICAL")
            else:
                self.nagios.SetExitCode("WARNING")

    def _check_license_pool_usage(self, json):
        """Emit usage perf data per license pool and warn above 90 percent.

        Pools whose effective_quota is zero are skipped, since no percentage
        can be computed for them.
        """
        for lic_pool in json["entry"]:
            l = lic_pool["content"]
            try:
                # Multiply before the floor division: a plain ``/`` on two
                # ints truncates to 0 under Python 2, and float arithmetic
                # can round e.g. 29% down to 28%.
                usage_pct = int(l["used_bytes"] * 100 // l["effective_quota"])
            except ZeroDivisionError:
                continue
            self.nagios.AddPerfData(
                "lic_pool_usage_%s" % l["description"], "%d%%" % usage_pct
            )
            if usage_pct > 90:
                self.nagios.SetExitCode("WARNING")
                self.nagios.AppendStatus(
                    "License pool %s usage is high = %d%%"
                    % (l["description"], usage_pct)
                )
if __name__ == "__main__":
    # Script entry point: read the connection settings from the config file
    # named by the first command-line argument, run every check and print
    # the combined result through the Nagios helper.
    nag = Nagios("CHECK_SPLUNK_CLUSTER")
    try:
        cfg = ConfigParser()
        cfg.read(sys.argv[1])
        splunk = SplunkCluster(
            cfg.get("splunk", "baseurl"),
            cfg.get("splunk", "username"),
            cfg.get("splunk", "password"),
            nag,
        )
    except Exception:
        # Covers a missing argument as well as a missing section/option.
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # interpreter exit requests are not misreported as a config problem.
        nag.SetStatus(
            "Unable to get configuration data, usage: %s <cfg_file>"
            % (sys.argv[0])
        )
        nag.BuildResponseAndExit()
    nag.SetStatus("")
    nag.SetExitCode("OK")
    # NOTE(review): an error raised while querying Splunk propagates as an
    # uncaught exception here; consider reporting it via the Nagios helper.
    splunk.run_checks()
    nag.BuildResponseAndExit()