Skip to content

Commit

Permalink
Add slave and complete master metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
discordianfish committed Nov 17, 2015
1 parent 7a52f5d commit cefed2a
Show file tree
Hide file tree
Showing 4 changed files with 228 additions and 2 deletions.
File renamed without changes.
47 changes: 47 additions & 0 deletions master.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,16 @@ func newMasterCollector(url string, timeout time.Duration) *metricCollector {
c.(*prometheus.GaugeVec).WithLabelValues("used").Set(used)
return nil
},
gauge("master", "mem_revocable", "Current revocable memory resources in cluster.", "type"): func(m metricMap, c prometheus.Collector) error {
total, ok := m["master/mem_revocable_total"]
used, ok := m["master/mem_revocable_used"]
if !ok {
return notFoundInMap
}
c.(*prometheus.GaugeVec).WithLabelValues("free").Set(total - used)
c.(*prometheus.GaugeVec).WithLabelValues("used").Set(used)
return nil
},
gauge("master", "disk", "Current disk resources in cluster.", "type"): func(m metricMap, c prometheus.Collector) error {
total, ok := m["master/disk_total"]
used, ok := m["master/disk_used"]
Expand All @@ -50,7 +60,44 @@ func newMasterCollector(url string, timeout time.Duration) *metricCollector {
c.(*prometheus.GaugeVec).WithLabelValues("used").Set(used)
return nil
},
gauge("master", "disk_revocable", "Current disk resources in cluster.", "type"): func(m metricMap, c prometheus.Collector) error {
total, ok := m["master/disk_revocable_total"]
used, ok := m["master/disk_revocable_used"]
if !ok {
return notFoundInMap
}
c.(*prometheus.GaugeVec).WithLabelValues("free").Set(total - used)
c.(*prometheus.GaugeVec).WithLabelValues("used").Set(used)
return nil
},

// Master stats about uptime and election state
prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "mesos",
Subsystem: "master",
Name: "elected",
Help: "1 if master is elected leader, 0 if not",
}): func(m metricMap, c prometheus.Collector) error {
elected, ok := m["master/elected"]
if !ok {
return notFoundInMap
}
c.(prometheus.Gauge).Set(elected)
return nil
},
prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "mesos",
Subsystem: "master",
Name: "uptime_seconds",
Help: "Number of seconds the master process is running.",
}): func(m metricMap, c prometheus.Collector) error {
uptime, ok := m["master/uptime_secs"]
if !ok {
return notFoundInMap
}
c.(prometheus.Gauge).Set(uptime)
return nil
},
// Master stats about agents
counter("master", "slave_registration_events_total", "Total number of registration events on this master since it booted.", "event"): func(m metricMap, c prometheus.Collector) error {
registrations, ok := m["master/slave_registrations"]
Expand Down
1 change: 0 additions & 1 deletion master_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,6 @@ func newMasterStateCollector(url string, timeout time.Duration) *masterCollector
continue
}
for _, task := range f.Completed {
log.Printf("%#v", task)
values := []string{
task.ID,
task.SlaveID,
Expand Down
182 changes: 181 additions & 1 deletion slave.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,187 @@ import (
)

// newSlaveCollector returns a metricCollector configured with the Prometheus
// collectors exported for a Mesos slave.
//
// Each map entry pairs a collector with a callback. The callback receives a
// metricMap and the collector it is keyed by, reads the "slave/..." keys it
// needs from the map and sets the corresponding values on the collector. A
// callback returns notFoundInMap, without updating anything, if ANY key it
// needs is absent, so a partially populated map can never export zero or
// negative placeholder values.
//
// url and timeout are passed through unchanged to newMetricCollector
// (presumably the slave metrics endpoint and its request timeout; confirm
// against newMetricCollector).
func newSlaveCollector(url string, timeout time.Duration) *metricCollector {
	metrics := map[prometheus.Collector]func(metricMap, prometheus.Collector) error{
		// CPU/Disk/Mem resources in free/used
		gauge("slave", "cpus", "Current CPU resources in cluster.", "type"): func(m metricMap, c prometheus.Collector) error {
			total, okTotal := m["slave/cpus_total"]
			used, okUsed := m["slave/cpus_used"]
			if !okTotal || !okUsed {
				return notFoundInMap
			}
			vec := c.(*prometheus.GaugeVec)
			vec.WithLabelValues("free").Set(total - used)
			vec.WithLabelValues("used").Set(used)
			return nil
		},
		gauge("slave", "cpus_revocable", "Current revocable CPU resources in cluster.", "type"): func(m metricMap, c prometheus.Collector) error {
			total, okTotal := m["slave/cpus_revocable_total"]
			used, okUsed := m["slave/cpus_revocable_used"]
			if !okTotal || !okUsed {
				return notFoundInMap
			}
			vec := c.(*prometheus.GaugeVec)
			vec.WithLabelValues("free").Set(total - used)
			vec.WithLabelValues("used").Set(used)
			return nil
		},
		gauge("slave", "mem", "Current memory resources in cluster.", "type"): func(m metricMap, c prometheus.Collector) error {
			total, okTotal := m["slave/mem_total"]
			used, okUsed := m["slave/mem_used"]
			if !okTotal || !okUsed {
				return notFoundInMap
			}
			vec := c.(*prometheus.GaugeVec)
			vec.WithLabelValues("free").Set(total - used)
			vec.WithLabelValues("used").Set(used)
			return nil
		},
		gauge("slave", "mem_revocable", "Current revocable memory resources in cluster.", "type"): func(m metricMap, c prometheus.Collector) error {
			total, okTotal := m["slave/mem_revocable_total"]
			used, okUsed := m["slave/mem_revocable_used"]
			if !okTotal || !okUsed {
				return notFoundInMap
			}
			vec := c.(*prometheus.GaugeVec)
			vec.WithLabelValues("free").Set(total - used)
			vec.WithLabelValues("used").Set(used)
			return nil
		},
		gauge("slave", "disk", "Current disk resources in cluster.", "type"): func(m metricMap, c prometheus.Collector) error {
			total, okTotal := m["slave/disk_total"]
			used, okUsed := m["slave/disk_used"]
			if !okTotal || !okUsed {
				return notFoundInMap
			}
			vec := c.(*prometheus.GaugeVec)
			vec.WithLabelValues("free").Set(total - used)
			vec.WithLabelValues("used").Set(used)
			return nil
		},
		gauge("slave", "disk_revocable", "Current revocable disk resources in cluster.", "type"): func(m metricMap, c prometheus.Collector) error {
			total, okTotal := m["slave/disk_revocable_total"]
			used, okUsed := m["slave/disk_revocable_used"]
			if !okTotal || !okUsed {
				return notFoundInMap
			}
			vec := c.(*prometheus.GaugeVec)
			vec.WithLabelValues("free").Set(total - used)
			vec.WithLabelValues("used").Set(used)
			return nil
		},

		// Slave stats about uptime and connectivity
		prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: "mesos",
			Subsystem: "slave",
			Name:      "registered",
			Help:      "1 if slave is registered with master, 0 if not.",
		}): func(m metricMap, c prometheus.Collector) error {
			registered, ok := m["slave/registered"]
			if !ok {
				return notFoundInMap
			}
			c.(prometheus.Gauge).Set(registered)
			return nil
		},
		prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: "mesos",
			Subsystem: "slave",
			Name:      "uptime_seconds",
			Help:      "Number of seconds the slave process is running.",
		}): func(m metricMap, c prometheus.Collector) error {
			uptime, ok := m["slave/uptime_secs"]
			if !ok {
				return notFoundInMap
			}
			c.(prometheus.Gauge).Set(uptime)
			return nil
		},

		// Slave stats about frameworks and executors
		gauge("slave", "executor_state", "Current number of executors by state.", "state"): func(m metricMap, c prometheus.Collector) error {
			registering, okRegistering := m["slave/executors_registering"]
			running, okRunning := m["slave/executors_running"]
			terminating, okTerminating := m["slave/executors_terminating"]
			if !(okRegistering && okRunning && okTerminating) {
				return notFoundInMap
			}
			vec := c.(*prometheus.GaugeVec)
			vec.WithLabelValues("registering").Set(registering)
			vec.WithLabelValues("running").Set(running)
			vec.WithLabelValues("terminating").Set(terminating)
			return nil
		},
		prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: "mesos",
			Subsystem: "slave",
			Name:      "frameworks_active",
			Help:      "Current number of active frameworks",
		}): func(m metricMap, c prometheus.Collector) error {
			active, ok := m["slave/frameworks_active"]
			if !ok {
				return notFoundInMap
			}
			c.(prometheus.Gauge).Set(active)
			return nil
		},
		prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: "mesos",
			Subsystem: "slave",
			Name:      "executors_terminated",
			Help:      "Total number of executor terminations.",
		}): func(m metricMap, c prometheus.Collector) error {
			terminated, ok := m["slave/executors_terminated"]
			if !ok {
				return notFoundInMap
			}
			// The value is a running total maintained elsewhere, so it is
			// mirrored with Set rather than accumulated with Inc/Add.
			c.(prometheus.Counter).Set(terminated)
			return nil
		},

		// Slave stats about tasks
		counter("slave", "task_states_exit_total", "Total number of tasks processed by exit state.", "state"): func(m metricMap, c prometheus.Collector) error {
			errored, okErrored := m["slave/tasks_error"]
			failed, okFailed := m["slave/tasks_failed"]
			finished, okFinished := m["slave/tasks_finished"]
			killed, okKilled := m["slave/tasks_killed"]
			lost, okLost := m["slave/tasks_lost"]
			if !(okErrored && okFailed && okFinished && okKilled && okLost) {
				return notFoundInMap
			}
			vec := c.(*prometheus.CounterVec)
			vec.WithLabelValues("errored").Set(errored)
			vec.WithLabelValues("failed").Set(failed)
			vec.WithLabelValues("finished").Set(finished)
			vec.WithLabelValues("killed").Set(killed)
			vec.WithLabelValues("lost").Set(lost)
			return nil
		},
		// The number of tasks currently in a state goes down as well as up,
		// so this is exported as a gauge rather than a counter.
		gauge("slave", "task_states_current", "Current number of tasks by state.", "state"): func(m metricMap, c prometheus.Collector) error {
			running, okRunning := m["slave/tasks_running"]
			staging, okStaging := m["slave/tasks_staging"]
			starting, okStarting := m["slave/tasks_starting"]
			if !(okRunning && okStaging && okStarting) {
				return notFoundInMap
			}
			vec := c.(*prometheus.GaugeVec)
			vec.WithLabelValues("running").Set(running)
			vec.WithLabelValues("staging").Set(staging)
			vec.WithLabelValues("starting").Set(starting)
			return nil
		},

		// Slave stats about messages
		counter("slave", "messages_outcomes_total",
			"Total number of messages by outcome of operation",
			"type", "outcome"): func(m metricMap, c prometheus.Collector) error {
			frameworkMessagesValid, okFrameworkValid := m["slave/valid_framework_messages"]
			frameworkMessagesInvalid, okFrameworkInvalid := m["slave/invalid_framework_messages"]
			statusUpdateValid, okStatusValid := m["slave/valid_status_updates"]
			statusUpdateInvalid, okStatusInvalid := m["slave/invalid_status_updates"]
			if !(okFrameworkValid && okFrameworkInvalid && okStatusValid && okStatusInvalid) {
				return notFoundInMap
			}
			vec := c.(*prometheus.CounterVec)
			vec.WithLabelValues("framework", "valid").Set(frameworkMessagesValid)
			vec.WithLabelValues("framework", "invalid").Set(frameworkMessagesInvalid)
			vec.WithLabelValues("status", "valid").Set(statusUpdateValid)
			vec.WithLabelValues("status", "invalid").Set(statusUpdateInvalid)
			return nil
		},
	}
	return newMetricCollector(url, timeout, metrics)
}

0 comments on commit cefed2a

Please sign in to comment.