Skip to content

Commit b58be93

Browse files
Merge pull request #19133 from bparees/metrics
TemplateInstance metrics update
2 parents bb8bbbe + 6386481 commit b58be93

File tree

6 files changed

+212
-95
lines changed

6 files changed

+212
-95
lines changed

pkg/template/controller/metrics.go

Lines changed: 44 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,76 +1,79 @@
11
package controller
22

33
import (
4+
"time"
5+
46
templateapi "github.com/openshift/origin/pkg/template/apis/template"
57
"github.com/prometheus/client_golang/prometheus"
68
"k8s.io/apimachinery/pkg/labels"
79
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
810
kapi "k8s.io/kubernetes/pkg/apis/core"
911
)
1012

11-
var templateInstancesTotal = prometheus.NewGaugeVec(
12-
prometheus.GaugeOpts{
13-
Name: "openshift_template_instance_total",
14-
Help: "Counts TemplateInstance objects",
13+
var templateInstanceCompleted = prometheus.NewCounterVec(
14+
prometheus.CounterOpts{
15+
Name: "openshift_template_instance_completed_total",
16+
Help: "Counts completed TemplateInstance objects by condition",
1517
},
16-
nil,
18+
[]string{"condition"},
1719
)
1820

19-
var templateInstanceStatusCondition = prometheus.NewGaugeVec(
20-
prometheus.GaugeOpts{
21-
Name: "openshift_template_instance_status_condition_total",
22-
Help: "Counts TemplateInstance objects by condition type and status",
23-
},
24-
[]string{"type", "status"},
25-
)
21+
func newTemplateInstanceActiveAge() prometheus.Histogram {
22+
// We recreate a new Histogram object every time Collect is called. This is
23+
// because we are recording a series of point-in-time observations about the
24+
// population of "active" TemplateInstances. Were we to use a singleton
25+
// Histogram, we would only be able to observe TemplateInstances as they
26+
// completed, which would add latency in reporting very long-running
27+
// TemplateInstances and completely prevent reporting of non-completing
28+
// TemplateInstances.
29+
//
30+
// Effectively, the resulting series is to Histogram what Gauge is to
31+
// Counter. In the resulting series, _count and _sum are not monotonically
32+
// increasing (because TemplateInstances are no longer part of the
33+
// population once they terminate or are deleted), therefore it is not valid
34+
// to use counter functions such as rate() on this series.
2635

27-
var templateInstancesActiveStartTime = prometheus.NewGaugeVec(
28-
prometheus.GaugeOpts{
29-
Name: "openshift_template_instance_active_start_time_seconds",
30-
Help: "Show the start time in unix epoch form of active TemplateInstance objects by namespace and name",
31-
},
32-
[]string{"namespace", "name"},
33-
)
36+
return prometheus.NewHistogram(
37+
prometheus.HistogramOpts{
38+
Name: "openshift_template_instance_active_age_seconds",
39+
Help: "Shows the instantaneous age distribution of active TemplateInstance objects",
40+
Buckets: prometheus.LinearBuckets(600, 600, 7),
41+
},
42+
)
43+
}
3444

3545
func (c *TemplateInstanceController) Describe(ch chan<- *prometheus.Desc) {
36-
templateInstancesTotal.Describe(ch)
37-
templateInstanceStatusCondition.Describe(ch)
38-
templateInstancesActiveStartTime.Describe(ch)
46+
templateInstanceActiveAge := newTemplateInstanceActiveAge()
47+
48+
templateInstanceCompleted.Describe(ch)
49+
templateInstanceActiveAge.Describe(ch)
3950
}
4051

4152
func (c *TemplateInstanceController) Collect(ch chan<- prometheus.Metric) {
53+
templateInstanceCompleted.Collect(ch)
54+
55+
now := c.clock.Now()
56+
4257
templateInstances, err := c.lister.List(labels.Everything())
4358
if err != nil {
4459
utilruntime.HandleError(err)
4560
return
4661
}
4762

48-
templateInstancesTotal.Reset()
49-
templateInstanceStatusCondition.Reset()
50-
templateInstancesActiveStartTime.Reset()
51-
52-
templateInstancesTotal.WithLabelValues().Set(0)
63+
templateInstanceActiveAge := newTemplateInstanceActiveAge()
5364

65+
nextTemplateInstance:
5466
for _, templateInstance := range templateInstances {
55-
waiting := true
56-
57-
templateInstancesTotal.WithLabelValues().Inc()
58-
5967
for _, cond := range templateInstance.Status.Conditions {
60-
templateInstanceStatusCondition.WithLabelValues(string(cond.Type), string(cond.Status)).Inc()
61-
6268
if cond.Status == kapi.ConditionTrue &&
63-
(cond.Type == templateapi.TemplateInstanceInstantiateFailure || cond.Type == templateapi.TemplateInstanceReady) {
64-
waiting = false
69+
(cond.Type == templateapi.TemplateInstanceInstantiateFailure ||
70+
cond.Type == templateapi.TemplateInstanceReady) {
71+
continue nextTemplateInstance
6572
}
6673
}
6774

68-
if waiting {
69-
templateInstancesActiveStartTime.WithLabelValues(templateInstance.Namespace, templateInstance.Name).Set(float64(templateInstance.CreationTimestamp.Unix()))
70-
}
75+
templateInstanceActiveAge.Observe(float64(now.Sub(templateInstance.CreationTimestamp.Time) / time.Second))
7176
}
7277

73-
templateInstancesTotal.Collect(ch)
74-
templateInstanceStatusCondition.Collect(ch)
75-
templateInstancesActiveStartTime.Collect(ch)
78+
templateInstanceActiveAge.Collect(ch)
7679
}

pkg/template/controller/metrics_test.go

Lines changed: 134 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,42 @@ import (
77
"time"
88

99
templateapi "github.com/openshift/origin/pkg/template/apis/template"
10+
templateclient "github.com/openshift/origin/pkg/template/generated/internalclientset"
11+
"github.com/openshift/origin/pkg/template/generated/internalclientset/fake"
1012
"github.com/openshift/origin/pkg/template/generated/listers/template/internalversion"
1113

1214
"github.com/prometheus/client_golang/prometheus"
1315
"github.com/prometheus/client_golang/prometheus/promhttp"
1416

1517
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1618
"k8s.io/apimachinery/pkg/labels"
19+
"k8s.io/apimachinery/pkg/runtime"
20+
"k8s.io/client-go/util/workqueue"
1721
kapi "k8s.io/kubernetes/pkg/apis/core"
1822
)
1923

20-
type fakeLister []*templateapi.TemplateInstance
24+
type fakeLister struct {
25+
templateClient templateclient.Interface
26+
}
27+
28+
func (f *fakeLister) List(labels.Selector) ([]*templateapi.TemplateInstance, error) {
29+
list, err := f.templateClient.Template().TemplateInstances("").List(metav1.ListOptions{})
30+
if err != nil {
31+
return nil, err
32+
}
33+
templateInstances := make([]*templateapi.TemplateInstance, len(list.Items))
34+
for i := range list.Items {
35+
templateInstances[i] = &list.Items[i]
36+
}
37+
return templateInstances, err
38+
}
2139

22-
func (f fakeLister) List(labels.Selector) ([]*templateapi.TemplateInstance, error) {
23-
return f, nil
40+
func (f *fakeLister) Get(name string) (*templateapi.TemplateInstance, error) {
41+
return f.templateClient.Template().TemplateInstances("").Get(name, metav1.GetOptions{})
2442
}
25-
func (fakeLister) TemplateInstances(string) internalversion.TemplateInstanceNamespaceLister {
26-
return nil
43+
44+
func (f *fakeLister) TemplateInstances(string) internalversion.TemplateInstanceNamespaceLister {
45+
return f
2746
}
2847

2948
type fakeResponseWriter struct {
@@ -41,58 +60,132 @@ func (f *fakeResponseWriter) WriteHeader(statusCode int) {
4160
}
4261

4362
func TestMetrics(t *testing.T) {
44-
expectedResponse := `# HELP openshift_template_instance_active_start_time_seconds Show the start time in unix epoch form of active TemplateInstance objects by namespace and name
45-
# TYPE openshift_template_instance_active_start_time_seconds gauge
46-
openshift_template_instance_active_start_time_seconds{name="testname",namespace="testnamespace"} 123
47-
# HELP openshift_template_instance_status_condition_total Counts TemplateInstance objects by condition type and status
48-
# TYPE openshift_template_instance_status_condition_total gauge
49-
openshift_template_instance_status_condition_total{status="False",type="Ready"} 1
50-
openshift_template_instance_status_condition_total{status="True",type="Ready"} 1
51-
# HELP openshift_template_instance_total Counts TemplateInstance objects
52-
# TYPE openshift_template_instance_total gauge
53-
openshift_template_instance_total 2
63+
expectedResponse := `# HELP openshift_template_instance_active_age_seconds Shows the instantaneous age distribution of active TemplateInstance objects
64+
# TYPE openshift_template_instance_active_age_seconds histogram
65+
openshift_template_instance_active_age_seconds_bucket{le="600"} 0
66+
openshift_template_instance_active_age_seconds_bucket{le="1200"} 1
67+
openshift_template_instance_active_age_seconds_bucket{le="1800"} 1
68+
openshift_template_instance_active_age_seconds_bucket{le="2400"} 1
69+
openshift_template_instance_active_age_seconds_bucket{le="3000"} 1
70+
openshift_template_instance_active_age_seconds_bucket{le="3600"} 1
71+
openshift_template_instance_active_age_seconds_bucket{le="4200"} 1
72+
openshift_template_instance_active_age_seconds_bucket{le="+Inf"} 1
73+
openshift_template_instance_active_age_seconds_sum 900
74+
openshift_template_instance_active_age_seconds_count 1
75+
# HELP openshift_template_instance_completed_total Counts completed TemplateInstance objects by condition
76+
# TYPE openshift_template_instance_completed_total counter
77+
openshift_template_instance_completed_total{condition="InstantiateFailure"} 2
78+
openshift_template_instance_completed_total{condition="Ready"} 1
5479
`
80+
81+
clock := &fakeClock{now: time.Unix(0, 0)}
82+
5583
registry := prometheus.NewRegistry()
5684

57-
c := &TemplateInstanceController{
58-
lister: &fakeLister{
59-
{
60-
Status: templateapi.TemplateInstanceStatus{
61-
Conditions: []templateapi.TemplateInstanceCondition{
62-
{
63-
Type: templateapi.TemplateInstanceReady,
64-
Status: kapi.ConditionTrue,
65-
},
85+
fakeTemplateClient := fake.NewSimpleClientset(
86+
// when sync is called on this TemplateInstance it should fail and
87+
// increment openshift_template_instance_completed_total
88+
// {condition="InstantiateFailure"}
89+
&templateapi.TemplateInstance{
90+
ObjectMeta: metav1.ObjectMeta{
91+
Name: "abouttofail",
92+
},
93+
Spec: templateapi.TemplateInstanceSpec{
94+
Template: templateapi.Template{
95+
Objects: []runtime.Object{
96+
&kapi.ConfigMap{},
6697
},
6798
},
6899
},
69-
{
70-
ObjectMeta: metav1.ObjectMeta{
71-
Namespace: "testnamespace",
72-
Name: "testname",
73-
CreationTimestamp: metav1.Time{
74-
Time: time.Unix(123, 0),
100+
},
101+
// when sync is called on this TemplateInstance it should timeout and
102+
// increment openshift_template_instance_completed_total
103+
// {condition="InstantiateFailure"}
104+
&templateapi.TemplateInstance{
105+
ObjectMeta: metav1.ObjectMeta{
106+
Name: "abouttotimeout",
107+
},
108+
Spec: templateapi.TemplateInstanceSpec{
109+
Template: templateapi.Template{
110+
Objects: []runtime.Object{
111+
&kapi.ConfigMap{},
75112
},
76113
},
77-
Status: templateapi.TemplateInstanceStatus{
78-
Conditions: []templateapi.TemplateInstanceCondition{
79-
{
80-
Type: templateapi.TemplateInstanceReady,
81-
Status: kapi.ConditionFalse,
82-
},
114+
Requester: &templateapi.TemplateInstanceRequester{},
115+
},
116+
Status: templateapi.TemplateInstanceStatus{
117+
Objects: []templateapi.TemplateInstanceObject{
118+
{},
119+
},
120+
},
121+
},
122+
// when sync is called on this TemplateInstance it should succeed and
123+
// increment openshift_template_instance_completed_total
124+
// {condition="Ready"}
125+
&templateapi.TemplateInstance{
126+
ObjectMeta: metav1.ObjectMeta{
127+
Name: "abouttosucceed",
128+
CreationTimestamp: metav1.Time{
129+
Time: clock.now,
130+
},
131+
},
132+
Spec: templateapi.TemplateInstanceSpec{
133+
Template: templateapi.Template{
134+
Objects: []runtime.Object{
135+
&kapi.ConfigMap{},
83136
},
84137
},
138+
Requester: &templateapi.TemplateInstanceRequester{},
139+
},
140+
Status: templateapi.TemplateInstanceStatus{
141+
Objects: []templateapi.TemplateInstanceObject{
142+
{},
143+
},
85144
},
86145
},
146+
// this TemplateInstance is in-flight, not timed out.
147+
&templateapi.TemplateInstance{
148+
ObjectMeta: metav1.ObjectMeta{
149+
CreationTimestamp: metav1.Time{
150+
Time: clock.now.Add(-900 * time.Second),
151+
},
152+
},
153+
Status: templateapi.TemplateInstanceStatus{
154+
Conditions: []templateapi.TemplateInstanceCondition{
155+
{
156+
Type: templateapi.TemplateInstanceReady,
157+
Status: kapi.ConditionFalse,
158+
},
159+
},
160+
},
161+
},
162+
)
163+
164+
c := &TemplateInstanceController{
165+
lister: &fakeLister{fakeTemplateClient},
166+
templateClient: fakeTemplateClient,
167+
clock: clock,
168+
readinessLimiter: &workqueue.BucketRateLimiter{},
87169
}
88170

89171
registry.MustRegister(c)
90-
91172
h := promhttp.HandlerFor(registry, promhttp.HandlerOpts{ErrorHandling: promhttp.PanicOnError})
92-
rw := &fakeResponseWriter{header: http.Header{}}
93-
h.ServeHTTP(rw, &http.Request{})
94173

95-
if rw.String() != expectedResponse {
96-
t.Error(rw.String())
174+
// We loop twice: we expect the metrics response to match after the first
175+
// set of sync calls, and not change after the second set.
176+
for i := 0; i < 2; i++ {
177+
for _, key := range []string{"/abouttofail", "/abouttotimeout", "/abouttosucceed"} {
178+
err := c.sync(key)
179+
if err != nil {
180+
t.Fatal(err)
181+
}
182+
}
183+
184+
rw := &fakeResponseWriter{header: http.Header{}}
185+
h.ServeHTTP(rw, &http.Request{})
186+
187+
if rw.String() != expectedResponse {
188+
t.Errorf("run %d: %s\n", i, rw.String())
189+
}
97190
}
98191
}

pkg/template/controller/readiness_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ func TestCheckReadiness(t *testing.T) {
194194
groupKind: batch.Kind("Job"),
195195
object: &batch.Job{
196196
Status: batch.JobStatus{
197-
CompletionTime: &metav1.Time{Time: time.Now()},
197+
CompletionTime: &metav1.Time{Time: time.Unix(0, 0)},
198198
},
199199
},
200200
expectedReady: true,

0 commit comments

Comments
 (0)