Expose probe_dns_duration_seconds metric (#662)
authorMarcelo Magallon <marcelo.magallon@gmail.com>
Mon, 13 Jul 2020 10:30:52 +0000 (04:30 -0600)
committerGitHub <noreply@github.com>
Mon, 13 Jul 2020 10:30:52 +0000 (11:30 +0100)
* Expose probe_dns_duration_seconds metric

This follows the same pattern as probe_http_duration_seconds and
probe_icmp_duration_seconds: it captures the time it takes to perform
the actual request, excluding the time it takes to set up things.
probe_duration_seconds includes the time to perform the request as well
as all the time it takes to set it up. probe_dns_lookup_time_seconds
refers to the time it takes to resolve the target's address, so it
doesn't capture the time it takes to make one request for that target.

Follow probe_http_duration_seconds and add a "phase" label to
probe_dns_duration_seconds, which splits the time it takes to complete
the check into "resolve" (time to resolve the target name), "connect"
(time to connect to the DNS server) and "request" (time to send the
query and retrieve the response).

Signed-off-by: Marcelo E. Magallon <marcelo.magallon@grafana.com>
prober/dns.go
prober/dns_test.go
prober/http_test.go
prober/utils_test.go

index 5ecb6f070f81581cb881002f610dc85353041ed4..4b3aeac0dae87fcc37d3792c62bbb429c247e0e7 100644 (file)
@@ -126,6 +126,10 @@ func validRcode(rcode int, valid []string, logger log.Logger) bool {
 
 func ProbeDNS(ctx context.Context, target string, module config.Module, registry *prometheus.Registry, logger log.Logger) bool {
        var dialProtocol string
+       probeDNSDurationGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+               Name: "probe_dns_duration_seconds",
+               Help: "Duration of DNS request by phase",
+       }, []string{"phase"})
        probeDNSAnswerRRSGauge := prometheus.NewGauge(prometheus.GaugeOpts{
                Name: "probe_dns_answer_rrs",
                Help: "Returns number of entries in the answer resource record list",
@@ -138,6 +142,12 @@ func ProbeDNS(ctx context.Context, target string, module config.Module, registry
                Name: "probe_dns_additional_rrs",
                Help: "Returns number of entries in the additional resource record list",
        })
+
+       for _, lv := range []string{"resolve", "connect", "request"} {
+               probeDNSDurationGaugeVec.WithLabelValues(lv)
+       }
+
+       registry.MustRegister(probeDNSDurationGaugeVec)
        registry.MustRegister(probeDNSAnswerRRSGauge)
        registry.MustRegister(probeDNSAuthorityRRSGauge)
        registry.MustRegister(probeDNSAdditionalRRSGauge)
@@ -182,11 +192,12 @@ func ProbeDNS(ctx context.Context, target string, module config.Module, registry
                }
                targetAddr = target
        }
-       ip, _, err = chooseProtocol(ctx, module.DNS.IPProtocol, module.DNS.IPProtocolFallback, targetAddr, registry, logger)
+       ip, lookupTime, err := chooseProtocol(ctx, module.DNS.IPProtocol, module.DNS.IPProtocolFallback, targetAddr, registry, logger)
        if err != nil {
                level.Error(logger).Log("msg", "Error resolving address", "err", err)
                return false
        }
+       probeDNSDurationGaugeVec.WithLabelValues("resolve").Add(lookupTime)
        targetIP := net.JoinHostPort(ip.String(), port)
 
        if ip.IP.To4() == nil {
@@ -246,7 +257,14 @@ func ProbeDNS(ctx context.Context, target string, module config.Module, registry
        level.Info(logger).Log("msg", "Making DNS query", "target", targetIP, "dial_protocol", dialProtocol, "query", module.DNS.QueryName, "type", qt, "class", qc)
        timeoutDeadline, _ := ctx.Deadline()
        client.Timeout = time.Until(timeoutDeadline)
-       response, _, err := client.Exchange(msg, targetIP)
+       requestStart := time.Now()
+       response, rtt, err := client.Exchange(msg, targetIP)
+       // The rtt value returned from client.Exchange includes only the time to
+       // exchange messages with the server _after_ the connection is created.
+       // We compute the connection time as the total time for the operation
+       // minus the time for the actual request rtt.
+       probeDNSDurationGaugeVec.WithLabelValues("connect").Set((time.Since(requestStart) - rtt).Seconds())
+       probeDNSDurationGaugeVec.WithLabelValues("request").Set(rtt.Seconds())
        if err != nil {
                level.Error(logger).Log("msg", "Error while sending a DNS query", "err", err)
                return false
index 75dc7c33aa22abfb69056c3a9681bd1c33dd0759..ef2d2b0cbcc63c88f28382457dafcbda3bf1d9d6 100644 (file)
@@ -570,3 +570,49 @@ func TestDNSProtocol(t *testing.T) {
 
        }
 }
+
+// TestDNSMetrics checks that calling ProbeDNS populates the expected
+// set of metrics for a DNS probe, but it does not test that those
+// metrics contain specific values.
+func TestDNSMetrics(t *testing.T) {
+       server, addr := startDNSServer("udp", recursiveDNSHandler)
+       defer server.Shutdown()
+
+       _, port, _ := net.SplitHostPort(addr.String())
+
+       module := config.Module{
+               Timeout: time.Second,
+               DNS: config.DNSProbe{
+                       IPProtocol:         "ip4",
+                       IPProtocolFallback: true,
+                       QueryName:          "example.com",
+               },
+       }
+       registry := prometheus.NewRegistry()
+       testCTX, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+       defer cancel()
+       result := ProbeDNS(testCTX, net.JoinHostPort("localhost", port), module, registry, log.NewNopLogger())
+       if !result {
+               t.Fatalf("DNS test connection failed, expected success.")
+       }
+       mfs, err := registry.Gather()
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       expectedMetrics := map[string]map[string]map[string]struct{}{
+               "probe_dns_lookup_time_seconds": nil,
+               "probe_dns_duration_seconds": {
+                       "phase": {
+                               "resolve": {},
+                               "connect": {},
+                               "request": {},
+                       },
+               },
+               "probe_dns_answer_rrs":     nil,
+               "probe_dns_authority_rrs":  nil,
+               "probe_dns_additional_rrs": nil,
+       }
+
+       checkMetrics(expectedMetrics, mfs, t)
+}
index 3d3f63e7e584a96ace0225d1558d9b4f23857dc1..b2c28ae02937618a7fc982e5feefda04441950ef 100644 (file)
@@ -702,42 +702,19 @@ func TestHTTPPhases(t *testing.T) {
                t.Fatal(err)
        }
 
-       found := false
-       foundLabel := map[string]bool{
-               "connect":    false,
-               "processing": false,
-               "resolve":    false,
-               "transfer":   false,
-               "tls":        false,
-       }
-       for _, mf := range mfs {
-               if mf.GetName() == "probe_http_duration_seconds" {
-                       found = true
-                       for _, metric := range mf.GetMetric() {
-                               for _, lp := range metric.Label {
-                                       if lp.GetName() == "phase" {
-                                               f, ok := foundLabel[lp.GetValue()]
-                                               if !ok {
-                                                       t.Fatalf("Unexpected label phase=%s", lp.GetValue())
-                                               }
-                                               if f {
-                                                       t.Fatalf("Label phase=%s duplicated", lp.GetValue())
-                                               }
-                                               foundLabel[lp.GetValue()] = true
-                                       }
-                               }
-                       }
-
-               }
-       }
-       if !found {
-               t.Fatal("probe_http_duration_seconds not found")
-       }
-       for lv, found := range foundLabel {
-               if !found {
-                       t.Fatalf("Label phase=%s not found", lv)
-               }
+       expectedMetrics := map[string]map[string]map[string]struct{}{
+               "probe_http_duration_seconds": {
+                       "phase": {
+                               "connect":    {},
+                               "processing": {},
+                               "resolve":    {},
+                               "transfer":   {},
+                               "tls":        {},
+                       },
+               },
        }
+
+       checkMetrics(expectedMetrics, mfs, t)
 }
 
 func TestCookieJar(t *testing.T) {
index 2ac38eebe4e5e96dfd825fa04858aac9ab7fee46..a53ee4905a95c143a6e84861219405defc93b305 100644 (file)
@@ -171,3 +171,83 @@ func TestChooseProtocol(t *testing.T) {
                t.Error("without fallback it should not answer")
        }
 }
+
+func checkMetrics(expected map[string]map[string]map[string]struct{}, mfs []*dto.MetricFamily, t *testing.T) {
+       type (
+               valueValidation struct {
+                       found bool
+               }
+               labelValidation struct {
+                       found  bool
+                       values map[string]valueValidation
+               }
+               metricValidation struct {
+                       found  bool
+                       labels map[string]labelValidation
+               }
+       )
+
+       foundMetrics := map[string]metricValidation{}
+
+       for mname, labels := range expected {
+               var mv metricValidation
+               if labels != nil {
+                       mv.labels = map[string]labelValidation{}
+                       for lname, values := range labels {
+                               var lv labelValidation
+                               if values != nil {
+                                       lv.values = map[string]valueValidation{}
+                                       for vname, _ := range values {
+                                               lv.values[vname] = valueValidation{}
+                                       }
+                               }
+                               mv.labels[lname] = lv
+                       }
+               }
+               foundMetrics[mname] = mv
+       }
+
+       for _, mf := range mfs {
+               info, wanted := foundMetrics[mf.GetName()]
+               if !wanted {
+                       continue
+               }
+               info.found = true
+               for _, metric := range mf.GetMetric() {
+                       if info.labels == nil {
+                               continue
+                       }
+                       for _, lp := range metric.Label {
+                               if label, labelWanted := info.labels[lp.GetName()]; labelWanted {
+                                       label.found = true
+                                       if label.values != nil {
+                                               if value, wanted := label.values[lp.GetValue()]; !wanted {
+                                                       t.Fatalf("Unexpected label %s=%s", lp.GetName(), lp.GetValue())
+                                               } else if value.found {
+                                                       t.Fatalf("Label %s=%s duplicated", lp.GetName(), lp.GetValue())
+                                               }
+                                               label.values[lp.GetValue()] = valueValidation{found: true}
+                                       }
+                                       info.labels[lp.GetName()] = label
+                               }
+                       }
+               }
+               foundMetrics[mf.GetName()] = info
+       }
+
+       for mname, m := range foundMetrics {
+               if !m.found {
+                       t.Fatalf("metric %s wanted, not found", mname)
+               }
+               for lname, label := range m.labels {
+                       if !label.found {
+                               t.Fatalf("metric %s, label %s wanted, not found", mname, lname)
+                       }
+                       for vname, value := range label.values {
+                               if !value.found {
+                                       t.Fatalf("metric %s, label %s, value %s wanted, not found", mname, lname, vname)
+                               }
+                       }
+               }
+       }
+}