From 55c667ab4ce5bdd81ebf8b843851222aa1f7fab3 Mon Sep 17 00:00:00 2001 From: Marcelo Magallon Date: Mon, 13 Jul 2020 04:30:52 -0600 Subject: [PATCH] Expose probe_dns_duration_seconds metric (#662) * Expose probe_dns_duration_seconds metric This follows the same pattern as probe_http_duration_seconds and probe_icmp_duration_seconds: it captures the time it takes to perform the actual request, excluding the time it takes to set up things. probe_duration_seconds includes the time to perform the request as well as all the time it takes to set it up. probe_dns_lookup_time_seconds refers to the time it takes to resolve the target's address, so it doesn't capture the time it takes to make one request for that target. Follow probe_http_duration_seconds and add a "phase" label to probe_dns_duration_seconds, which splits the time it takes to complete the check into "resolve" (time to resolve the target name), "connect" (time to connect to the DNS server) and "request" (time to send the query and retrieve the response). Signed-off-by: Marcelo E. Magallon --- prober/dns.go | 22 ++++++++++-- prober/dns_test.go | 46 +++++++++++++++++++++++++ prober/http_test.go | 47 +++++++------------------- prober/utils_test.go | 80 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 158 insertions(+), 37 deletions(-) diff --git a/prober/dns.go b/prober/dns.go index 5ecb6f0..4b3aeac 100644 --- a/prober/dns.go +++ b/prober/dns.go @@ -126,6 +126,10 @@ func validRcode(rcode int, valid []string, logger log.Logger) bool { func ProbeDNS(ctx context.Context, target string, module config.Module, registry *prometheus.Registry, logger log.Logger) bool { var dialProtocol string + probeDNSDurationGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "probe_dns_duration_seconds", + Help: "Duration of DNS request by phase", + }, []string{"phase"}) probeDNSAnswerRRSGauge := prometheus.NewGauge(prometheus.GaugeOpts{ Name: "probe_dns_answer_rrs", Help: "Returns number of entries in the answer resource record list", @@ -138,6 +142,12 @@ func ProbeDNS(ctx context.Context, target string, module config.Module, registry Name: "probe_dns_additional_rrs", Help: "Returns number of entries in the additional resource record list", }) + + for _, lv := range []string{"resolve", "connect", "request"} { + probeDNSDurationGaugeVec.WithLabelValues(lv) + } + + registry.MustRegister(probeDNSDurationGaugeVec) registry.MustRegister(probeDNSAnswerRRSGauge) registry.MustRegister(probeDNSAuthorityRRSGauge) registry.MustRegister(probeDNSAdditionalRRSGauge) @@ -182,11 +192,12 @@ func ProbeDNS(ctx context.Context, target string, module config.Module, registry } targetAddr = target } - ip, _, err = chooseProtocol(ctx, module.DNS.IPProtocol, module.DNS.IPProtocolFallback, targetAddr, registry, logger) + ip, lookupTime, err := chooseProtocol(ctx, module.DNS.IPProtocol, module.DNS.IPProtocolFallback, targetAddr, registry, logger) if err != nil { level.Error(logger).Log("msg", "Error resolving address", "err", err) return false } + probeDNSDurationGaugeVec.WithLabelValues("resolve").Add(lookupTime) targetIP := net.JoinHostPort(ip.String(), port) if ip.IP.To4() == nil { @@ -246,7 +257,14 @@ func ProbeDNS(ctx context.Context, target string, module config.Module, registry level.Info(logger).Log("msg", "Making DNS query", "target", targetIP, "dial_protocol", dialProtocol, "query", module.DNS.QueryName, "type", qt, "class", qc) timeoutDeadline, _ := ctx.Deadline() client.Timeout = time.Until(timeoutDeadline) - response, _, err := client.Exchange(msg, targetIP) + requestStart := time.Now() + response, rtt, err := client.Exchange(msg, targetIP) + // The rtt value returned from client.Exchange includes only the time to + // exchange messages with the server _after_ the connection is created. + // We compute the connection time as the total time for the operation + // minus the time for the actual request rtt. + probeDNSDurationGaugeVec.WithLabelValues("connect").Set((time.Since(requestStart) - rtt).Seconds()) + probeDNSDurationGaugeVec.WithLabelValues("request").Set(rtt.Seconds()) if err != nil { level.Error(logger).Log("msg", "Error while sending a DNS query", "err", err) return false diff --git a/prober/dns_test.go b/prober/dns_test.go index 75dc7c3..ef2d2b0 100644 --- a/prober/dns_test.go +++ b/prober/dns_test.go @@ -570,3 +570,49 @@ func TestDNSProtocol(t *testing.T) { } } + +// TestDNSMetrics checks that calling ProbeDNS populates the expected +// set of metrics for a DNS probe, but it does not test that those +// metrics contain specific values. +func TestDNSMetrics(t *testing.T) { + server, addr := startDNSServer("udp", recursiveDNSHandler) + defer server.Shutdown() + + _, port, _ := net.SplitHostPort(addr.String()) + + module := config.Module{ + Timeout: time.Second, + DNS: config.DNSProbe{ + IPProtocol: "ip4", + IPProtocolFallback: true, + QueryName: "example.com", + }, + } + registry := prometheus.NewRegistry() + testCTX, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + result := ProbeDNS(testCTX, net.JoinHostPort("localhost", port), module, registry, log.NewNopLogger()) + if !result { + t.Fatalf("DNS test connection failed, expected success.") + } + mfs, err := registry.Gather() + if err != nil { + t.Fatal(err) + } + + expectedMetrics := map[string]map[string]map[string]struct{}{ + "probe_dns_lookup_time_seconds": nil, + "probe_dns_duration_seconds": { + "phase": { + "resolve": {}, + "connect": {}, + "request": {}, + }, + }, + "probe_dns_answer_rrs": nil, + "probe_dns_authority_rrs": nil, + "probe_dns_additional_rrs": nil, + } + + checkMetrics(expectedMetrics, mfs, t) +} diff --git a/prober/http_test.go b/prober/http_test.go index 3d3f63e..b2c28ae 100644 --- a/prober/http_test.go +++ b/prober/http_test.go @@ -702,42 +702,19 @@ func TestHTTPPhases(t *testing.T) { t.Fatal(err) } - found := false - foundLabel := map[string]bool{ - "connect": false, - "processing": false, - "resolve": false, - "transfer": false, - "tls": false, - } - for _, mf := range mfs { - if mf.GetName() == "probe_http_duration_seconds" { - found = true - for _, metric := range mf.GetMetric() { - for _, lp := range metric.Label { - if lp.GetName() == "phase" { - f, ok := foundLabel[lp.GetValue()] - if !ok { - t.Fatalf("Unexpected label phase=%s", lp.GetValue()) - } - if f { - t.Fatalf("Label phase=%s duplicated", lp.GetValue()) - } - foundLabel[lp.GetValue()] = true - } - } - } - - } - } - if !found { - t.Fatal("probe_http_duration_seconds not found") - } - for lv, found := range foundLabel { - if !found { - t.Fatalf("Label phase=%s not found", lv) - } + expectedMetrics := map[string]map[string]map[string]struct{}{ + "probe_http_duration_seconds": { + "phase": { + "connect": {}, + "processing": {}, + "resolve": {}, + "transfer": {}, + "tls": {}, + }, + }, } + + checkMetrics(expectedMetrics, mfs, t) } func TestCookieJar(t *testing.T) { diff --git a/prober/utils_test.go b/prober/utils_test.go index 2ac38ee..a53ee49 100644 --- a/prober/utils_test.go +++ b/prober/utils_test.go @@ -171,3 +171,83 @@ func TestChooseProtocol(t *testing.T) { t.Error("without fallback it should not answer") } } + +func checkMetrics(expected map[string]map[string]map[string]struct{}, mfs []*dto.MetricFamily, t *testing.T) { + type ( + valueValidation struct { + found bool + } + labelValidation struct { + found bool + values map[string]valueValidation + } + metricValidation struct { + found bool + labels map[string]labelValidation + } + ) + + foundMetrics := map[string]metricValidation{} + + for mname, labels := range expected { + var mv metricValidation + if labels != nil { + mv.labels = map[string]labelValidation{} + for lname, values := range labels { + var lv labelValidation + if values != nil { + lv.values = map[string]valueValidation{} + for vname, _ := range values { + lv.values[vname] = valueValidation{} + } + } + mv.labels[lname] = lv + } + } + foundMetrics[mname] = mv + } + + for _, mf := range mfs { + info, wanted := foundMetrics[mf.GetName()] + if !wanted { + continue + } + info.found = true + for _, metric := range mf.GetMetric() { + if info.labels == nil { + continue + } + for _, lp := range metric.Label { + if label, labelWanted := info.labels[lp.GetName()]; labelWanted { + label.found = true + if label.values != nil { + if value, wanted := label.values[lp.GetValue()]; !wanted { + t.Fatalf("Unexpected label %s=%s", lp.GetName(), lp.GetValue()) + } else if value.found { + t.Fatalf("Label %s=%s duplicated", lp.GetName(), lp.GetValue()) + } + label.values[lp.GetValue()] = valueValidation{found: true} + } + info.labels[lp.GetName()] = label + } + } + } + foundMetrics[mf.GetName()] = info + } + + for mname, m := range foundMetrics { + if !m.found { + t.Fatalf("metric %s wanted, not found", mname) + } + for lname, label := range m.labels { + if !label.found { + t.Fatalf("metric %s, label %s wanted, not found", mname, lname) + } + for vname, value := range label.values { + if !value.found { + t.Fatalf("metric %s, label %s, value %s wanted, not found", mname, lname, vname) + } + } + } + } +} -- 2.25.1