Add metric for internode RPC calls errors (#11669)

This commit is contained in:
Anis Elleuch 2021-03-01 21:31:33 +01:00 committed by GitHub
parent bbd1244a88
commit e8d8dfa3ae
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 36 additions and 0 deletions

View file

@ -346,6 +346,16 @@ func getBucketObjectDistributionMD() MetricDescription {
Type: histogramMetric,
}
}
func getInternodeFailedRequests() MetricDescription {
return MetricDescription{
Namespace: interNodeMetricNamespace,
Subsystem: trafficSubsystem,
Name: errorsTotal,
Help: "Total number of failed internode calls.",
Type: counterMetric,
}
}
func getInterNodeSentBytesMD() MetricDescription {
return MetricDescription{
Namespace: interNodeMetricNamespace,
@ -982,6 +992,10 @@ func getNetworkMetrics() MetricsGroup {
return MetricsGroup{
Metrics: []Metric{},
initialize: func(ctx context.Context, metrics *MetricsGroup) {
metrics.Metrics = append(metrics.Metrics, Metric{
Description: getInternodeFailedRequests(),
Value: float64(loadAndResetRPCNetworkErrsCounter()),
})
connStats := globalConnStats.toServerConnStats()
metrics.Metrics = append(metrics.Metrics, Metric{
Description: getInterNodeSentBytesMD(),

View file

@ -42,6 +42,19 @@ const (
closed
)
// Hold the number of failed RPC calls due to networking errors
var networkErrsCounter uint64
// GetNetworkErrsCounter returns the number of failed RPC requests
func GetNetworkErrsCounter() uint64 {
return atomic.LoadUint64(&networkErrsCounter)
}
// ResetNetworkErrsCounter resets the number of failed RPC requests
func ResetNetworkErrsCounter() {
atomic.StoreUint64(&networkErrsCounter, 0)
}
// NetworkError - error type in case of errors related to http/transport
// for ex. connection refused, connection reset, dns resolution failure etc.
// All errors returned by storage-rest-server (ex errFileNotFound, errDiskNotFound) are not considered to be network errors.
@ -120,6 +133,7 @@ func (c *Client) Call(ctx context.Context, method string, values url.Values, bod
resp, err := c.httpClient.Do(req)
if err != nil {
if c.HealthCheckFn != nil && xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) {
atomic.AddUint64(&networkErrsCounter, 1)
if c.MarkOffline() {
logger.LogIf(ctx, fmt.Errorf("Marking %s temporary offline; caused by %w", c.url.String(), err))
}

View file

@ -43,6 +43,7 @@ import (
"github.com/gorilla/mux"
xhttp "github.com/minio/minio/cmd/http"
"github.com/minio/minio/cmd/logger"
"github.com/minio/minio/cmd/rest"
"github.com/minio/minio/pkg/certs"
"github.com/minio/minio/pkg/handlers"
"github.com/minio/minio/pkg/madmin"
@ -883,3 +884,10 @@ func decodeDirObject(object string) string {
}
return object
}
// This is used by metrics to show the number of failed RPC calls
// between internodes
func loadAndResetRPCNetworkErrsCounter() uint64 {
defer rest.ResetNetworkErrsCounter()
return rest.GetNetworkErrsCounter()
}