Monitoring and Metrics

Overview

BuildGrid’s monitoring is off by default. When enabled, BuildGrid will output a stream of monitoring messages to a configurable destination. Unless using StatsD as the serialization format, these messages will include both metrics and server logs.

Monitoring can be enabled in the configuration file, which also holds the settings for the metric name prefix, serialization format, endpoint type, and endpoint location.

Serialization Format

BuildGrid allows the monitoring messages to be serialized in one of three formats: Protobuf wire format, ProtoJSON, or StatsD. For details about these messages, see Protobuf format.

  • Binary format: BuildGrid will serialize the message to Protobuf wire format. These serialized messages are prepended with a uint32 containing the size of the message in bytes.

  • JSON format: BuildGrid will serialize the message to ProtoJSON.

  • StatsD format: BuildGrid will publish the metrics in the StatsD format. Log messages are excluded.

Endpoints Supported

BuildGrid supports publishing metrics and logs to one of four locations.

  • stdout

  • file (path to file)

  • unix domain socket (socket address)

  • udp address (address:port)

Configuration

Monitoring configuration lives in the monitoring section of the configuration file. Refer to the reference configuration to see this in context.

monitoring:
  # Whether or not to activate the monitoring subsystem. Defaults to `false`.
  enabled: true

  # Type of the monitoring bus endpoint, i.e. where messages are published.
  #  stdout  - Standard output stream.
  #  file    - On-disk file.
  #  socket  - UNIX domain socket.
  #  udp     - UDP address (`hostname:port`).
  endpoint-type: socket

  # Location of the monitoring bus endpoint. Only necessary for the
  # 'file', 'socket', and 'udp' endpoint types; not needed for 'stdout'.
  # Expected value for each `endpoint-type`:
  #  file    - Full path to the file.
  #  socket  - Socket name only.
  #  udp     - `hostname:port`.
  endpoint-location: monitoring_bus_socket

  # Message serialization format.
  #  binary  - Protobuf wire format. Each serialized message is prefixed
  #            with a uint32 holding the message size in bytes.
  #  json    - ProtoJSON format.
  #  statsd  - StatsD format. Only metrics are kept - logs are dropped.
  serialization-format: binary

  # Prefix to prepend to the metric name before writing
  # to the configured endpoint.
  metric-prefix: buildgrid

  # Format used to attach metadata tags to metrics when using the `statsd`
  # serialization format. Defaults to `influx-statsd`.
  #  none           - No tags, plain StatsD format, `name:value|type`
  #  influx-statsd  - InfluxDB-style, `name,tag=tag_value:value|type`
  #  dogstatsd      - Datadog-style, `name:value|type|#tag:tag_value`
  #  graphite       - Graphite-style, `name;tag=tag_value:value|type`
  tag-format: influx-statsd

  # Additional key/value pairs to add as tags when using the `statsd`
  # serialization format.
  additional-tags:
    example: value

Metrics

All the possible metrics that BuildGrid can publish are listed here. Where appropriate, these metrics will be annotated with contextual information about the instance name, service, and RPC name they relate to, along with any other useful metadata.

When using the statsd serialization format, this metadata is serialized as tags on the metric using the configured tag-format. If unspecified, this defaults to InfluxDB-style tags (a comma-separated list of key=value tags appended to the metric name).

class buildgrid.server.metrics_names.METRIC
class RPC
DURATION = 'rpc.duration.ms'
INPUT_BYTES = 'rpc.input_bytes.count'
OUTPUT_BYTES = 'rpc.output_bytes.count'
AUTH_DURATION = 'rpc.auth.duration.ms'
class ACTION_CACHE
INVALID_CACHE_COUNT = 'action_cache.invalid_cache.count'
MIRRORED_MATCH_COUNT = 'action_cache.mirrored_matches.count'
MIRRORED_MISMATCH_COUNT = 'action_cache.mirrored_mismatches.count'
RESULT_AGE = 'action_cache.result_age.ms'
class CAS
BLOBS_COUNT = 'cas.blobs.count'
BLOBS_MISSING_COUNT = 'cas.blobs_missing.count'
BLOBS_MISSING_PERCENT = 'cas.blobs_missing.percent'
BLOB_BYTES = 'cas.blob_bytes.count'
TREE_CACHE_HIT_COUNT = 'cas.tree_cache_hit.count'
TREE_CACHE_MISS_COUNT = 'cas.tree_cache_miss.count'
class STORAGE
STAT_DURATION = 'storage.stat.duration.ms'
BULK_STAT_DURATION = 'storage.bulk_stat.duration.ms'
READ_DURATION = 'storage.read.duration.ms'
STREAM_READ_DURATION = 'storage.stream_read.duration.ms'
BULK_READ_DURATION = 'storage.bulk_read.duration.ms'
DELETE_DURATION = 'storage.delete_blob.duration.ms'
BULK_DELETE_DURATION = 'storage.bulk_delete.duration.ms'
DELETE_ERRORS_COUNT = 'storage.delete_errors.count'
WRITE_DURATION = 'storage.write.duration.ms'
STREAM_WRITE_DURATION = 'storage.stream_write.duration.ms'
BULK_WRITE_DURATION = 'storage.bulk_write.duration.ms'
GET_TREE_DURATION = 'storage.get_tree.duration.ms'
class WITH_CACHE
CACHE_HIT_COUNT = 'storage.with_cache.cache_hit.count'
CACHE_MISS_COUNT = 'storage.with_cache.cache_miss.count'
CACHE_HIT_PERCENT = 'storage.with_cache.cache_hit.percent'
class SQL_INDEX
UPDATE_TIMESTAMP_DURATION = 'storage.sql_index.update_timestamp.duration.ms'
SAVE_DIGESTS_DURATION = 'storage.sql_index.save_digest.duration.ms'
SIZE_CALCULATION_DURATION = 'storage.sql_index.size_calculation.duration.ms'
DELETE_N_BYTES_DURATION = 'storage.sql_index.delete_n_bytes.duration.ms'
BULK_DELETE_INDEX_DURATION = 'storage.sql_index.bulk_delete_index.duration.ms'
class REPLICATED
REQUIRED_REPLICATION_COUNT = 'storage.replicated.required_replication.count'
REPLICATION_COUNT = 'storage.replicated.replication.count'
REPLICATION_QUEUE_FULL_COUNT = 'storage.replicated.replication_queue_full.count'
REPLICATION_ERROR_COUNT = 'storage.replicated.replication.errors.count'
class S3
BLOB_AGE = 'storage.s3.total_age.ms'
BLOB_BYTES = 'storage.s3.blob_bytes.count'
class CLEANUP
DURATION = 'cleanup.duration.ms'
BATCH_DURATION = 'cleanup.batch.duration.ms'
BLOBS_DELETED_PER_SECOND = 'cleanup.blobs_deleted.per_second'
BYTES_DELETED_PER_SECOND = 'cleanup.bytes_deleted.per_second'
BYTES_DELETED_COUNT = 'cleanup.bytes_deleted.count'
TOTAL_BYTES_COUNT = 'cleanup.total_bytes.count'
LOW_WATERMARK_BYTES_COUNT = 'cleanup.low_watermark_bytes.count'
HIGH_WATERMARK_BYTES_COUNT = 'cleanup.high_watermark_bytes.count'
TOTAL_BYTES_WATERMARK_PERCENT = 'cleanup.total_bytes_watermark.percent'
TOTAL_BLOBS_COUNT = 'cleanup.total_blobs.count'
LOW_WATERMARK_BLOBS_COUNT = 'cleanup.low_watermark_blobs.count'
HIGH_WATERMARK_BLOBS_COUNT = 'cleanup.high_watermark_blobs.count'
TOTAL_BLOBS_WATERMARK_PERCENT = 'cleanup.total_blobs_watermark.percent'
class JANITOR
BLOB_AGE = 'cleanup.janitor.blob_age.ms'
BLOB_BYTES = 'cleanup.janitor.blob_bytes.count'
class SCHEDULER
JOB_COUNT = 'scheduler.jobs.count'
BOTS_COUNT = 'scheduler.bots.count'
AVAILABLE_CAPACITY_COUNT = 'scheduler.available_bot_capacity.count'
ASSIGNMENT_DURATION = 'scheduler.assignment.duration.ms'
SYNCHRONIZE_DURATION = 'scheduler.synchronize.duration.ms'
ASSIGNMENT_RESPONSE_DURATION = 'scheduler.assignment-response.duration.ms'
PRUNE_DURATION = 'scheduler.prune.duration.ms'
PRUNE_COUNT = 'scheduler.prune.count'
QUEUE_TIMEOUT_DURATION = 'scheduler.queue_timeout.duration.ms'
QUEUE_TIMEOUT_COUNT = 'scheduler.queue_timeout.count'
EXECUTION_TIMEOUT_DURATION = 'scheduler.execution_timeout.duration.ms'
EXECUTION_TIMEOUT_COUNT = 'scheduler.execution_timeout.count'
COHORT_TOTAL_USAGE_COUNT = 'scheduler.cohort.total_usage.count'
COHORT_TOTAL_MIN_QUOTA_COUNT = 'scheduler.cohort.total_min_quota.count'
COHORT_TOTAL_MAX_QUOTA_COUNT = 'scheduler.cohort.total_max_quota.count'
class CONNECTIONS
CLIENT_COUNT = 'connections.clients.count'
WORKER_COUNT = 'connections.workers.count'
class SQL
SQL_SESSION_COUNT_TEMPLATE = 'sql.session.count.{name}'
SQL_ACTIVE_SESSION_GAUGE_TEMPLATE = 'sql.active.session.gauge.{name}'
class JOB
DURATION = 'job.duration.ms'

Protobuf format

When using binary or json as the serialization format, each entry written to the endpoint will be an appropriately serialized BusMessage containing either a LogRecord or a MetricRecord.

// Copyright (C) 2018 Bloomberg LP
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  <http://www.apache.org/licenses/LICENSE-2.0>
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package build.buildgrid;

import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";

// Envelope for one entry written to the monitoring endpoint. It carries
// either a LogRecord or a MetricRecord in the `record` oneof.
message BusMessage {
  // The position of this message in the bus stream.
  int64 sequence_number = 1;

  // The carried message. The members are mutually exclusive.
  oneof record {
    // Set when the entry is a log record.
    LogRecord log_record = 2;
    // Set when the entry is a metric record.
    MetricRecord metric_record = 3;
  }
}

// A single log entry, carried inside a BusMessage.
message LogRecord {
  // When the record was created.
  google.protobuf.Timestamp creation_timestamp = 1;

  // Severity levels a log record can be tagged with.
  enum Level {
    // No severity level set. As the zero value, this is what an unset
    // `level` field reads as.
    NOTSET = 0;
    // Debug message severity level.
    DEBUG = 1;
    // Information message severity level.
    INFO = 2;
    // Warning message severity level.
    WARNING = 3;
    // Error message severity level.
    ERROR = 4;
    // Critical message severity level.
    CRITICAL = 5;
  }

  // The domain name for the record.
  string domain = 2;

  // The severity level of the record, see Level.
  Level level = 3;

  // The human-readable message of the record.
  string message = 4;

  // Optional additional metadata, as string key/value pairs.
  map<string, string> metadata = 5;
}

// A single metric sample, carried inside a BusMessage.
message MetricRecord {
  // Field number 2 is not assigned to any field of this message. It is
  // reserved so that a future field cannot pick the number up by accident
  // and be misread by peers built against another revision of the schema.
  // NOTE(review): presumably left behind by a removed field - confirm
  // against the schema history.
  reserved 2;

  // When the metric was created.
  google.protobuf.Timestamp creation_timestamp = 1;

  // The kinds of metric a record can describe.
  enum Type {
    // No metric type set. As the zero value, this is what an unset
    // `type` field reads as.
    NONE = 0;
    // A metric for counting.
    COUNTER = 1;
    // A metric for measuring a duration.
    TIMER = 2;
    // A metric holding an arbitrary value.
    GAUGE = 3;
    // A metric with distribution semantics.
    DISTRIBUTION = 4;
  }

  // The type of metric, see Type.
  Type type = 3;

  // The name identifying the metric.
  string name = 4;

  // The carried value, depending on the metric's type. The members are
  // mutually exclusive.
  oneof data {
    // Set for Type.COUNTER and Type.DISTRIBUTION metrics.
    // Note: `float` is single precision, so very large counts cannot be
    // represented exactly.
    float count = 5;
    // Set for Type.TIMER metrics.
    google.protobuf.Duration duration = 6;
    // Set for Type.GAUGE metrics.
    float value = 7;
  }

  // Optional additional metadata, as string key/value pairs.
  map<string, string> metadata = 8;
}