Monitoring and Metrics
Overview
BuildGrid’s monitoring is off by default. When enabled, BuildGrid will output a stream of monitoring messages to a configurable destination. Unless using StatsD as the serialization format, these messages will include both metrics and server logs.
Monitoring is enabled in the configuration file, which also controls the metric name prefix, serialization format, endpoint type, and endpoint location.
Serialization Format
BuildGrid allows the monitoring messages to be serialized using Protobuf wire format, ProtoJSON, or a StatsD-style text format. For details about these messages, see the Protobuf format section below.
Binary format: BuildGrid will serialize the message to Protobuf wire format. Each serialized message is prepended with a uint32 containing the size of the message in bytes; see the reading sketch after this list.
JSON format: BuildGrid will serialize the message to ProtoJSON.
StatsD format: BuildGrid will publish the metrics in the StatsD format. Log messages are excluded.
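For consumers of the binary stream, this framing means each record can be recovered by reading the 4-byte size prefix and then that many bytes of payload. The following is a minimal sketch of that loop in Python; the little-endian byte order, the function name, and the example file path are illustrative assumptions rather than part of BuildGrid's documented interface.

import struct
from typing import BinaryIO, Iterator

def iter_framed_payloads(stream: BinaryIO) -> Iterator[bytes]:
    """Yield each length-prefixed payload from a binary monitoring stream.

    Assumes a 4-byte little-endian size prefix before every message;
    verify the byte order against your BuildGrid version.
    """
    while True:
        prefix = stream.read(4)
        if len(prefix) < 4:
            return  # End of stream.
        (size,) = struct.unpack("<I", prefix)
        payload = stream.read(size)
        if len(payload) < size:
            return  # Truncated trailing message.
        yield payload

# Example: count the messages written to a hypothetical 'file' endpoint.
with open("monitoring_output.bin", "rb") as f:
    print(sum(1 for _ in iter_framed_payloads(f)))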
Supported Endpoints
BuildGrid supports publishing metrics and logs to one of four endpoint types:
stdout
file (path to file)
UNIX domain socket (socket address)
UDP address (address:port)
Configuration
Monitoring configuration lives in the monitoring section of the
configuration file. Refer to the reference configuration to see this in context.
monitoring:
  # Whether or not to activate the monitoring subsystem. Defaults to `false`.
  enabled: true
  # Type of the monitoring bus endpoint.
  # stdout - Standard output stream.
  # file - On-disk file.
  # socket - UNIX domain socket.
  # udp - Port listening for UDP packets.
  endpoint-type: socket
  # Location for the monitoring bus endpoint. Only
  # necessary for 'file', 'socket', and 'udp' `endpoint-type`.
  # Full path is expected for 'file', name
  # only for 'socket', and `hostname:port` for 'udp'.
  endpoint-location: monitoring_bus_socket
  # Messages serialisation format.
  # binary - Protobuf binary format.
  # json - JSON format.
  # statsd - StatsD format. Only metrics are kept - logs are dropped.
  serialization-format: binary
  # Prefix to prepend to the metric name before writing
  # to the configured endpoint.
  metric-prefix: buildgrid
  # Format to use to apply metadata tags to metrics when using `statsd`
  # serialization format. Defaults to `influx-statsd`.
  # none - No tags, plain StatsD format, `name:value|type`
  # influx-statsd - InfluxDB-style, `name,tag=tag_value:value|type`
  # dogstatsd - Datadog-style, `name:value|type|#tag:tag_value`
  # graphite - Graphite-style, `name;tag=tag_value:value|type`
  tag-format: influx-statsd
  # Additional key/value pairs to add as tags when using `statsd`
  # serialization format.
  additional-tags:
    example: value
Metrics
All the metrics that BuildGrid can publish are listed below. Where appropriate, these metrics are annotated with contextual information about the instance name, service, and RPC name they relate to, along with any other useful metadata.
When using the statsd serialization format, this metadata is serialized as tags on the metric using the configured tag-format. If unspecified, this defaults to InfluxDB-style tags (a comma-separated list of key=value tags appended to the metric name).
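As an illustration, a single RPC duration timing tagged with an instance name would be rendered roughly as follows under each tag-format (the value, the instance tag value, and the dot-joined metric-prefix are made-up examples):

none:           buildgrid.rpc.duration.ms:12.5|ms
influx-statsd:  buildgrid.rpc.duration.ms,instance=dev:12.5|ms
dogstatsd:      buildgrid.rpc.duration.ms:12.5|ms|#instance:dev
graphite:       buildgrid.rpc.duration.ms;instance=dev:12.5|ms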
- class buildgrid.server.metrics_names.METRIC
  - class RPC
    - DURATION = 'rpc.duration.ms'
    - INPUT_BYTES = 'rpc.input_bytes.count'
    - OUTPUT_BYTES = 'rpc.output_bytes.count'
    - AUTH_DURATION = 'rpc.auth.duration.ms'
  - class ACTION_CACHE
    - INVALID_CACHE_COUNT = 'action_cache.invalid_cache.count'
    - MIRRORED_MATCH_COUNT = 'action_cache.mirrored_matches.count'
    - MIRRORED_MISMATCH_COUNT = 'action_cache.mirrored_mismatches.count'
    - RESULT_AGE = 'action_cache.result_age.ms'
  - class CAS
    - BLOBS_COUNT = 'cas.blobs.count'
    - BLOBS_MISSING_COUNT = 'cas.blobs_missing.count'
    - BLOBS_MISSING_PERCENT = 'cas.blobs_missing.percent'
    - BLOB_BYTES = 'cas.blob_bytes.count'
    - TREE_CACHE_HIT_COUNT = 'cas.tree_cache_hit.count'
    - TREE_CACHE_MISS_COUNT = 'cas.tree_cache_miss.count'
  - class STORAGE
    - STAT_DURATION = 'storage.stat.duration.ms'
    - BULK_STAT_DURATION = 'storage.bulk_stat.duration.ms'
    - READ_DURATION = 'storage.read.duration.ms'
    - STREAM_READ_DURATION = 'storage.stream_read.duration.ms'
    - BULK_READ_DURATION = 'storage.bulk_read.duration.ms'
    - DELETE_DURATION = 'storage.delete_blob.duration.ms'
    - BULK_DELETE_DURATION = 'storage.bulk_delete.duration.ms'
    - DELETE_ERRORS_COUNT = 'storage.delete_errors.count'
    - WRITE_DURATION = 'storage.write.duration.ms'
    - STREAM_WRITE_DURATION = 'storage.stream_write.duration.ms'
    - BULK_WRITE_DURATION = 'storage.bulk_write.duration.ms'
    - GET_TREE_DURATION = 'storage.get_tree.duration.ms'
  - class WITH_CACHE
    - CACHE_HIT_COUNT = 'storage.with_cache.cache_hit.count'
    - CACHE_MISS_COUNT = 'storage.with_cache.cache_miss.count'
    - CACHE_HIT_PERCENT = 'storage.with_cache.cache_hit.percent'
  - class SQL_INDEX
    - UPDATE_TIMESTAMP_DURATION = 'storage.sql_index.update_timestamp.duration.ms'
    - SAVE_DIGESTS_DURATION = 'storage.sql_index.save_digest.duration.ms'
    - SIZE_CALCULATION_DURATION = 'storage.sql_index.size_calculation.duration.ms'
    - DELETE_N_BYTES_DURATION = 'storage.sql_index.delete_n_bytes.duration.ms'
    - BULK_DELETE_INDEX_DURATION = 'storage.sql_index.bulk_delete_index.duration.ms'
  - class REPLICATED
    - REQUIRED_REPLICATION_COUNT = 'storage.replicated.required_replication.count'
    - REPLICATION_COUNT = 'storage.replicated.replication.count'
    - REPLICATION_QUEUE_FULL_COUNT = 'storage.replicated.replication_queue_full.count'
    - REPLICATION_ERROR_COUNT = 'storage.replicated.replication.errors.count'
  - class CLEANUP
    - DURATION = 'cleanup.duration.ms'
    - BATCH_DURATION = 'cleanup.batch.duration.ms'
    - BLOBS_DELETED_PER_SECOND = 'cleanup.blobs_deleted.per_second'
    - BYTES_DELETED_PER_SECOND = 'cleanup.bytes_deleted.per_second'
    - BYTES_DELETED_COUNT = 'cleanup.bytes_deleted.count'
    - TOTAL_BYTES_COUNT = 'cleanup.total_bytes.count'
    - LOW_WATERMARK_BYTES_COUNT = 'cleanup.low_watermark_bytes.count'
    - HIGH_WATERMARK_BYTES_COUNT = 'cleanup.high_watermark_bytes.count'
    - TOTAL_BYTES_WATERMARK_PERCENT = 'cleanup.total_bytes_watermark.percent'
    - TOTAL_BLOBS_COUNT = 'cleanup.total_blobs.count'
    - LOW_WATERMARK_BLOBS_COUNT = 'cleanup.low_watermark_blobs.count'
    - HIGH_WATERMARK_BLOBS_COUNT = 'cleanup.high_watermark_blobs.count'
    - TOTAL_BLOBS_WATERMARK_PERCENT = 'cleanup.total_blobs_watermark.percent'
  - class SCHEDULER
    - JOB_COUNT = 'scheduler.jobs.count'
    - BOTS_COUNT = 'scheduler.bots.count'
    - AVAILABLE_CAPACITY_COUNT = 'scheduler.available_bot_capacity.count'
    - ASSIGNMENT_DURATION = 'scheduler.assignment.duration.ms'
    - SYNCHRONIZE_DURATION = 'scheduler.synchronize.duration.ms'
    - ASSIGNMENT_RESPONSE_DURATION = 'scheduler.assignment-response.duration.ms'
    - PRUNE_DURATION = 'scheduler.prune.duration.ms'
    - PRUNE_COUNT = 'scheduler.prune.count'
    - QUEUE_TIMEOUT_DURATION = 'scheduler.queue_timeout.duration.ms'
    - QUEUE_TIMEOUT_COUNT = 'scheduler.queue_timeout.count'
    - EXECUTION_TIMEOUT_DURATION = 'scheduler.execution_timeout.duration.ms'
    - EXECUTION_TIMEOUT_COUNT = 'scheduler.execution_timeout.count'
    - COHORT_TOTAL_USAGE_COUNT = 'scheduler.cohort.total_usage.count'
    - COHORT_TOTAL_MIN_QUOTA_COUNT = 'scheduler.cohort.total_min_quota.count'
    - COHORT_TOTAL_MAX_QUOTA_COUNT = 'scheduler.cohort.total_max_quota.count'
  - class CONNECTIONS
    - CLIENT_COUNT = 'connections.clients.count'
    - WORKER_COUNT = 'connections.workers.count'
Protobuf format
When using binary or json as the serialization format, each entry
written to the endpoint will be an appropriately serialized BusMessage
containing either a LogRecord or a MetricRecord.
// Copyright (C) 2018 Bloomberg LP
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// <http://www.apache.org/licenses/LICENSE-2.0>
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package build.buildgrid;
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
message BusMessage {
  // The position of this message in the bus stream.
  int64 sequence_number = 1;
  // The carried message.
  oneof record {
    LogRecord log_record = 2;
    MetricRecord metric_record = 3;
  }
}
message LogRecord {
  // When the record has been created.
  google.protobuf.Timestamp creation_timestamp = 1;
  enum Level {
    NOTSET = 0;
    // Debug message severity level.
    DEBUG = 1;
    // Information message severity level.
    INFO = 2;
    // Warning message severity level.
    WARNING = 3;
    // Error message severity level.
    ERROR = 4;
    // Critical message severity level.
    CRITICAL = 5;
  }
  // The domain name for the record.
  string domain = 2;
  // The severity level of the record.
  Level level = 3;
  // The human-readable record's message.
  string message = 4;
  // An optional list of additional metadata.
  map<string, string> metadata = 5;
}
message MetricRecord {
  // When the metric has been created.
  google.protobuf.Timestamp creation_timestamp = 1;
  enum Type {
    NONE = 0;
    // A metric for counting.
    COUNTER = 1;
    // A metric for measuring a duration.
    TIMER = 2;
    // A metric with an arbitrary value.
    GAUGE = 3;
    // A metric with distribution semantics.
    DISTRIBUTION = 4;
  }
  // The type of metric, see Type.
  Type type = 3;
  // The name identifying the metric.
  string name = 4;
  // The carried value, depending on the metric's type.
  oneof data {
    // Set for Type.COUNTER and Type.DISTRIBUTION metrics.
    float count = 5;
    // Set for Type.TIMER metrics.
    google.protobuf.Duration duration = 6;
    // Set for Type.GAUGE metrics.
    float value = 7;
  }
  // An optional list of additional metadata.
  map<string, string> metadata = 8;
}
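As a rough sketch of consuming these records in Python, the snippet below assumes the proto file above has been compiled locally with protoc into a monitoring_pb2 module (the module name is an assumption; BuildGrid ships its own generated bindings) and that the framed payloads have already been extracted as shown in the Serialization Format section.

from google.protobuf import json_format

import monitoring_pb2  # Generated with `protoc --python_out=.` from the proto above (module name is an assumption).

def handle_payload(payload: bytes) -> None:
    """Decode a single framed payload into a BusMessage and print a summary."""
    message = monitoring_pb2.BusMessage()
    message.ParseFromString(payload)
    kind = message.WhichOneof("record")
    if kind == "metric_record":
        metric = message.metric_record
        metric_type = monitoring_pb2.MetricRecord.Type.Name(metric.type)
        print(f"#{message.sequence_number} metric {metric.name} ({metric_type}) {dict(metric.metadata)}")
    elif kind == "log_record":
        # Dump log records as JSON for readability.
        print(json_format.MessageToJson(message.log_record))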