diff --git a/CHANGELOG.md b/CHANGELOG.md
index 00ff7a81c2..2bc8af553d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,30 @@
 # Changelog

 ## master / unreleased

+* [FEATURE] Ruler: Add new `-ruler.query-stats-enabled` flag which, when enabled, reports `cortex_ruler_query_seconds_total` as a per-user metric that tracks the sum of the wall time of executing queries in the ruler, in seconds. #4317
+* [FEATURE] Add shuffle sharding grouper and planner within compactor to allow further work towards parallelizing compaction. #4318
 * [CHANGE] Querier / ruler: Change `-querier.max-fetched-chunks-per-query` configuration to limit the maximum number of chunks that can be fetched in a single query. The number of chunks fetched by ingesters AND long-term storage combined should not exceed the value configured on `-querier.max-fetched-chunks-per-query`. #4260
+* [CHANGE] Memberlist: the `memberlist_kv_store_value_bytes` metric has been removed, because values are no longer stored in-memory as encoded bytes. #4345
+* [ENHANCEMENT] Add timeout for waiting on compactor to become ACTIVE in the ring. #4262
+* [ENHANCEMENT] Reduce memory used by streaming queries, particularly in ruler. #4341
+* [ENHANCEMENT] Ring: allow experimental configuration of disabling heartbeat timeouts by setting the relevant configuration value to zero. Applies to the following: #4342
+  * `-distributor.ring.heartbeat-timeout`
+  * `-ring.heartbeat-timeout`
+  * `-ruler.ring.heartbeat-timeout`
+  * `-alertmanager.sharding-ring.heartbeat-timeout`
+  * `-compactor.ring.heartbeat-timeout`
+  * `-store-gateway.sharding-ring.heartbeat-timeout`
+* [ENHANCEMENT] Ring: allow heartbeats to be explicitly disabled by setting the interval to zero. This is considered experimental. This applies to the following configuration options: #4344
+  * `-distributor.ring.heartbeat-period`
+  * `-ingester.heartbeat-period`
+  * `-ruler.ring.heartbeat-period`
+  * `-alertmanager.sharding-ring.heartbeat-period`
+  * `-compactor.ring.heartbeat-period`
+  * `-store-gateway.sharding-ring.heartbeat-period`
+* [ENHANCEMENT] Memberlist: optimized receive path for processing ring state updates, to help reduce CPU utilization in large clusters. #4345
+* [ENHANCEMENT] Memberlist: expose configuration of memberlist packet compression via `-memberlist.compression-enabled`. #4346
 * [BUGFIX] HA Tracker: when cleaning up obsolete elected replicas from KV store, tracker didn't update the number of clusters per user correctly. #4336

 ## 1.10.0-rc.0 / 2021-06-28

@@ -17,6 +38,14 @@
 * [CHANGE] Change default value of `-server.grpc.keepalive.min-time-between-pings` from `5m` to `10s` and `-server.grpc.keepalive.ping-without-stream-allowed` to `true`. #4168
 * [CHANGE] Ingester: Change default value of `-ingester.active-series-metrics-enabled` to `true`. This incurs a small increase in memory usage, between 1.2% and 1.6% as measured on ingesters with 1.3M active series. #4257
 * [CHANGE] Dependency: update go-redis from v8.2.3 to v8.9.0. #4236
+* [CHANGE] Memberlist: Expose default configuration values to the command line options. Note that setting these explicitly to zero will no longer cause the default to be used. If the default is desired, then do not set the option.
The following are affected: #4276
+  - `-memberlist.stream-timeout`
+  - `-memberlist.retransmit-factor`
+  - `-memberlist.pull-push-interval`
+  - `-memberlist.gossip-interval`
+  - `-memberlist.gossip-nodes`
+  - `-memberlist.gossip-to-dead-nodes-time`
+  - `-memberlist.dead-node-reclaim-time`
 * [FEATURE] Querier: Added new `-querier.max-fetched-series-per-query` flag. When Cortex is running with blocks storage, the max series per query limit is enforced in the querier and applies to unique series received from ingesters and store-gateway (long-term storage). #4179
 * [FEATURE] Querier/Ruler: Added new `-querier.max-fetched-chunk-bytes-per-query` flag. When Cortex is running with blocks storage, the max chunk bytes limit is enforced in the querier and ruler and limits the size of all aggregated chunks returned from ingesters and storage as bytes for a query. #4216
 * [FEATURE] Alertmanager: support negative matchers, time-based muting - [upstream release notes](https://github.com/prometheus/alertmanager/releases/tag/v0.22.0). #4237
diff --git a/docs/_index.md b/docs/_index.md
index b97ef3a463..a11bead3d9 100644
--- a/docs/_index.md
+++ b/docs/_index.md
@@ -36,6 +36,9 @@ should read:
 1. [Getting started with Cortex](getting-started/_index.md)
 1. [Information regarding configuring Cortex](configuration/_index.md)

+There are also individual [guides](guides/_index.md) to many tasks.
+Please review the important [security advice](guides/security.md) before deploying.
+
 For a guide to contributing to Cortex, see the [contributor guidelines](contributing/).

 ## Further reading
diff --git a/docs/blocks-storage/compactor.md b/docs/blocks-storage/compactor.md
index 213405822c..c8620d3bf9 100644
--- a/docs/blocks-storage/compactor.md
+++ b/docs/blocks-storage/compactor.md
@@ -173,6 +173,11 @@ compactor:
   # CLI flag: -compactor.sharding-enabled
   [sharding_enabled: <boolean> | default = false]

+  # The sharding strategy to use. Supported values are: default,
+  # shuffle-sharding.
+  # CLI flag: -compactor.sharding-strategy
+  [sharding_strategy: <string> | default = "default"]
+
   sharding_ring:
     kvstore:
       # Backend storage to use for the ring. Supported values are: consul, etcd,
@@ -209,12 +214,12 @@ compactor:
       # CLI flag: -compactor.ring.multi.mirror-timeout
       [mirror_timeout: <duration> | default = 2s]

-    # Period at which to heartbeat to the ring.
+    # Period at which to heartbeat to the ring. 0 = disabled.
     # CLI flag: -compactor.ring.heartbeat-period
     [heartbeat_period: <duration> | default = 5s]

     # The heartbeat timeout after which compactors are considered unhealthy
-    # within the ring.
+    # within the ring. 0 = never (timeout disabled).
     # CLI flag: -compactor.ring.heartbeat-timeout
     [heartbeat_timeout: <duration> | default = 1m]

@@ -230,4 +235,8 @@ compactor:
     # Name of network interface to read address from.
     # CLI flag: -compactor.ring.instance-interface-names
     [instance_interface_names: <list of string> | default = [eth0 en0]]
+
+    # Timeout for waiting on compactor to become ACTIVE in the ring.
+    # CLI flag: -compactor.ring.wait-active-instance-timeout
+    [wait_active_instance_timeout: <duration> | default = 10m]
 ```
diff --git a/docs/blocks-storage/store-gateway.md b/docs/blocks-storage/store-gateway.md
index d24813be5f..a00297a445 100644
--- a/docs/blocks-storage/store-gateway.md
+++ b/docs/blocks-storage/store-gateway.md
@@ -232,13 +232,13 @@ store_gateway:
       # CLI flag: -store-gateway.sharding-ring.multi.mirror-timeout
       [mirror_timeout: <duration> | default = 2s]

-    # Period at which to heartbeat to the ring.
+    # Period at which to heartbeat to the ring. 0 = disabled.
    # CLI flag: -store-gateway.sharding-ring.heartbeat-period
     [heartbeat_period: <duration> | default = 15s]

     # The heartbeat timeout after which store gateways are considered unhealthy
-    # within the ring. This option needs be set both on the store-gateway and
-    # querier when running in microservices mode.
+    # within the ring. 0 = never (timeout disabled). This option needs to be set
+    # both on the store-gateway and querier when running in microservices mode.
     # CLI flag: -store-gateway.sharding-ring.heartbeat-timeout
     [heartbeat_timeout: <duration> | default = 1m]
diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md
index 9b20d81bea..0f70a3f3e5 100644
--- a/docs/configuration/config-file-reference.md
+++ b/docs/configuration/config-file-reference.md
@@ -563,12 +563,12 @@ ring:
     # CLI flag: -distributor.ring.multi.mirror-timeout
     [mirror_timeout: <duration> | default = 2s]

-  # Period at which to heartbeat to the ring.
+  # Period at which to heartbeat to the ring. 0 = disabled.
   # CLI flag: -distributor.ring.heartbeat-period
   [heartbeat_period: <duration> | default = 5s]

   # The heartbeat timeout after which distributors are considered unhealthy
-  # within the ring.
+  # within the ring. 0 = never (timeout disabled).
   # CLI flag: -distributor.ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]

@@ -662,6 +662,7 @@ lifecycler:
     [mirror_timeout: <duration> | default = 2s]

   # The heartbeat timeout after which ingesters are skipped for reads/writes.
+  # 0 = never (timeout disabled).
   # CLI flag: -ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]

@@ -678,7 +679,7 @@ lifecycler:
   # CLI flag: -ingester.num-tokens
   [num_tokens: <int> | default = 128]

-  # Period at which to heartbeat to consul.
+  # Period at which to heartbeat to consul. 0 = disabled.
   # CLI flag: -ingester.heartbeat-period
   [heartbeat_period: <duration> | default = 5s]

@@ -1580,12 +1581,12 @@ ring:
     # CLI flag: -ruler.ring.multi.mirror-timeout
     [mirror_timeout: <duration> | default = 2s]

-  # Period at which to heartbeat to the ring.
+  # Period at which to heartbeat to the ring. 0 = disabled.
   # CLI flag: -ruler.ring.heartbeat-period
   [heartbeat_period: <duration> | default = 5s]

   # The heartbeat timeout after which rulers are considered unhealthy within the
-  # ring.
+  # ring. 0 = never (timeout disabled).
   # CLI flag: -ruler.ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]

@@ -1616,6 +1617,11 @@ ring:
 # processing will ignore them instead. Subject to sharding.
 # CLI flag: -ruler.disabled-tenants
 [disabled_tenants: <string> | default = ""]
+
+# Report the wall time for ruler queries to complete as a per-user metric and as
+# an info-level log message.
+# CLI flag: -ruler.query-stats-enabled
+[query_stats_enabled: <boolean> | default = false]
 ```

 ### `ruler_storage_config`
@@ -1901,12 +1907,12 @@ sharding_ring:
     # CLI flag: -alertmanager.sharding-ring.multi.mirror-timeout
     [mirror_timeout: <duration> | default = 2s]

-  # Period at which to heartbeat to the ring.
+  # Period at which to heartbeat to the ring. 0 = disabled.
   # CLI flag: -alertmanager.sharding-ring.heartbeat-period
   [heartbeat_period: <duration> | default = 15s]

   # The heartbeat timeout after which alertmanagers are considered unhealthy
-  # within the ring.
+  # within the ring. 0 = never (timeout disabled).
   # CLI flag: -alertmanager.sharding-ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]

@@ -3761,36 +3767,40 @@ The `memberlist_config` configures the Gossip memberlist.
 [randomize_node_name: <boolean> | default = true]

 # The timeout for establishing a connection with a remote node, and for
-# read/write operations. Uses memberlist LAN defaults if 0.
+# read/write operations.
 # CLI flag: -memberlist.stream-timeout
-[stream_timeout: <duration> | default = 0s]
+[stream_timeout: <duration> | default = 10s]

 # Multiplication factor used when sending out messages (factor * log(N+1)).
 # CLI flag: -memberlist.retransmit-factor
-[retransmit_factor: <int> | default = 0]
+[retransmit_factor: <int> | default = 4]

-# How often to use pull/push sync. Uses memberlist LAN defaults if 0.
+# How often to use pull/push sync.
 # CLI flag: -memberlist.pullpush-interval
-[pull_push_interval: <duration> | default = 0s]
+[pull_push_interval: <duration> | default = 30s]

-# How often to gossip. Uses memberlist LAN defaults if 0.
+# How often to gossip.
 # CLI flag: -memberlist.gossip-interval
-[gossip_interval: <duration> | default = 0s]
+[gossip_interval: <duration> | default = 200ms]

-# How many nodes to gossip to. Uses memberlist LAN defaults if 0.
+# How many nodes to gossip to.
 # CLI flag: -memberlist.gossip-nodes
-[gossip_nodes: <int> | default = 0]
+[gossip_nodes: <int> | default = 3]

 # How long to keep gossiping to dead nodes, to give them a chance to refute their
-# death. Uses memberlist LAN defaults if 0.
+# death.
 # CLI flag: -memberlist.gossip-to-dead-nodes-time
-[gossip_to_dead_nodes_time: <duration> | default = 0s]
+[gossip_to_dead_nodes_time: <duration> | default = 30s]

-# How soon can dead node's name be reclaimed with new address. Defaults to 0,
-# which is disabled.
+# How soon a dead node's name can be reclaimed with a new address. 0 to disable.
 # CLI flag: -memberlist.dead-node-reclaim-time
 [dead_node_reclaim_time: <duration> | default = 0s]

+# Enable message compression. This can be used to reduce bandwidth usage at the
+# cost of slightly more CPU utilization.
+# CLI flag: -memberlist.compression-enabled
+[compression_enabled: <boolean> | default = true]
+
 # Other cluster members to join. Can be specified multiple times. It can be an
 # IP, hostname or an entry specified in the DNS Service Discovery format (see
 # https://cortexmetrics.io/docs/configuration/arguments/#dns-service-discovery
@@ -5138,6 +5148,10 @@ The `compactor_config` configures the compactor for the blocks storage.
 # CLI flag: -compactor.sharding-enabled
 [sharding_enabled: <boolean> | default = false]

+# The sharding strategy to use. Supported values are: default, shuffle-sharding.
+# CLI flag: -compactor.sharding-strategy
+[sharding_strategy: <string> | default = "default"]
+
 sharding_ring:
   kvstore:
     # Backend storage to use for the ring. Supported values are: consul, etcd,
@@ -5174,12 +5188,12 @@ sharding_ring:
     # CLI flag: -compactor.ring.multi.mirror-timeout
     [mirror_timeout: <duration> | default = 2s]

-  # Period at which to heartbeat to the ring.
+  # Period at which to heartbeat to the ring. 0 = disabled.
   # CLI flag: -compactor.ring.heartbeat-period
   [heartbeat_period: <duration> | default = 5s]

   # The heartbeat timeout after which compactors are considered unhealthy within
-  # the ring.
+  # the ring. 0 = never (timeout disabled).
   # CLI flag: -compactor.ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]

@@ -5195,6 +5209,10 @@ sharding_ring:
   # Name of network interface to read address from.
   # CLI flag: -compactor.ring.instance-interface-names
   [instance_interface_names: <list of string> | default = [eth0 en0]]
+
+  # Timeout for waiting on compactor to become ACTIVE in the ring.
+  # CLI flag: -compactor.ring.wait-active-instance-timeout
+  [wait_active_instance_timeout: <duration> | default = 10m]
 ```

 ### `store_gateway_config`
@@ -5248,13 +5266,13 @@ sharding_ring:
     # CLI flag: -store-gateway.sharding-ring.multi.mirror-timeout
     [mirror_timeout: <duration> | default = 2s]

-  # Period at which to heartbeat to the ring.
+  # Period at which to heartbeat to the ring. 0 = disabled.
  # CLI flag: -store-gateway.sharding-ring.heartbeat-period
   [heartbeat_period: <duration> | default = 15s]

   # The heartbeat timeout after which store gateways are considered unhealthy
-  # within the ring. This option needs be set both on the store-gateway and
-  # querier when running in microservices mode.
+  # within the ring. 0 = never (timeout disabled). This option needs to be set both
+  # on the store-gateway and querier when running in microservices mode.
   # CLI flag: -store-gateway.sharding-ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]
diff --git a/docs/configuration/v1-guarantees.md b/docs/configuration/v1-guarantees.md
index 8c87104a22..3af5dd4cd1 100644
--- a/docs/configuration/v1-guarantees.md
+++ b/docs/configuration/v1-guarantees.md
@@ -81,3 +81,17 @@ Currently experimental features are:
 - user config size (`-alertmanager.max-config-size-bytes`)
 - templates count in user config (`-alertmanager.max-templates-count`)
 - max template size (`-alertmanager.max-template-size-bytes`)
+- Disabling ring heartbeat timeouts
+  - `-distributor.ring.heartbeat-timeout=0`
+  - `-ring.heartbeat-timeout=0`
+  - `-ruler.ring.heartbeat-timeout=0`
+  - `-alertmanager.sharding-ring.heartbeat-timeout=0`
+  - `-compactor.ring.heartbeat-timeout=0`
+  - `-store-gateway.sharding-ring.heartbeat-timeout=0`
+- Disabling ring heartbeats
+  - `-distributor.ring.heartbeat-period=0`
+  - `-ingester.heartbeat-period=0`
+  - `-ruler.ring.heartbeat-period=0`
+  - `-alertmanager.sharding-ring.heartbeat-period=0`
+  - `-compactor.ring.heartbeat-period=0`
+  - `-store-gateway.sharding-ring.heartbeat-period=0`
diff --git a/docs/guides/security.md b/docs/guides/security.md
new file mode 100644
index 0000000000..96592715d3
--- /dev/null
+++ b/docs/guides/security.md
@@ -0,0 +1,12 @@
+---
+title: "Security"
+linkTitle: "Security"
+weight: 10
+slug: security
+---
+
+Cortex must be deployed with due care over system configuration, using principles such as "least privilege" to limit any exposure due to flaws in the source code.
+
+You must configure authorisation and authentication externally to Cortex; see [this guide](./authentication-and-authorisation.md).
+
+Information about security disclosures and mailing lists is [in the main repo](https://github.com/cortexproject/cortex/blob/master/SECURITY.md).
diff --git a/docs/proposals/block-storage-time-series-deletion.md b/docs/proposals/block-storage-time-series-deletion.md
new file mode 100644
index 0000000000..20d7a346e5
--- /dev/null
+++ b/docs/proposals/block-storage-time-series-deletion.md
@@ -0,0 +1,255 @@
+---
+title: "Time Series Deletion from Blocks Storage"
+linkTitle: "Time Series Deletion from Blocks Storage"
+weight: 1
+slug: block-storage-time-series-deletion
+---
+
+- Author: [Ilan Gofman](https://github.com/ilangofman)
+- Date: June 2021
+- Status: Proposal
+
+## Problem
+
+Currently, Cortex only implements a time series deletion API for chunk storage. We present a design for implementing time series deletion with block storage. We would like to have the same API for deleting series as currently implemented in Prometheus and in Cortex with chunk storage.
+
+This can be very important for users, as confidential or accidentally pushed data might need to be removed, and it also allows removing high-cardinality data that is causing inefficient queries.
+
+## Related works
+
+As previously mentioned, the deletion feature is already implemented with chunk storage.
The main functionality is implemented through the purger service. It accepts requests for deletion and processes them. At first, when a deletion request is made, a tombstone is created. This is used to filter out the data for queries. After some time, a deletion plan is executed where the data is permanently removed from chunk storage.
+
+More information can be found here:
+
+- [Cortex documentation for chunk store deletion](https://cortexmetrics.io/docs/guides/deleting-series/)
+- [Chunk deletion proposal](https://docs.google.com/document/d/1PeKwP3aGo3xVrR-2qJdoFdzTJxT8FcAbLm2ew_6UQyQ/edit)
+
+## Background on current storage
+
+With a block-storage configuration, Cortex stores data that could potentially be deleted by a user in:
+
+- Object store (GCS, S3, etc.) for long-term storage of blocks
+- Ingesters, for more recent data that should eventually be transferred to the object store
+- Cache
+  - Index cache
+  - Metadata cache
+  - Chunks cache (stores the potentially to-be-deleted data)
+  - Query results cache (stores the potentially to-be-deleted data)
+- Compactor, during the compaction process
+- Store-gateway
+
+## Proposal
+
+The deletion will not happen right away. Initially, the data will be filtered out from queries using tombstones and will be deleted afterward. This will allow the user some time to cancel the delete request.
+
+### API Endpoints
+
+The existing purger service will be used to process the incoming requests for deletion. The API will follow the same structure as the chunk storage endpoints for deletion, which is also based on the Prometheus deletion API.
+
+This will enable the following endpoints for Cortex when using block storage:
+
+`POST /api/v1/admin/tsdb/delete_series` - Accepts a [Prometheus-style delete request](https://prometheus.io/docs/prometheus/latest/querying/api/#delete-series) for deleting series.
+
+Parameters:
+
+- `start=<rfc3339 | unix_timestamp>`
+  - Optional. If not provided, will be set to the minimum possible time.
+- `end=<rfc3339 | unix_timestamp>`
+  - Optional. If not provided, will be set to the maximum possible time (the time when the request was made). The end time cannot be greater than the current UTC time.
+- `match[]=<series_selector>`
+  - Cannot be empty; must contain at least one label matcher argument.
+
+`POST /api/v1/admin/tsdb/cancel_delete_request` - Cancels a request if it has not yet been processed for permanent deletion. This can only be done before the `-purger.delete-request-cancel-period` has passed.
+
+Parameters:
+
+- `request_id`
+
+`GET /api/v1/admin/tsdb/delete_series` - Get all delete request IDs and their current status.
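+As an illustration, a deletion request could be issued as in the following sketch (the host, port, and tenant ID are placeholders; `X-Scope-OrgID` is Cortex's standard multi-tenancy header):
+
+```go
+package main
+
+import (
+	"fmt"
+	"net/http"
+	"net/url"
+)
+
+func main() {
+	// At least one matcher is required; start/end accept RFC3339 or Unix timestamps.
+	params := url.Values{}
+	params.Add("match[]", `{__name__="node_cpu_seconds_total",instance="10.0.0.1:9100"}`)
+	params.Add("start", "2021-05-01T00:00:00Z")
+	params.Add("end", "2021-05-02T00:00:00Z")
+
+	req, err := http.NewRequest(http.MethodPost,
+		"http://cortex.example:8080/api/v1/admin/tsdb/delete_series?"+params.Encode(), nil)
+	if err != nil {
+		panic(err)
+	}
+	// Identify the tenant whose series should be deleted.
+	req.Header.Set("X-Scope-OrgID", "team-a")
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		panic(err)
+	}
+	defer resp.Body.Close()
+	fmt.Println("status:", resp.Status)
+}
+```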
+Prometheus also implements a [clean_tombstones](https://prometheus.io/docs/prometheus/latest/querying/api/#clean-tombstones) API, which is not included in this proposal. The tombstones will be deleted automatically once the permanent deletion has taken place, which is described in the section below. By default, this should take approximately 24 hours.
+
+### Deletion Lifecycle
+
+The deletion request lifecycle can follow these 3 states:
+
+1. Pending - Tombstone file is created. During this state, the queriers will be performing query time filtering. During the initial time period configured by `-purger.delete-request-cancel-period`, no data will be deleted. Once this period is over, permanent deletion processing will begin and the request is no longer cancellable.
+2. Processed - All requested data has been deleted. Initially, queriers will still need to perform query time filtering while waiting for the bucket index and store-gateway to pick up the new blocks. Once that period has passed, query time filtering will no longer be required.
+3. Deleted - The deletion request was cancelled. A grace period configured by `-purger.delete-request-cancel-period` will allow the user some time to cancel the deletion request if it was made by mistake. The request is no longer cancellable after this period has passed.
+
+### Filtering data during queries while not yet deleted:
+
+Once a deletion request is received, a tombstone entry will be created. The object store (such as S3, GCS, or Azure storage) can be used to store all the deletion requests. See the section below for more detail on how the tombstones will be stored. Using the tombstones, the querier will be able to filter the to-be-deleted data initially. If a cancel delete request is made, then the tombstone file will be deleted. In addition, the existing cache will be invalidated using cache generation numbers, which are described in the later sections.
+
+The compactor's _BlocksCleaner_ service will scan for new tombstone files and will update the bucket-index with the tombstone information regarding the deletion requests. This will enable the querier to periodically check the bucket index for any new tombstone files that are required to be used for filtering. One drawback of this approach is the time it could take to start filtering the data: the compactor updates the bucket index with the new tombstones every `-compactor.cleanup-interval` (default 15 min), and the cached bucket index is refreshed in the querier every `-blocks-storage.bucket-store.sync-interval` (default 15 min), so with the default values it could take almost 30 minutes for queriers to start filtering deleted data. If the information requested for deletion is confidential/classified, the time delay is something that the user should be aware of, in addition to the time that the data has already been in Cortex.
+
+An additional thing to consider is that the bucket-index would have to be enabled for this API to work. Since the plan is to make the bucket-index mandatory in the future for block storage, this shouldn't be an issue.
+
+Similar to the chunk storage deletion implementation, the initial filtering of the deleted data will be done inside the Querier. This will allow filtering the data read from both the store gateway and the ingester. This functionality already exists for the chunk storage implementation. By implementing it in the querier, the ruler will be supported too (the ruler internally runs the querier).
+
+#### Storing tombstones in object store
+
+The Purger will write the new tombstone entries in a separate folder called `tombstones` in the object store (e.g. S3 bucket) in the respective tenant folder. Each tombstone can have a separate JSON file outlining all the necessary information about the deletion request, such as the parameters passed in the request, as well as some metadata such as the creation date of the file. The name of the file can be a hash of the API parameters (start, end, matchers). This way, if a user calls the API twice by accident with the same parameters, it will only create one tombstone. To keep track of the request state, filename extensions can be used. This will allow the tombstone files to be immutable. The 3 different file extensions will be `pending, processed, deleted`.
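+To make the naming scheme concrete, here is a sketch of how a tombstone object path could be derived (the hash construction and helper names are illustrative, not part of the proposal):
+
+```go
+package tombstones
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"strings"
+)
+
+// objectPath builds the object-store key for a tombstone, e.g.
+// /<tenantId>/tombstones/<request_id>.json.pending. Because the request ID
+// is a hash of the API parameters, repeating an identical request maps to
+// the same tombstone instead of creating a duplicate.
+func objectPath(tenantID string, startMs, endMs int64, matchers []string, state string) string {
+	h := sha256.Sum256([]byte(fmt.Sprintf("%d:%d:%s", startMs, endMs, strings.Join(matchers, ","))))
+	requestID := hex.EncodeToString(h[:16])
+	return fmt.Sprintf("/%s/tombstones/%s.json.%s", tenantID, requestID, state)
+}
+```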
Each time the deletion request moves to a new state, a new file will be added with the same deletion information but a different extension to indicate the new state. The file containing the previous state will be deleted once the new one is created. If a deletion request is cancelled, then a tombstone file with the `.deleted` filename extension will be created.
+
+When it is determined that the request should move to the next state, it will first write a new file containing the tombstone information to the object store. The information inside the file will be the same except for the `stateCreationTime`, which is replaced with the current timestamp. The extension of the new file will be different, to reflect the new state. If the new file is successfully written, the file with the previous state is deleted. If the write of the new file fails, then the previous file is not deleted. The next time the service runs to check the state of each tombstone, it will retry creating the new file with the updated state. If the write is successful but the deletion of the old file is unsuccessful, then there will be 2 tombstone files with the same filename but different extensions. When `BlocksCleaner` writes the tombstones to the bucket index, the compactor will check for duplicate tombstone files with different extensions. It will use the tombstone with the most recently updated state and try to delete the file with the older state. There could be a scenario where there are two files with the same request ID but different extensions: {`.pending`, `.processed`} or {`.pending`, `.deleted`}. In this case, the `.processed` or `.deleted` file will be selected, as it is always the later state compared to the `pending` state.
+
+The tombstone will be stored in a single JSON file per request and state:
+
+- `/<tenantId>/tombstones/<request_id>.json.<state>`
+
+The schema of the JSON file is:
+
+```
+{
+  "requestId": <string>,
+  "startTime": <int>,
+  "endTime": <int>,
+  "requestCreationTime": <int>,
+  "stateCreationTime": <int>,
+  "matchers": [
+    "<string matcher 1>",
+    ...,
+    "<string matcher n>"
+  ],
+  "userID": <string>
+}
+```
+
+Pros:
+
+- Allows deletion and un-delete to be done in a single operation.
+
+Cons:
+
+- Negative impact on query performance when there are active tombstones. As in the chunk storage implementation, all the series will have to be compared to the matchers contained in the active tombstone files. The impact on performance should be the same as the deletion would have with chunk storage.
+- With the default configuration, a potential 30-minute wait before the data begins to be filtered out of queries.
+
+#### Invalidating cache
+
+With block storage, the different caches available are:
+
+- Index cache
+- Metadata cache
+- Chunks cache (stores the potentially to-be-deleted chunks of data)
+- Query results cache (stores the potentially to-be-deleted data)
+
+There are two caches that could contain deleted data: the chunks cache and the query results cache. Using the tombstones, the queriers filter out the data received from the ingesters and store-gateway. Any cache not processed through the querier needs to be invalidated to prevent deleted data from coming up in queries.
+
+Firstly, the query results cache needs to be invalidated for each new delete request, or a cancellation of one. This can be accomplished by utilizing cache generation numbers. For each tenant, their cache is prefixed with a cache generation number. When the query front-end discovers a cache generation number that is greater than the previous generation number, it knows to invalidate the query results cache. However, the cache can only be invalidated once the queriers have loaded the tombstones from the bucket index and have begun filtering the data. Otherwise, to-be-deleted data might show up in queries and be cached again. One way to guarantee that all the queriers are using the new tombstones is to wait until the bucket index staleness period has passed from the time the tombstones were written to the bucket index. The staleness period can be configured using the following flag: `-blocks-storage.bucket-store.bucket-index.max-stale-period`. We can use the bucket index staleness period as the delay to wait before the cache generation number is increased. A query will fail inside the querier if the bucket index's last update is older than the staleness period. Once this period is over, all the queriers should have the updated tombstones, and the query results cache can be invalidated. Here is the proposed method for accomplishing this:
+
+- The cache generation number will be a timestamp. The default value will be 0.
+- The bucket index will store the cache generation number. The query front-end will periodically fetch the bucket index.
+- Inside the compactor, the _BlocksCleaner_ will load the tombstones from the object store and update the bucket index accordingly. It will calculate the cache generation number by iterating through all the tombstones and their respective times (next bullet point) and selecting the maximum timestamp that is less than (current time minus `-blocks-storage.bucket-store.bucket-index.max-stale-period`). This means that if a deletion request is made or cancelled, the compactor will only update the cache generation number once the staleness period is over, ensuring that all queriers have the updated tombstones.
+- For requests in a pending or processed state, the `requestCreationTime` will be used when comparing the maximum timestamps. If a request is in a deleted state, the `stateCreationTime` will be used instead. This means that the cache gets invalidated only once a request has been created or deleted and the bucket index staleness period has passed. The cache will not be invalidated again when a request advances from the pending to the processed state.
+- The query front-end will fetch the cache generation number from the bucket index and compare it to the current cache generation number stored in the front-end. If the cache generation number from the front-end is less than the one from the bucket index, then the cache is invalidated.
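+To illustrate the comparison the query front-end performs, a sketch (type and field names are illustrative):
+
+```go
+package frontend
+
+// tenantCacheState tracks the cache generation number currently used to
+// prefix a tenant's query results cache keys. A generation of 0 means no
+// deletion-driven invalidation has happened yet.
+type tenantCacheState struct {
+	current int64
+}
+
+// maybeInvalidate compares the generation advertised by the bucket index
+// with the one currently in use. Switching to a newer generation changes
+// the cache key prefix, so all previously cached results for the tenant
+// are effectively invalidated (old entries are never read again and
+// eventually expire).
+func (s *tenantCacheState) maybeInvalidate(fromBucketIndex int64) bool {
+	if fromBucketIndex > s.current {
+		s.current = fromBucketIndex
+		return true
+	}
+	return false
+}
+```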
+As for the chunks cache, since it is retrieved from the store-gateway and passed to the querier, it will be filtered like the rest of the time series data in the querier using the tombstones, with the mechanism described in the previous section.
+
+### Permanently deleting the data
+
+The proposed approach is to perform the deletions from the compactor. A new background service inside the compactor, called _DeletedSeriesCleaner_, can be created and is responsible for executing the deletion.
+
+#### Processing
+
+This will happen after a grace period has passed once the API request has been made. By default, this should be 24 hours. A background task can be created to process the permanent deletion of time series. This background task can be executed each hour, as in the sketch below.
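+A minimal sketch of such an hourly loop (illustrative only; a real implementation would likely be wired into the compactor's existing service framework):
+
+```go
+package compactor
+
+import (
+	"context"
+	"time"
+)
+
+// runDeletedSeriesCleaner wakes up on every tick and processes tombstones
+// whose cancellation grace period has expired.
+func runDeletedSeriesCleaner(ctx context.Context, interval time.Duration, processTombstones func(context.Context) error) error {
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ticker.C:
+			// Errors are deliberately not fatal: unfinished deletions are
+			// retried on the next tick, since progress is tracked per block
+			// in the meta.json files.
+			_ = processTombstones(ctx)
+		case <-ctx.Done():
+			return ctx.Err()
+		}
+	}
+}
+```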
+To delete the data from the blocks, the same logic as the [Bucket Rewrite Tool](https://thanos.io/tip/components/tools.md/#bucket-rewrite) from Thanos can be leveraged. This tool does the following: `tools bucket rewrite rewrites chosen blocks in the bucket, while deleting or modifying series`. The tool itself is a CLI tool that we won't be using, but we can utilize the logic inside it. For more information about the way this tool runs, please see the code [here](https://github.com/thanos-io/thanos/blob/d8b21e708bee6d19f46ca32b158b0509ca9b7fed/cmd/thanos/tools_bucket.go#L809).
+
+The compactor's _DeletedSeriesCleaner_ will apply this logic on individual blocks, and each time it is run, it creates a new block without the data that matched the deletion request. The original individual blocks containing the data that was requested to be deleted need to be marked for deletion by the compactor.
+
+While deleting the data permanently from the block storage, the `meta.json` files will be used to keep track of the deletion progress. Inside each `meta.json` file, we will add a new field called `tombstonesFiltered`. This will store an array of deletion request IDs that were used to create this block. Once the rewrite logic is applied to a block, the new block's `meta.json` file will append the deletion request ID(s) used for the rewrite operation inside this field. This will let the _DeletedSeriesCleaner_ know that this block has already processed the particular deletion requests listed in this field. Assuming that deletion requests are quite rare, the size of the `meta.json` files should remain small.
+
+The _DeletedSeriesCleaner_ can iterate through all the blocks that the deletion request could apply to. For each of these blocks, if the deletion request ID isn't inside the `meta.json` `tombstonesFiltered` field, then the compactor can apply the rewrite logic to this block. If multiple tombstones are currently being processed for deletion and apply to a particular block, then the _DeletedSeriesCleaner_ will process them at the same time, to prevent additional blocks from being created. If, after iterating through all the blocks, it doesn't find any blocks requiring deletion, then the `Pending` state is complete and the request progresses to the `Processed` state.
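+For illustration, the proposed `meta.json` extension and the skip check could look roughly like this (a sketch; Thanos' actual block metadata struct differs):
+
+```go
+package compactor
+
+// blockMeta mirrors only the subset of a block's meta.json relevant here,
+// extended with the proposed tombstonesFiltered field.
+type blockMeta struct {
+	ULID               string   `json:"ulid"`
+	TombstonesFiltered []string `json:"tombstonesFiltered,omitempty"`
+}
+
+// needsRewrite reports whether a block still has to be rewritten for a given
+// deletion request, i.e. the request ID is not yet recorded in its meta.json.
+func needsRewrite(meta blockMeta, requestID string) bool {
+	for _, id := range meta.TombstonesFiltered {
+		if id == requestID {
+			return false
+		}
+	}
+	return true
+}
+```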
+One important thing to note regarding this rewrite tool is that it should not be used at the same time as another compactor is touching a block. If the tool is run at the same time as compaction on a particular block, it can cause overlap, and the data marked for deletion can already be part of the compacted block. To mitigate such issues, these are some of the proposed solutions:
+
+Option 1: Only apply the deletion once the blocks are in the final state of compaction.
+
+Pros:
+- Simpler implementation, as everything is contained within the DeletedSeriesCleaner.
+
+Cons:
+- Might have to wait for a longer period of time for the compaction to be finished.
+  - This would mean the earliest time to be able to run the deletion would be once the last time from the block_ranges in the [compactor_config](https://cortexmetrics.io/docs/blocks-storage/compactor/#compactor-configuration) has passed. By default this value is 24 hours, so only once 24 hours have passed and the new compacted blocks have been created can the rewrite be safely run.
+
+Option 2: For blocks that still need to be compacted further after the deletion request cancel period is over, the deletion logic can be applied before the blocks are compacted. This will generate a new block, which can then be used instead for compaction with other blocks.
+
+Pros:
+- The deletion can be applied earlier than in Option 1.
+  - This only applies if the deletion request cancel period is less than the last compaction time interval.
+
+Cons:
+- Added coupling between the compaction and the DeletedSeriesCleaner.
+- Might block compaction for a short time while doing the deletion.
+
+Once all the applicable blocks have been rewritten without the deleted data, the deletion request state moves to the `Processed` state. Once in this state, the queriers will still have to perform query time filtering using the tombstones until the old blocks that were marked for deletion are no longer queried by the queriers. This means that the query time filtering will last for an additional length of `-compactor.deletion-delay + -compactor.cleanup-interval + -blocks-storage.bucket-store.sync-interval` in the `Processed` state. Once that time period has passed, the queriers should no longer be querying any of the old blocks that were marked for deletion. The tombstone will no longer be used after this.
+
+#### Cancelled Delete Requests
+
+If a request was successfully cancelled, then a tombstone file with a `.deleted` extension is created. This is done to help ensure that the cache generation number is updated and the query results cache is invalidated. The compactor's blocks cleaner can take care of cleaning up `.deleted` tombstones after a period of time once they are no longer required for cache invalidation. This can be done after 10 times the bucket index max staleness time period has passed. Before removing the file from the object store, the current cache generation number must be greater than or equal to the time when the tombstone was cancelled.
+
+#### Handling failed/unfinished delete jobs:
+
+Deletions will be completed, and the tombstones will be deleted, only when the DeletedSeriesCleaner iterates over all blocks that match the time interval and confirms that they have been rewritten without the deleted data. Otherwise, it will keep iterating over the blocks and process the blocks that haven't been rewritten, according to the information in the `meta.json` file. In case of any failure that causes the deletion to stop, any unfinished deletions will be resumed once the service is restarted. If the block rewrite was not completed on a particular block, then the original block will not be marked for deletion. The compactor will continue to iterate over the blocks and process the block again.
+
+#### Tenant Deletion API
+
+If a request is made to delete a tenant, then all the tombstones will be deleted for that user.
+
+## Current Open Questions:
+
+- If the start and end times are very far apart, it might result in a lot of the data being rewritten. Since we create a new block without the deleted data and mark the old one for deletion, there may be a period of time with lots of extra blocks and space used for large deletion requests.
+- There will be a delay between the deletion request and the deleted data being filtered during queries.
+  - In Prometheus, there is no delay.
+  - One way to filter immediately is to load the tombstones at query time, but this would cause a negative performance impact.
+- Adding limits to the API, such as:
+  - Max number of deletion requests allowed in the last 24 hours for a given tenant.
+  - Max number of pending tombstones for a given tenant.
+
+## Alternatives Considered
+
+#### Adding a Pre-processing State
+
+The process of permanently deleting the data can be separated into 2 stages: pre-processing and processing.
+
+Pre-processing will begin after the `-purger.delete-request-cancel-period` has passed since the API request was made. The deletion request will move to a new state called `BuildingPlan`. The compactor will outline all the blocks that may contain data to be deleted. For each separate block that the deletion may be applicable to, the compactor will begin the process by adding a series deletion marker inside the series-deletion-marker.json file. The JSON file will contain an array of deletion request IDs that need to be applied to the block, which handles the situation where multiple tombstones are applicable to a particular block. Then, during the processing step, instead of checking the meta.json file, we only need to check whether a marker file exists with a specific deletion request ID. If the marker file exists, then we apply the rewrite logic, as sketched below.
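+As an illustration, the marker file contents could be as simple as the following sketch (the schema is hypothetical):
+
+```go
+package compactor
+
+// seriesDeletionMarker sketches the proposed per-block
+// series-deletion-marker.json: the deletion requests that still have to be
+// applied to the block during the processing step.
+type seriesDeletionMarker struct {
+	DeletionRequestIDs []string `json:"deletionRequestIds"`
+}
+```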
+#### Alternative Permanent Deletion Processing
+
+For processing the actual deletions, an alternative approach is to not wait until the final compaction has been completed, and instead filter out the data during compaction. If the data is marked to be deleted, then don't include it in the new bigger block during compaction. For the remaining blocks where the data wasn't filtered during compaction, the deletion can be done the same way as in the previous section.
+
+Pros:
+
+- The deletion can happen sooner.
+- The rewrite tool creates additional blocks; by filtering the metrics during compaction, the intermediary rewritten block is avoided.
+
+Cons:
+
+- A more complicated implementation, requiring more logic in the compactor.
+- Slower compaction if it needs to filter all the data.
+- Need to manage which blocks should be deleted with the rewrite vs. which blocks already had data filtered during compaction.
+- Would need to run the rewrite logic both during and outside of compaction, because some blocks that might need to be deleted are already in the final compaction state. This would mean the deletion functionality has to be implemented in multiple places.
+- Wouldn't leverage the rewrite tooling from Thanos for all deletions, so potentially more work is duplicated.
diff --git a/integration/integration_memberlist_single_binary_test.go b/integration/integration_memberlist_single_binary_test.go
index 53c1a945fd..f4cd7b7914 100644
--- a/integration/integration_memberlist_single_binary_test.go
+++ b/integration/integration_memberlist_single_binary_test.go
@@ -23,15 +23,21 @@ import (
 func TestSingleBinaryWithMemberlist(t *testing.T) {
 	t.Run("default", func(t *testing.T) {
-		testSingleBinaryEnv(t, false)
+		testSingleBinaryEnv(t, false, nil)
 	})

 	t.Run("tls", func(t *testing.T) {
-		testSingleBinaryEnv(t, true)
+		testSingleBinaryEnv(t, true, nil)
+	})
+
+	t.Run("compression-disabled", func(t *testing.T) {
+		testSingleBinaryEnv(t, false, map[string]string{
+			"-memberlist.compression-enabled": "false",
+		})
 	})
 }

-func testSingleBinaryEnv(t *testing.T, tlsEnabled bool) {
+func testSingleBinaryEnv(t *testing.T, tlsEnabled bool, flags map[string]string) {
 	s, err := e2e.NewScenario(networkName)
 	require.NoError(t, err)
 	defer s.Close()
@@ -65,13 +71,13 @@ func testSingleBinaryEnv(t *testing.T, tlsEnabled bool) {
 			filepath.Join(s.SharedDir(), clientKeyFile),
 		))

-		cortex1 = newSingleBinary("cortex-1", memberlistDNS, "")
-		cortex2 = newSingleBinary("cortex-2", memberlistDNS, networkName+"-cortex-1:8000")
-		cortex3 = newSingleBinary("cortex-3", memberlistDNS, networkName+"-cortex-1:8000")
+		cortex1 = newSingleBinary("cortex-1", memberlistDNS, "", flags)
+		cortex2 = newSingleBinary("cortex-2", memberlistDNS, networkName+"-cortex-1:8000", flags)
+		cortex3 = newSingleBinary("cortex-3", memberlistDNS, networkName+"-cortex-1:8000", flags)
 	} else {
-		cortex1 = newSingleBinary("cortex-1", "", "")
-		cortex2 = newSingleBinary("cortex-2", "", networkName+"-cortex-1:8000")
-		cortex3 = newSingleBinary("cortex-3", "", networkName+"-cortex-1:8000")
+		cortex1 = newSingleBinary("cortex-1", "", "", flags)
+		cortex2 = newSingleBinary("cortex-2", "", networkName+"-cortex-1:8000", flags)
+		cortex3 = newSingleBinary("cortex-3", "", networkName+"-cortex-1:8000", flags)
 	}

 	// start cortex-1 first, as cortex-2 and cortex-3 both connect to cortex-1
@@ -109,7 +115,7 @@ func testSingleBinaryEnv(t *testing.T, tlsEnabled bool) {
 	require.NoError(t, s.Stop(cortex3))
 }

-func newSingleBinary(name string, servername string, join string) *e2ecortex.CortexService {
+func newSingleBinary(name string, servername string, join string, testFlags map[string]string) *e2ecortex.CortexService {
 	flags := map[string]string{
 		"-ingester.final-sleep": "0s",
 		"-ingester.join-after":  "0s", // join quickly
@@ -132,6 +138,7 @@ func newSingleBinary(name string, servername string, join string) *e2ecortex.CortexService {
 		mergeFlags(
 			ChunksStorageFlags(),
 			flags,
+			testFlags,
 			getTLSFlagsWithPrefix("memberlist", servername, servername == ""),
 		),
 		"",
@@ -170,7 +177,7 @@ func TestSingleBinaryWithMemberlistScaling(t *testing.T) {
 		if i > 0 {
 			join = e2e.NetworkContainerHostPort(networkName, "cortex-1", 8000)
 		}
-		c := newSingleBinary(name, "", join)
+		c := newSingleBinary(name, "", join, nil)
 		require.NoError(t, s.StartAndWaitReady(c))
 		instances = append(instances, c)
 	}
diff --git a/pkg/alertmanager/alertmanager_ring.go b/pkg/alertmanager/alertmanager_ring.go
index 04d08d2d65..0c60d0f69c 100644
--- a/pkg/alertmanager/alertmanager_ring.go
+++ b/pkg/alertmanager/alertmanager_ring.go
@@ -76,8 +76,8 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
 	// Ring flags
 	cfg.KVStore.RegisterFlagsWithPrefix(rfprefix, "alertmanagers/", f)
-	f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring.")
-	f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring.")
+	f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
+	f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring. 0 = never (timeout disabled).")
 	f.IntVar(&cfg.ReplicationFactor, rfprefix+"replication-factor", 3, "The replication factor to use when sharding the alertmanager.")
 	f.BoolVar(&cfg.ZoneAwarenessEnabled, rfprefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate alerts across different availability zones.")
diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go
index dc60301eb8..285914b99b 100644
--- a/pkg/alertmanager/multitenant_test.go
+++ b/pkg/alertmanager/multitenant_test.go
@@ -1325,18 +1325,15 @@ func TestMultitenantAlertmanager_SyncOnRingTopologyChanges(t *testing.T) {
 				return ringDesc, true, nil
 			}))

-			// Assert if we expected a sync or not.
+			// Assert whether we expected an additional sync or not.
+			expectedSyncs := 1
 			if tt.expected {
-				test.Poll(t, time.Second, float64(2), func() interface{} {
-					metrics := regs.BuildMetricFamiliesPerUser()
-					return metrics.GetSumOfCounters("cortex_alertmanager_sync_configs_total")
-				})
-			} else {
-				time.Sleep(250 * time.Millisecond)
-
-				metrics := regs.BuildMetricFamiliesPerUser()
-				assert.Equal(t, float64(1), metrics.GetSumOfCounters("cortex_alertmanager_sync_configs_total"))
+				expectedSyncs++
 			}
+			test.Poll(t, 5*time.Second, float64(expectedSyncs), func() interface{} {
+				metrics := regs.BuildMetricFamiliesPerUser()
+				return metrics.GetSumOfCounters("cortex_alertmanager_sync_configs_total")
+			})
 		})
 	}
 }
@@ -1820,7 +1817,14 @@ func TestAlertmanager_StateReplicationWithSharding_InitialSyncFromPeers(t *testing.T) {
 	{
 		metrics := registries.BuildMetricFamiliesPerUser()
 		assert.Equal(t, float64(1), metrics.GetSumOfGauges("cortex_alertmanager_silences"))
-		assert.Equal(t, float64(1), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
+	}
+	// 2.c. Wait for the silence replication to be attempted; note this is asynchronous.
+	{
+		test.Poll(t, 5*time.Second, float64(1), func() interface{} {
+			metrics := registries.BuildMetricFamiliesPerUser()
+			return metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total")
+		})
+
 		metrics := registries.BuildMetricFamiliesPerUser()
 		assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
 	}
diff --git a/pkg/compactor/compactor.go b/pkg/compactor/compactor.go
index c41f7a9ccc..8dddac33ec 100644
--- a/pkg/compactor/compactor.go
+++ b/pkg/compactor/compactor.go
@@ -44,7 +44,10 @@ var (
 	errInvalidBlockRanges = "compactor block range periods should be divisible by the previous one, but %s is not divisible by %s"
 	RingOp                = ring.NewOp([]ring.InstanceState{ring.ACTIVE}, nil)

-	DefaultBlocksGrouperFactory = func(ctx context.Context, cfg Config, bkt objstore.Bucket, logger log.Logger, reg prometheus.Registerer, blocksMarkedForDeletion prometheus.Counter, garbageCollectedBlocks prometheus.Counter) compact.Grouper {
+	supportedShardingStrategies = []string{util.ShardingStrategyDefault, util.ShardingStrategyShuffle}
+	errInvalidShardingStrategy  = errors.New("invalid sharding strategy")
+
+	DefaultBlocksGrouperFactory = func(ctx context.Context, cfg Config, bkt objstore.Bucket, logger log.Logger, reg prometheus.Registerer, blocksMarkedForDeletion prometheus.Counter, garbageCollectedBlocks prometheus.Counter, _ prometheus.Gauge) compact.Grouper {
 		return compact.NewDefaultGrouper(
 			logger,
 			bkt,
@@ -56,6 +59,20 @@ var (
 			metadata.NoneFunc)
 	}

+	ShuffleShardingGrouperFactory = func(ctx context.Context, cfg Config, bkt objstore.Bucket, logger log.Logger, reg prometheus.Registerer, blocksMarkedForDeletion prometheus.Counter, garbageCollectedBlocks prometheus.Counter, remainingPlannedCompactions prometheus.Gauge) compact.Grouper {
+		return NewShuffleShardingGrouper(
+			logger,
+			bkt,
+			false, // Do not accept malformed indexes
+			true,  // Enable vertical compaction
+			reg,
+			blocksMarkedForDeletion,
+			garbageCollectedBlocks,
+			remainingPlannedCompactions,
+			metadata.NoneFunc,
+			cfg)
+	}
+
 	DefaultBlocksCompactorFactory = func(ctx context.Context, cfg Config, logger log.Logger, reg prometheus.Registerer) (compact.Compactor, compact.Planner, error) {
 		compactor, err := tsdb.NewLeveledCompactor(ctx, reg, logger, cfg.BlockRanges.ToMilliseconds(), downsample.NewPool())
 		if err != nil {
@@ -65,6 +82,16 @@ var (
 		planner := compact.NewTSDBBasedPlanner(logger, cfg.BlockRanges.ToMilliseconds())
 		return compactor, planner, nil
 	}
+
+	ShuffleShardingBlocksCompactorFactory = func(ctx context.Context, cfg Config, logger log.Logger, reg prometheus.Registerer) (compact.Compactor, compact.Planner, error) {
+		compactor, err := tsdb.NewLeveledCompactor(ctx, reg, logger, cfg.BlockRanges.ToMilliseconds(), downsample.NewPool())
+		if err != nil {
+			return nil, nil, err
+		}
+
+		planner := NewShuffleShardingPlanner(logger, cfg.BlockRanges.ToMilliseconds())
+		return compactor, planner, nil
+	}
 )

 // BlocksGrouperFactory builds and returns the grouper to use to compact a tenant's blocks.
@@ -76,6 +103,7 @@ type BlocksGrouperFactory func(
 	reg prometheus.Registerer,
 	blocksMarkedForDeletion prometheus.Counter,
 	garbageCollectedBlocks prometheus.Counter,
+	remainingPlannedCompactions prometheus.Gauge,
 ) compact.Grouper

 // BlocksCompactorFactory builds and returns the compactor and planner to use to compact a tenant's blocks.
@@ -108,8 +136,9 @@ type Config struct {
 	DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`

 	// Compactors sharding.
-	ShardingEnabled bool       `yaml:"sharding_enabled"`
-	ShardingRing    RingConfig `yaml:"sharding_ring"`
+	ShardingEnabled  bool       `yaml:"sharding_enabled"`
+	ShardingStrategy string     `yaml:"sharding_strategy"`
+	ShardingRing     RingConfig `yaml:"sharding_ring"`

 	// No need to add options to customize the retry backoff,
 	// given the defaults should be fine, but allow to override
@@ -141,6 +170,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 	f.DurationVar(&cfg.CleanupInterval, "compactor.cleanup-interval", 15*time.Minute, "How frequently compactor should run blocks cleanup and maintenance, as well as update the bucket index.")
 	f.IntVar(&cfg.CleanupConcurrency, "compactor.cleanup-concurrency", 20, "Max number of tenants for which blocks cleanup and maintenance should run concurrently.")
 	f.BoolVar(&cfg.ShardingEnabled, "compactor.sharding-enabled", false, "Shard tenants across multiple compactor instances. Sharding is required if you run multiple compactor instances, in order to coordinate compactions and avoid race conditions leading to the same tenant blocks simultaneously compacted by different instances.")
+	f.StringVar(&cfg.ShardingStrategy, "compactor.sharding-strategy", util.ShardingStrategyDefault, fmt.Sprintf("The sharding strategy to use. Supported values are: %s.", strings.Join(supportedShardingStrategies, ", ")))
 	f.DurationVar(&cfg.DeletionDelay, "compactor.deletion-delay", 12*time.Hour, "Time before a block marked for deletion is deleted from bucket. "+
 		"If not 0, blocks will be marked for deletion and compactor component will permanently delete blocks marked for deletion from the bucket. "+
 		"If 0, blocks will be deleted straight away. Note that deleting blocks immediately can cause query failures.")
@@ -159,6 +189,11 @@ func (cfg *Config) Validate() error {
 		}
 	}

+	// Make sure a valid sharding strategy is being used
+	if !util.StringsContain(supportedShardingStrategies, cfg.ShardingStrategy) {
+		return errInvalidShardingStrategy
+	}
+
 	return nil
 }

@@ -217,6 +252,7 @@ type Compactor struct {
 	compactionRunInterval   prometheus.Gauge
 	blocksMarkedForDeletion prometheus.Counter
 	garbageCollectedBlocks  prometheus.Counter
+	remainingPlannedCompactions prometheus.Gauge

 	// TSDB syncer metrics
 	syncerMetrics *syncerMetrics
@@ -230,12 +266,20 @@ func NewCompactor(compactorCfg Config, storageCfg cortex_tsdb.BlocksStorageConfig,
 	blocksGrouperFactory := compactorCfg.BlocksGrouperFactory
 	if blocksGrouperFactory == nil {
-		blocksGrouperFactory = DefaultBlocksGrouperFactory
+		if compactorCfg.ShardingStrategy == util.ShardingStrategyShuffle {
+			blocksGrouperFactory = ShuffleShardingGrouperFactory
+		} else {
+			blocksGrouperFactory = DefaultBlocksGrouperFactory
+		}
 	}

 	blocksCompactorFactory := compactorCfg.BlocksCompactorFactory
 	if blocksCompactorFactory == nil {
-		blocksCompactorFactory = DefaultBlocksCompactorFactory
+		if compactorCfg.ShardingStrategy == util.ShardingStrategyShuffle {
+			blocksCompactorFactory = ShuffleShardingBlocksCompactorFactory
+		} else {
+			blocksCompactorFactory = DefaultBlocksCompactorFactory
+		}
 	}

 	cortexCompactor, err := newCompactor(compactorCfg, storageCfg, cfgProvider, logger, registerer, bucketClientFactory, blocksGrouperFactory, blocksCompactorFactory)
@@ -256,6 +300,13 @@ func newCompactor(
 	blocksGrouperFactory BlocksGrouperFactory,
 	blocksCompactorFactory BlocksCompactorFactory,
 ) (*Compactor, error) {
+	var remainingPlannedCompactions prometheus.Gauge
+	if compactorCfg.ShardingStrategy == util.ShardingStrategyShuffle {
+		remainingPlannedCompactions =
promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
+			Name: "cortex_compactor_remaining_planned_compactions",
+			Help: "Total number of plans that remain to be compacted.",
+		})
+	}
 	c := &Compactor{
 		compactorCfg: compactorCfg,
 		storageCfg:   storageCfg,
@@ -314,6 +365,7 @@ func newCompactor(
 			Name: "cortex_compactor_garbage_collected_blocks_total",
 			Help: "Total number of blocks marked for deletion by compactor.",
 		}),
+		remainingPlannedCompactions: remainingPlannedCompactions,
 	}

 	if len(compactorCfg.EnabledTenants) > 0 {
@@ -393,7 +445,11 @@ func (c *Compactor) starting(ctx context.Context) error {
 	// users scanner depends on the ring (to check whether an user belongs
 	// to this shard or not).
 	level.Info(c.logger).Log("msg", "waiting until compactor is ACTIVE in the ring")
-	if err := ring.WaitInstanceState(ctx, c.ring, c.ringLifecycler.ID, ring.ACTIVE); err != nil {
+
+	ctxWithTimeout, cancel := context.WithTimeout(ctx, c.compactorCfg.ShardingRing.WaitActiveInstanceTimeout)
+	defer cancel()
+	if err := ring.WaitInstanceState(ctxWithTimeout, c.ring, c.ringLifecycler.ID, ring.ACTIVE); err != nil {
+		level.Error(c.logger).Log("msg", "compactor failed to become ACTIVE in the ring", "err", err)
 		return err
 	}
 	level.Info(c.logger).Log("msg", "compactor is ACTIVE in the ring")
@@ -645,7 +701,7 @@ func (c *Compactor) compactUser(ctx context.Context, userID string) error {
 	compactor, err := compact.NewBucketCompactor(
 		ulogger,
 		syncer,
-		c.blocksGrouperFactory(ctx, c.compactorCfg, bucket, ulogger, reg, c.blocksMarkedForDeletion, c.garbageCollectedBlocks),
+		c.blocksGrouperFactory(ctx, c.compactorCfg, bucket, ulogger, reg, c.blocksMarkedForDeletion, c.garbageCollectedBlocks, c.remainingPlannedCompactions),
 		c.blocksPlanner,
 		c.blocksCompactor,
 		path.Join(c.compactorCfg.DataDir, "compact"),
diff --git a/pkg/compactor/compactor_ring.go b/pkg/compactor/compactor_ring.go
index 74c3a77ad3..285a1fa6bc 100644
--- a/pkg/compactor/compactor_ring.go
+++ b/pkg/compactor/compactor_ring.go
@@ -34,6 +34,10 @@ type RingConfig struct {

 	// Injected internally
 	ListenPort int `yaml:"-"`
+
+	WaitActiveInstanceTimeout time.Duration `yaml:"wait_active_instance_timeout"`
+
+	ObservePeriod time.Duration `yaml:"-"`
 }

 // RegisterFlags adds the flags required to config this to the given FlagSet
@@ -46,8 +50,8 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
 	// Ring flags
 	cfg.KVStore.RegisterFlagsWithPrefix("compactor.ring.", "collectors/", f)
-	f.DurationVar(&cfg.HeartbeatPeriod, "compactor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.")
-	f.DurationVar(&cfg.HeartbeatTimeout, "compactor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which compactors are considered unhealthy within the ring.")
+	f.DurationVar(&cfg.HeartbeatPeriod, "compactor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
+	f.DurationVar(&cfg.HeartbeatTimeout, "compactor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which compactors are considered unhealthy within the ring. 0 = never (timeout disabled).")

 	// Wait stability flags.
 	f.DurationVar(&cfg.WaitStabilityMinDuration, "compactor.ring.wait-stability-min-duration", time.Minute, "Minimum time to wait for ring stability at startup. 0 to disable.")
0 to disable.") @@ -59,6 +63,9 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { f.StringVar(&cfg.InstanceAddr, "compactor.ring.instance-addr", "", "IP address to advertise in the ring.") f.IntVar(&cfg.InstancePort, "compactor.ring.instance-port", 0, "Port to advertise in the ring (defaults to server.grpc-listen-port).") f.StringVar(&cfg.InstanceID, "compactor.ring.instance-id", hostname, "Instance ID to register in the ring.") + + // Timeout durations + f.DurationVar(&cfg.WaitActiveInstanceTimeout, "compactor.ring.wait-active-instance-timeout", 10*time.Minute, "Timeout for waiting on compactor to become ACTIVE in the ring.") } // ToLifecyclerConfig returns a LifecyclerConfig based on the compactor @@ -87,7 +94,7 @@ func (cfg *RingConfig) ToLifecyclerConfig() ring.LifecyclerConfig { lc.InfNames = cfg.InstanceInterfaceNames lc.UnregisterOnShutdown = true lc.HeartbeatPeriod = cfg.HeartbeatPeriod - lc.ObservePeriod = 0 + lc.ObservePeriod = cfg.ObservePeriod lc.JoinAfter = 0 lc.MinReadyDuration = 0 lc.FinalSleep = 0 diff --git a/pkg/compactor/compactor_test.go b/pkg/compactor/compactor_test.go index 9c14cb0924..da1a83cead 100644 --- a/pkg/compactor/compactor_test.go +++ b/pkg/compactor/compactor_test.go @@ -1092,6 +1092,9 @@ func prepareConfig() Config { compactorCfg.ShardingRing.WaitStabilityMinDuration = 0 compactorCfg.ShardingRing.WaitStabilityMaxDuration = 0 + // Set lower timeout for waiting on compactor to become ACTIVE in the ring for unit tests + compactorCfg.ShardingRing.WaitActiveInstanceTimeout = 5 * time.Second + return compactorCfg } @@ -1279,3 +1282,33 @@ func TestCompactor_DeleteLocalSyncFiles(t *testing.T) { require.NotEqual(t, numUsers, c1Users) require.Equal(t, numUsers, c1Users+c2Users) } + +func TestCompactor_ShouldFailCompactionOnTimeout(t *testing.T) { + t.Parallel() + + // Mock the bucket + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", []string{}, nil) + + cfg := prepareConfig() + cfg.ShardingEnabled = true + cfg.ShardingRing.InstanceID = "compactor-1" + cfg.ShardingRing.InstanceAddr = "1.2.3.4" + cfg.ShardingRing.KVStore.Mock = consul.NewInMemoryClient(ring.GetCodec()) + + // Set ObservePeriod to longer than the timeout period to mock a timeout while waiting on ring to become ACTIVE + cfg.ShardingRing.ObservePeriod = time.Second * 10 + + c, _, _, logs, _ := prepare(t, cfg, bucketClient) + + // Try to start the compactor with a bad consul kv-store. 
The + err := services.StartAndAwaitRunning(context.Background(), c) + + // Assert that the compactor timed out + assert.Equal(t, context.DeadlineExceeded, err) + + assert.ElementsMatch(t, []string{ + `level=info component=compactor msg="waiting until compactor is ACTIVE in the ring"`, + `level=error component=compactor msg="compactor failed to become ACTIVE in the ring" err="context deadline exceeded"`, + }, removeIgnoredLogs(strings.Split(strings.TrimSpace(logs.String()), "\n"))) +} diff --git a/pkg/compactor/shuffle_sharding_grouper.go b/pkg/compactor/shuffle_sharding_grouper.go new file mode 100644 index 0000000000..2e90e87abe --- /dev/null +++ b/pkg/compactor/shuffle_sharding_grouper.go @@ -0,0 +1,386 @@ +package compactor + +import ( + "fmt" + "hash/fnv" + "sort" + "strings" + "time" + + "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" + "github.com/oklog/ulid" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/prometheus/pkg/labels" + "github.com/thanos-io/thanos/pkg/block/metadata" + "github.com/thanos-io/thanos/pkg/compact" + "github.com/thanos-io/thanos/pkg/objstore" +) + +type ShuffleShardingGrouper struct { + logger log.Logger + bkt objstore.Bucket + acceptMalformedIndex bool + enableVerticalCompaction bool + reg prometheus.Registerer + blocksMarkedForDeletion prometheus.Counter + garbageCollectedBlocks prometheus.Counter + remainingPlannedCompactions prometheus.Gauge + hashFunc metadata.HashFunc + compactions *prometheus.CounterVec + compactionRunsStarted *prometheus.CounterVec + compactionRunsCompleted *prometheus.CounterVec + compactionFailures *prometheus.CounterVec + verticalCompactions *prometheus.CounterVec + compactorCfg Config +} + +func NewShuffleShardingGrouper( + logger log.Logger, + bkt objstore.Bucket, + acceptMalformedIndex bool, + enableVerticalCompaction bool, + reg prometheus.Registerer, + blocksMarkedForDeletion prometheus.Counter, + garbageCollectedBlocks prometheus.Counter, + remainingPlannedCompactions prometheus.Gauge, + hashFunc metadata.HashFunc, + compactorCfg Config, +) *ShuffleShardingGrouper { + if logger == nil { + logger = log.NewNopLogger() + } + + return &ShuffleShardingGrouper{ + logger: logger, + bkt: bkt, + acceptMalformedIndex: acceptMalformedIndex, + enableVerticalCompaction: enableVerticalCompaction, + reg: reg, + blocksMarkedForDeletion: blocksMarkedForDeletion, + garbageCollectedBlocks: garbageCollectedBlocks, + remainingPlannedCompactions: remainingPlannedCompactions, + hashFunc: hashFunc, + compactions: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_compact_group_compactions_total", + Help: "Total number of group compaction attempts that resulted in a new block.", + }, []string{"group"}), + compactionRunsStarted: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_compact_group_compaction_runs_started_total", + Help: "Total number of group compaction attempts.", + }, []string{"group"}), + compactionRunsCompleted: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_compact_group_compaction_runs_completed_total", + Help: "Total number of group completed compaction runs. 
This also includes compactor group runs that resulted in no compaction.", + }, []string{"group"}), + compactionFailures: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_compact_group_compactions_failures_total", + Help: "Total number of failed group compactions.", + }, []string{"group"}), + verticalCompactions: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "thanos_compact_group_vertical_compactions_total", + Help: "Total number of group compaction attempts that resulted in a new block based on overlapping blocks.", + }, []string{"group"}), + compactorCfg: compactorCfg, + } +} + +// Groups function modified from https://github.com/cortexproject/cortex/pull/2616 +func (g *ShuffleShardingGrouper) Groups(blocks map[ulid.ULID]*metadata.Meta) (res []*compact.Group, err error) { + // First of all we have to group blocks using the Thanos default + // grouping (based on downsample resolution + external labels). + mainGroups := map[string][]*metadata.Meta{} + for _, b := range blocks { + key := compact.DefaultGroupKey(b.Thanos) + mainGroups[key] = append(mainGroups[key], b) + } + + // For each group, we have to further split it into sets of blocks + // which we can compact in parallel. + var outGroups []*compact.Group + + i := 0 + // Reset the metric tracking the remaining planned compactions + g.remainingPlannedCompactions.Set(0) + + for _, mainBlocks := range mainGroups { + for _, group := range groupBlocksByCompactableRanges(mainBlocks, g.compactorCfg.BlockRanges.ToMilliseconds()) { + // Nothing to do if we don't have at least 2 blocks. + if len(group.blocks) < 2 { + continue + } + + // TODO: Use the group's hash to determine whether a compactor should be responsible for compacting that group + groupHash := hashGroup(group.blocks[0].Thanos.Labels["__org_id__"], group.rangeStart, group.rangeEnd) + + g.remainingPlannedCompactions.Add(1) + groupKey := fmt.Sprintf("%v%d", groupHash, i) + i++ + + level.Debug(g.logger).Log("msg", "found compactable group for user", "user", group.blocks[0].Thanos.Labels["__org_id__"], "plan", group.String()) + + // All the blocks within the same group have the same downsample + // resolution and external labels. + resolution := group.blocks[0].Thanos.Downsample.Resolution + externalLabels := labels.FromMap(group.blocks[0].Thanos.Labels) + + thanosGroup, err := compact.NewGroup( + log.With(g.logger, "groupKey", groupKey, "rangeStart", group.rangeStartTime().String(), "rangeEnd", group.rangeEndTime().String(), "externalLabels", externalLabels, "downsampleResolution", resolution), + g.bkt, + groupKey, + externalLabels, + resolution, + false, // No malformed index. + true, // Enable vertical compaction. + g.compactions.WithLabelValues(groupKey), + g.compactionRunsStarted.WithLabelValues(groupKey), + g.compactionRunsCompleted.WithLabelValues(groupKey), + g.compactionFailures.WithLabelValues(groupKey), + g.verticalCompactions.WithLabelValues(groupKey), + g.garbageCollectedBlocks, + g.blocksMarkedForDeletion, + g.hashFunc, + ) + if err != nil { + return nil, errors.Wrap(err, "create compaction group") + } + + for _, m := range group.blocks { + if err := thanosGroup.AppendMeta(m); err != nil { + return nil, errors.Wrap(err, "add block to compaction group") + } + } + + outGroups = append(outGroups, thanosGroup) + } + } + + // Ensure groups are sorted by smallest range, oldest min time first. The rationale + // is that we want to favor smaller ranges first (i.e.
to deduplicate samples sooner + than later) and older ones are more likely to be "complete" (no missing block still + to be uploaded). + sort.SliceStable(outGroups, func(i, j int) bool { + iLength := outGroups[i].MaxTime() - outGroups[i].MinTime() + jLength := outGroups[j].MaxTime() - outGroups[j].MinTime() + + if iLength != jLength { + return iLength < jLength + } + if outGroups[i].MinTime() != outGroups[j].MinTime() { + return outGroups[i].MinTime() < outGroups[j].MinTime() + } + + // Guarantee stable sort for tests. + return outGroups[i].Key() < outGroups[j].Key() + }) + + return outGroups, nil +} + +// Get the hash of a group based on the UserID and the starting and ending time of the group's range. +func hashGroup(userID string, rangeStart int64, rangeEnd int64) uint32 { + groupString := fmt.Sprintf("%v%v%v", userID, rangeStart, rangeEnd) + groupHasher := fnv.New32a() + // Hasher never returns err. + _, _ = groupHasher.Write([]byte(groupString)) + groupHash := groupHasher.Sum32() + + return groupHash +} + +// blocksGroup struct and functions copied and adjusted from https://github.com/cortexproject/cortex/pull/2616 +type blocksGroup struct { + rangeStart int64 // Included. + rangeEnd int64 // Excluded. + blocks []*metadata.Meta + key string +} + +// overlaps returns whether the group range overlaps with the input group. +func (g blocksGroup) overlaps(other blocksGroup) bool { + if g.rangeStart >= other.rangeEnd || other.rangeStart >= g.rangeEnd { + return false + } + + return true +} + +func (g blocksGroup) rangeStartTime() time.Time { + return time.Unix(0, g.rangeStart*int64(time.Millisecond)).UTC() +} + +func (g blocksGroup) rangeEndTime() time.Time { + return time.Unix(0, g.rangeEnd*int64(time.Millisecond)).UTC() +} + +func (g blocksGroup) String() string { + out := strings.Builder{} + out.WriteString(fmt.Sprintf("Group range start: %d, range end: %d, key %v, blocks: ", g.rangeStart, g.rangeEnd, g.key)) + + for i, b := range g.blocks { + if i > 0 { + out.WriteString(", ") + } + + minT := time.Unix(0, b.MinTime*int64(time.Millisecond)).UTC() + maxT := time.Unix(0, b.MaxTime*int64(time.Millisecond)).UTC() + out.WriteString(fmt.Sprintf("%s (min time: %s, max time: %s)", b.ULID.String(), minT.String(), maxT.String())) + } + + return out.String() +} + +func (g blocksGroup) rangeLength() int64 { + return g.rangeEnd - g.rangeStart +} + +// minTime returns the MinTime across all blocks in the group. +func (g blocksGroup) minTime() int64 { + // Blocks are expected to be sorted by MinTime. + return g.blocks[0].MinTime +} + +// maxTime returns the MaxTime across all blocks in the group. +func (g blocksGroup) maxTime() int64 { + max := g.blocks[0].MaxTime + + for _, b := range g.blocks[1:] { + if b.MaxTime > max { + max = b.MaxTime + } + } + + return max +} + +// groupBlocksByCompactableRanges groups input blocks by compactable ranges, giving preference +// to smaller ranges. If a smaller range contains more than 1 block (and thus it should +// be compacted), the larger range block group is not generated until each of its +// smaller ranges has at most 1 block. +func groupBlocksByCompactableRanges(blocks []*metadata.Meta, ranges []int64) []blocksGroup { + if len(blocks) == 0 { + return nil + } + + // Sort blocks by min time. + sortMetasByMinTime(blocks) + + var groups []blocksGroup + + for _, tr := range ranges { + nextGroup: + for _, group := range groupBlocksByRange(blocks, tr) { + + // Exclude groups with a single block, because no compaction is required.
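+ // (A block left alone at this range can still be picked up by a larger range in a later iteration of this loop, or on a future run once neighbouring blocks have been uploaded.)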
+ if len(group.blocks) < 2 { + continue + } + + // Ensure this group's range does not overlap with any group already scheduled + // for compaction by a smaller range, because we need to guarantee that smaller ranges + // are compacted first. + for _, c := range groups { + if group.overlaps(c) { + continue nextGroup + } + } + + groups = append(groups, group) + } + } + + // Ensure we don't compact the most recent blocks prematurely when another one of + // the same size still fits in the range. To do so, we consider a group valid only + // if it's before the most recent block or if it fully covers the range. + highestMinTime := blocks[len(blocks)-1].MinTime + + for idx := 0; idx < len(groups); { + group := groups[idx] + + // If the group covers a range before the most recent block, it's fine. + if group.rangeEnd <= highestMinTime { + idx++ + continue + } + + // If the group covers the full range, it's fine. + if group.maxTime()-group.minTime() == group.rangeLength() { + idx++ + continue + } + + // We hit a group which would compact recent blocks prematurely, + // so we need to filter it out. + + groups = append(groups[:idx], groups[idx+1:]...) + } + + return groups +} + +// groupBlocksByRange splits the blocks by the time range. The range sequence starts at 0. +// Input blocks are expected to be sorted by MinTime. +// +// For example, if we have blocks [0-10, 10-20, 50-60, 90-100] and the split range tr is 30 +// it returns [0-10, 10-20], [50-60], [90-100]. +func groupBlocksByRange(blocks []*metadata.Meta, tr int64) []blocksGroup { + var ret []blocksGroup + + for i := 0; i < len(blocks); { + var ( + group blocksGroup + m = blocks[i] + ) + + group.rangeStart = getRangeStart(m, tr) + group.rangeEnd = group.rangeStart + tr + + // Skip blocks that don't fall into the range. This can happen due to misalignment, or + // when the block covers a multiple of the intended range. + if m.MaxTime > group.rangeEnd { + i++ + continue + } + + // Add all blocks to the current group that are within [t0, t0+tr]. + for ; i < len(blocks); i++ { + // If the block does not start within this group, then we should break the iteration + // and move it to the next group. + if blocks[i].MinTime >= group.rangeEnd { + break + } + + // If the block doesn't fall into this group, but it started within this group, then it + // means it spans across multiple ranges and we should skip it. + if blocks[i].MaxTime > group.rangeEnd { + continue + } + + group.blocks = append(group.blocks, blocks[i]) + } + + if len(group.blocks) > 0 { + ret = append(ret, group) + } + } + + return ret +} + +func getRangeStart(m *metadata.Meta, tr int64) int64 { + // Compute start of aligned time range of size tr closest to the current block's start. + // This code has been copied from TSDB.
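+ // For example, with tr=20: a block starting at MinTime=25 aligns to rangeStart=20, while a block at MinTime=-5 must align to rangeStart=-20, which is why the negative branch below adjusts the division to floor toward negative infinity instead of letting Go truncate toward zero.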
+ if m.MinTime >= 0 { + return tr * (m.MinTime / tr) + } + + return tr * ((m.MinTime - tr + 1) / tr) +} + +func sortMetasByMinTime(metas []*metadata.Meta) { + sort.Slice(metas, func(i, j int) bool { + return metas[i].BlockMeta.MinTime < metas[j].BlockMeta.MinTime + }) +} diff --git a/pkg/compactor/shuffle_sharding_grouper_test.go b/pkg/compactor/shuffle_sharding_grouper_test.go new file mode 100644 index 0000000000..dd1a9eb3c5 --- /dev/null +++ b/pkg/compactor/shuffle_sharding_grouper_test.go @@ -0,0 +1,497 @@ +package compactor + +import ( + "testing" + "time" + + "github.com/oklog/ulid" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/tsdb" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thanos-io/thanos/pkg/block/metadata" +) + +func TestTimeShardingGrouper_Groups(t *testing.T) { + block1ulid := ulid.MustNew(1, nil) + block2ulid := ulid.MustNew(2, nil) + block3ulid := ulid.MustNew(3, nil) + block4ulid := ulid.MustNew(4, nil) + block5ulid := ulid.MustNew(5, nil) + block6ulid := ulid.MustNew(6, nil) + block7ulid := ulid.MustNew(7, nil) + block8ulid := ulid.MustNew(8, nil) + block9ulid := ulid.MustNew(9, nil) + block10ulid := ulid.MustNew(10, nil) + block11ulid := ulid.MustNew(11, nil) + block12ulid := ulid.MustNew(12, nil) + block13ulid := ulid.MustNew(13, nil) + block14ulid := ulid.MustNew(14, nil) + block15ulid := ulid.MustNew(15, nil) + + blocks := + map[ulid.ULID]*metadata.Meta{ + block1ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block1ulid, MinTime: 1 * time.Hour.Milliseconds(), MaxTime: 2 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "1"}}, + }, + block2ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block2ulid, MinTime: 3 * time.Hour.Milliseconds(), MaxTime: 4 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "1"}}, + }, + block3ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block3ulid, MinTime: 0 * time.Hour.Milliseconds(), MaxTime: 1 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "1"}}, + }, + block4ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block4ulid, MinTime: 2 * time.Hour.Milliseconds(), MaxTime: 3 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "1"}}, + }, + block5ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block5ulid, MinTime: 1 * time.Hour.Milliseconds(), MaxTime: 2 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "2"}}, + }, + block6ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block6ulid, MinTime: 0 * time.Hour.Milliseconds(), MaxTime: 1 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "2"}}, + }, + block7ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block7ulid, MinTime: 0 * time.Hour.Milliseconds(), MaxTime: 1 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "1"}}, + }, + block8ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block8ulid, MinTime: 0 * time.Hour.Milliseconds(), MaxTime: 1 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "2"}}, + }, + block9ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block9ulid, MinTime: 0 * time.Hour.Milliseconds(), MaxTime: 1 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: 
map[string]string{"__org_id__": "3"}}, + }, + block10ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block10ulid, MinTime: 4 * time.Hour.Milliseconds(), MaxTime: 6 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "2"}}, + }, + block11ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block11ulid, MinTime: 6 * time.Hour.Milliseconds(), MaxTime: 8 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "2"}}, + }, + block12ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block12ulid, MinTime: 1 * time.Hour.Milliseconds(), MaxTime: 2 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "1"}}, + }, + block13ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block13ulid, MinTime: 0 * time.Hour.Milliseconds(), MaxTime: 20 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "1"}}, + }, + block14ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block14ulid, MinTime: 21 * time.Hour.Milliseconds(), MaxTime: 40 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "1"}}, + }, + block15ulid: { + BlockMeta: tsdb.BlockMeta{ULID: block15ulid, MinTime: 21 * time.Hour.Milliseconds(), MaxTime: 40 * time.Hour.Milliseconds()}, + Thanos: metadata.Thanos{Labels: map[string]string{"__org_id__": "1"}}, + }, + } + + tests := map[string]struct { + ranges []time.Duration + blocks map[ulid.ULID]*metadata.Meta + expected [][]ulid.ULID + }{ + "test basic grouping": { + ranges: []time.Duration{2 * time.Hour, 4 * time.Hour}, + blocks: map[ulid.ULID]*metadata.Meta{block1ulid: blocks[block1ulid], block2ulid: blocks[block2ulid], block3ulid: blocks[block3ulid], block4ulid: blocks[block4ulid], block5ulid: blocks[block5ulid], block6ulid: blocks[block6ulid]}, + expected: [][]ulid.ULID{ + {block5ulid, block6ulid}, + {block1ulid, block3ulid}, + {block2ulid, block4ulid}, + }, + }, + "test no compaction": { + ranges: []time.Duration{2 * time.Hour, 4 * time.Hour}, + blocks: map[ulid.ULID]*metadata.Meta{block7ulid: blocks[block7ulid], block8ulid: blocks[block8ulid], block9ulid: blocks[block9ulid]}, + expected: [][]ulid.ULID{}, + }, + "test smallest range first": { + ranges: []time.Duration{2 * time.Hour, 4 * time.Hour}, + blocks: map[ulid.ULID]*metadata.Meta{block1ulid: blocks[block1ulid], block2ulid: blocks[block2ulid], block3ulid: blocks[block3ulid], block4ulid: blocks[block4ulid], block10ulid: blocks[block10ulid], block11ulid: blocks[block11ulid]}, + expected: [][]ulid.ULID{ + {block1ulid, block3ulid}, + {block2ulid, block4ulid}, + {block10ulid, block11ulid}, + }, + }, + "test oldest min time first": { + ranges: []time.Duration{2 * time.Hour, 4 * time.Hour}, + blocks: map[ulid.ULID]*metadata.Meta{block1ulid: blocks[block1ulid], block2ulid: blocks[block2ulid], block3ulid: blocks[block3ulid], block4ulid: blocks[block4ulid], block12ulid: blocks[block12ulid]}, + expected: [][]ulid.ULID{ + {block1ulid, block3ulid, block12ulid}, + {block2ulid, block4ulid}, + }, + }, + "test overlapping blocks": { + ranges: []time.Duration{20 * time.Hour, 40 * time.Hour}, + blocks: map[ulid.ULID]*metadata.Meta{block13ulid: blocks[block13ulid], block14ulid: blocks[block14ulid], block15ulid: blocks[block15ulid]}, + expected: [][]ulid.ULID{}, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + compactorCfg := &Config{ + BlockRanges: testData.ranges, + } + + g := NewShuffleShardingGrouper(nil, + nil, + false, // Do not accept malformed indexes + 
true, // Enable vertical compaction + prometheus.NewRegistry(), + nil, + nil, + prometheus.NewGauge(prometheus.GaugeOpts{Name: "test_remaining_planned_compactions"}), // Groups() calls Set/Add on this gauge, so it must be non-nil; the metric name here is arbitrary for the test. + metadata.NoneFunc, + *compactorCfg) + actual, err := g.Groups(testData.blocks) + require.NoError(t, err) + require.Len(t, actual, len(testData.expected)) + + for idx, expectedIDs := range testData.expected { + assert.Equal(t, expectedIDs, actual[idx].IDs()) + } + }) + } +} + +func TestGroupBlocksByCompactableRanges(t *testing.T) { + tests := map[string]struct { + ranges []int64 + blocks []*metadata.Meta + expected []blocksGroup + }{ + "no input blocks": { + ranges: []int64{20}, + blocks: nil, + expected: nil, + }, + "only 1 block in input": { + ranges: []int64{20}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + }, + expected: nil, + }, + "only 1 block for each range (single range)": { + ranges: []int64{20}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 60}}, + }, + expected: nil, + }, + "only 1 block for each range (multiple ranges)": { + ranges: []int64{10, 20}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 60}}, + }, + expected: nil, + }, + "input blocks can be compacted on the 1st range only": { + ranges: []int64{20, 40}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 25, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 30, MaxTime: 40}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 50}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 50, MaxTime: 60}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 60, MaxTime: 70}}, + }, + expected: []blocksGroup{ + {rangeStart: 20, rangeEnd: 40, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 25, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 30, MaxTime: 40}}, + }}, + {rangeStart: 40, rangeEnd: 60, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 50}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 50, MaxTime: 60}}, + }}, + }, + }, + "input blocks can be compacted on the 2nd range only": { + ranges: []int64{10, 20}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 30, MaxTime: 40}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 60}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 60, MaxTime: 70}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 70, MaxTime: 80}}, + }, + expected: []blocksGroup{ + {rangeStart: 20, rangeEnd: 40, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 30, MaxTime: 40}}, + }}, + {rangeStart: 60, rangeEnd: 80, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 60, MaxTime: 70}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 70, MaxTime: 80}}, + }}, + }, + }, + "input blocks can be compacted on a mix of 1st and 2nd ranges, guaranteeing no overlaps and giving preference to smaller ranges": { + ranges: []int64{10, 20}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 10}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 7, MaxTime: 10}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime:
20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 30, MaxTime: 40}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 60}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 60, MaxTime: 70}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 70, MaxTime: 80}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 75, MaxTime: 80}}, + }, + expected: []blocksGroup{ + {rangeStart: 0, rangeEnd: 10, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 10}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 7, MaxTime: 10}}, + }}, + {rangeStart: 70, rangeEnd: 80, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 70, MaxTime: 80}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 75, MaxTime: 80}}, + }}, + {rangeStart: 20, rangeEnd: 40, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 30, MaxTime: 40}}, + }}, + }, + }, + "input blocks have already been compacted with the largest range": { + ranges: []int64{10, 20, 40}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 40}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 70}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 80, MaxTime: 120}}, + }, + expected: nil, + }, + "input blocks match the largest range but can be compacted because they overlap": { + ranges: []int64{10, 20, 40}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 40}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 70}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 80, MaxTime: 120}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 80, MaxTime: 120}}, + }, + expected: []blocksGroup{ + {rangeStart: 80, rangeEnd: 120, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 80, MaxTime: 120}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 80, MaxTime: 120}}, + }}, + }, + }, + "a block with time range crossing two 1st level ranges should NOT be considered for 1st level compaction": { + ranges: []int64{20, 40}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 30}}, // This block spans across two 1st level ranges. + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 30, MaxTime: 40}}, + }, + expected: []blocksGroup{ + {rangeStart: 20, rangeEnd: 40, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 30, MaxTime: 40}}, + }}, + }, + }, + "a block with time range crossing two 1st level ranges should BE considered for 2nd level compaction": { + ranges: []int64{20, 40}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 30}}, // This block spans across two 1st level ranges. + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 40}}, + }, + expected: []blocksGroup{ + {rangeStart: 0, rangeEnd: 40, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 30}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 40}}, + }}, + }, + }, + "a block with time range larger than the largest compaction range should NOT be considered for compaction": { + ranges: []int64{10, 20, 40}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 40}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 30, MaxTime: 150}}, // This block is larger than the largest compaction range. + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 70}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 80, MaxTime: 120}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 80, MaxTime: 120}}, + }, + expected: []blocksGroup{ + {rangeStart: 80, rangeEnd: 120, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 80, MaxTime: 120}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 80, MaxTime: 120}}, + }}, + }, + }, + "a range containing the most recent block shouldn't be prematurely compacted if it doesn't cover the full range": { + ranges: []int64{10, 20, 40}, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 5, MaxTime: 8}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 7, MaxTime: 9}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 12}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 13, MaxTime: 15}}, + }, + expected: []blocksGroup{ + {rangeStart: 0, rangeEnd: 10, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 5, MaxTime: 8}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 7, MaxTime: 9}}, + }}, + }, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + assert.Equal(t, testData.expected, groupBlocksByCompactableRanges(testData.blocks, testData.ranges)) + }) + } +} + +func TestGroupBlocksByRange(t *testing.T) { + tests := map[string]struct { + timeRange int64 + blocks []*metadata.Meta + expected []blocksGroup + }{ + "no input blocks": { + timeRange: 20, + blocks: nil, + expected: nil, + }, + "only 1 block in input": { + timeRange: 20, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + }, + expected: []blocksGroup{ + {rangeStart: 0, rangeEnd: 20, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + }}, + }, + }, + "only 1 block per range": { + timeRange: 20, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 15}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 60}}, + }, + expected: []blocksGroup{ + {rangeStart: 0, rangeEnd: 20, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 15}}, + }}, + {rangeStart: 40, rangeEnd: 60, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 60}}, + }}, + }, + }, + "multiple blocks per range": { + timeRange: 20, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 15}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 60}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 50, MaxTime: 55}}, + }, + expected: []blocksGroup{ + {rangeStart: 0, rangeEnd: 20, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 15}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + }}, + {rangeStart: 40, rangeEnd: 60, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 40, MaxTime: 60}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 50, MaxTime: 55}}, + }}, + }, + }, + "a block with time range larger than the range should be excluded": { + timeRange: 20, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 40}}, // This block is larger than the range.
+ {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + }, + expected: []blocksGroup{ + {rangeStart: 0, rangeEnd: 20, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + }}, + {rangeStart: 20, rangeEnd: 40, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + }}, + }, + }, + "blocks with different time ranges but all fitting within the input range": { + timeRange: 40, + blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 40}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + }, + expected: []blocksGroup{ + {rangeStart: 0, rangeEnd: 40, blocks: []*metadata.Meta{ + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 0, MaxTime: 40}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 10, MaxTime: 20}}, + {BlockMeta: tsdb.BlockMeta{MinTime: 20, MaxTime: 30}}, + }}, + }, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + assert.Equal(t, testData.expected, groupBlocksByRange(testData.blocks, testData.timeRange)) + }) + } +} + +func TestBlocksGroup_overlaps(t *testing.T) { + tests := []struct { + first blocksGroup + second blocksGroup + expected bool + }{ + { + first: blocksGroup{rangeStart: 10, rangeEnd: 20}, + second: blocksGroup{rangeStart: 20, rangeEnd: 30}, + expected: false, + }, { + first: blocksGroup{rangeStart: 10, rangeEnd: 20}, + second: blocksGroup{rangeStart: 19, rangeEnd: 30}, + expected: true, + }, { + first: blocksGroup{rangeStart: 10, rangeEnd: 21}, + second: blocksGroup{rangeStart: 20, rangeEnd: 30}, + expected: true, + }, { + first: blocksGroup{rangeStart: 10, rangeEnd: 20}, + second: blocksGroup{rangeStart: 12, rangeEnd: 18}, + expected: true, + }, + } + + for _, tc := range tests { + assert.Equal(t, tc.expected, tc.first.overlaps(tc.second)) + assert.Equal(t, tc.expected, tc.second.overlaps(tc.first)) + } +} diff --git a/pkg/compactor/shuffle_sharding_planner.go b/pkg/compactor/shuffle_sharding_planner.go new file mode 100644 index 0000000000..96e70f59a5 --- /dev/null +++ b/pkg/compactor/shuffle_sharding_planner.go @@ -0,0 +1,39 @@ +package compactor + +import ( + "context" + "fmt" + + "github.com/go-kit/kit/log" + "github.com/thanos-io/thanos/pkg/block/metadata" +) + +type ShuffleShardingPlanner struct { + logger log.Logger + ranges []int64 +} + +func NewShuffleShardingPlanner(logger log.Logger, ranges []int64) *ShuffleShardingPlanner { + return &ShuffleShardingPlanner{ + logger: logger, + ranges: ranges, + } +} + +func (p *ShuffleShardingPlanner) Plan(_ context.Context, metasByMinTime []*metadata.Meta) ([]*metadata.Meta, error) { + // Ensure all blocks fit within the largest range. This is a double check + // to ensure there's no bug in the previous blocks grouping, given this Plan() + // is just a pass-through.
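+ // For example, with ranges of 2h and 24h (in milliseconds) the largest range is 24h: every block in the plan must lie inside the aligned 24h window containing the first (oldest) block, otherwise the grouper produced an invalid plan and Plan() returns an error.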
+ // Modified from https://github.com/cortexproject/cortex/pull/2616/files#diff-e3051fc530c48bb276ba958dd8fadc684e546bd7964e6bc75cef9a86ef8df344R28-R63 + largestRange := p.ranges[len(p.ranges)-1] + rangeStart := getRangeStart(metasByMinTime[0], largestRange) + rangeEnd := rangeStart + largestRange + + for _, b := range metasByMinTime { + if b.MinTime < rangeStart || b.MaxTime > rangeEnd { + return nil, fmt.Errorf("block %s with time range %d:%d is outside the largest expected range %d:%d", b.ULID.String(), b.MinTime, b.MaxTime, rangeStart, rangeEnd) + } + } + + return metasByMinTime, nil +} diff --git a/pkg/cortex/cortex.go b/pkg/cortex/cortex.go index 3f19861111..76fb18e0f0 100644 --- a/pkg/cortex/cortex.go +++ b/pkg/cortex/cortex.go @@ -170,7 +170,7 @@ func (c *Config) RegisterFlags(f *flag.FlagSet) { c.Alertmanager.RegisterFlags(f) c.AlertmanagerStorage.RegisterFlags(f) c.RuntimeConfig.RegisterFlags(f) - c.MemberlistKV.RegisterFlags(f, "") + c.MemberlistKV.RegisterFlags(f) c.QueryScheduler.RegisterFlags(f) // These don't seem to have a home. diff --git a/pkg/distributor/distributor_ring.go b/pkg/distributor/distributor_ring.go index ee96a7d37c..3655aa9985 100644 --- a/pkg/distributor/distributor_ring.go +++ b/pkg/distributor/distributor_ring.go @@ -42,8 +42,8 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { // Ring flags cfg.KVStore.RegisterFlagsWithPrefix("distributor.ring.", "collectors/", f) - f.DurationVar(&cfg.HeartbeatPeriod, "distributor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.") - f.DurationVar(&cfg.HeartbeatTimeout, "distributor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which distributors are considered unhealthy within the ring.") + f.DurationVar(&cfg.HeartbeatPeriod, "distributor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.") + f.DurationVar(&cfg.HeartbeatTimeout, "distributor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which distributors are considered unhealthy within the ring. 0 = never (timeout disabled).") // Instance flags cfg.InstanceInterfaceNames = []string{"eth0", "en0"} diff --git a/pkg/frontend/transport/handler.go b/pkg/frontend/transport/handler.go index 8fcf8e9630..435a022748 100644 --- a/pkg/frontend/transport/handler.go +++ b/pkg/frontend/transport/handler.go @@ -173,6 +173,7 @@ func (f *Handler) reportQueryStats(r *http.Request, queryString url.Values, quer // Log stats.
logMessage := append([]interface{}{ "msg", "query stats", + "component", "query-frontend", "method", r.Method, "path", r.URL.Path, "response_time", queryResponseTime, diff --git a/pkg/querier/batch/batch_test.go b/pkg/querier/batch/batch_test.go index baacbcc719..9b9ffec16e 100644 --- a/pkg/querier/batch/batch_test.go +++ b/pkg/querier/batch/batch_test.go @@ -27,6 +27,10 @@ func BenchmarkNewChunkMergeIterator_CreateAndIterate(b *testing.B) { {numChunks: 1000, numSamplesPerChunk: 100, duplicationFactor: 3, enc: promchunk.DoubleDelta}, {numChunks: 1000, numSamplesPerChunk: 100, duplicationFactor: 1, enc: promchunk.PrometheusXorChunk}, {numChunks: 1000, numSamplesPerChunk: 100, duplicationFactor: 3, enc: promchunk.PrometheusXorChunk}, + {numChunks: 100, numSamplesPerChunk: 100, duplicationFactor: 1, enc: promchunk.PrometheusXorChunk}, + {numChunks: 100, numSamplesPerChunk: 100, duplicationFactor: 3, enc: promchunk.PrometheusXorChunk}, + {numChunks: 1, numSamplesPerChunk: 100, duplicationFactor: 1, enc: promchunk.PrometheusXorChunk}, + {numChunks: 1, numSamplesPerChunk: 100, duplicationFactor: 3, enc: promchunk.PrometheusXorChunk}, } for _, scenario := range scenarios { diff --git a/pkg/querier/batch/chunk_test.go b/pkg/querier/batch/chunk_test.go index 40a3feb4b2..e672811827 100644 --- a/pkg/querier/batch/chunk_test.go +++ b/pkg/querier/batch/chunk_test.go @@ -59,6 +59,7 @@ func mkChunk(t require.TestingT, from model.Time, points int, enc promchunk.Enco require.Nil(t, npc) ts = ts.Add(step) } + ts = ts.Add(-step) // undo the add that we did just before exiting the loop return chunk.NewChunk(userID, fp, metric, pc, from, ts) } diff --git a/pkg/querier/batch/merge.go b/pkg/querier/batch/merge.go index 88220bd727..7764b37467 100644 --- a/pkg/querier/batch/merge.go +++ b/pkg/querier/batch/merge.go @@ -31,8 +31,8 @@ func newMergeIterator(cs []GenericChunk) *mergeIterator { c := &mergeIterator{ its: its, h: make(iteratorHeap, 0, len(its)), - batches: make(batchStream, 0, len(its)*2*promchunk.BatchSize), - batchesBuf: make(batchStream, len(its)*2*promchunk.BatchSize), + batches: make(batchStream, 0, len(its)), + batchesBuf: make(batchStream, len(its)), } for _, iter := range c.its { @@ -112,8 +112,7 @@ func (c *mergeIterator) buildNextBatch(size int) bool { for len(c.h) > 0 && (len(c.batches) == 0 || c.nextBatchEndTime() >= c.h[0].AtTime()) { c.nextBatchBuf[0] = c.h[0].Batch() c.batchesBuf = mergeStreams(c.batches, c.nextBatchBuf[:], c.batchesBuf, size) - copy(c.batches[:len(c.batchesBuf)], c.batchesBuf) - c.batches = c.batches[:len(c.batchesBuf)] + c.batches = append(c.batches[:0], c.batchesBuf...) 
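+ // Unlike the copy()+reslice it replaces, append() can grow the reused batches slice past its now much smaller initial capacity, which is what keeps the reduced preallocation in newMergeIterator safe when many batches overlap.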
if c.h[0].Next(size) { heap.Fix(&c.h, 0) diff --git a/pkg/ring/basic_lifecycler.go b/pkg/ring/basic_lifecycler.go index 6c19d72c28..64f5f6002c 100644 --- a/pkg/ring/basic_lifecycler.go +++ b/pkg/ring/basic_lifecycler.go @@ -13,6 +13,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/cortexproject/cortex/pkg/ring/kv" + "github.com/cortexproject/cortex/pkg/util" util_log "github.com/cortexproject/cortex/pkg/util/log" "github.com/cortexproject/cortex/pkg/util/services" ) @@ -182,12 +183,12 @@ func (l *BasicLifecycler) starting(ctx context.Context) error { } func (l *BasicLifecycler) running(ctx context.Context) error { - heartbeatTicker := time.NewTicker(l.cfg.HeartbeatPeriod) - defer heartbeatTicker.Stop() + heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(l.cfg.HeartbeatPeriod) + defer heartbeatTickerStop() for { select { - case <-heartbeatTicker.C: + case <-heartbeatTickerChan: l.heartbeat(ctx) case f := <-l.actorChan: @@ -214,13 +215,13 @@ func (l *BasicLifecycler) stopping(runningError error) error { }() // Heartbeat while the stopping delegate function is running. - heartbeatTicker := time.NewTicker(l.cfg.HeartbeatPeriod) - defer heartbeatTicker.Stop() + heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(l.cfg.HeartbeatPeriod) + defer heartbeatTickerStop() heartbeatLoop: for { select { - case <-heartbeatTicker.C: + case <-heartbeatTickerChan: l.heartbeat(context.Background()) case <-done: break heartbeatLoop @@ -292,8 +293,8 @@ func (l *BasicLifecycler) registerInstance(ctx context.Context) error { } func (l *BasicLifecycler) waitStableTokens(ctx context.Context, period time.Duration) error { - heartbeatTicker := time.NewTicker(l.cfg.HeartbeatPeriod) - defer heartbeatTicker.Stop() + heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(l.cfg.HeartbeatPeriod) + defer heartbeatTickerStop() // The first observation will occur after the specified period. level.Info(l.logger).Log("msg", "waiting stable tokens", "ring", l.ringName) @@ -312,7 +313,7 @@ func (l *BasicLifecycler) waitStableTokens(ctx context.Context, period time.Dura level.Info(l.logger).Log("msg", "tokens verification succeeded", "ring", l.ringName) return nil - case <-heartbeatTicker.C: + case <-heartbeatTickerChan: l.heartbeat(ctx) case <-ctx.Done(): diff --git a/pkg/ring/kv/memberlist/kv_init_service.go b/pkg/ring/kv/memberlist/kv_init_service.go index da4d965b7c..517c674281 100644 --- a/pkg/ring/kv/memberlist/kv_init_service.go +++ b/pkg/ring/kv/memberlist/kv_init_service.go @@ -16,7 +16,6 @@ import ( "github.com/hashicorp/memberlist" "go.uber.org/atomic" - "github.com/cortexproject/cortex/pkg/ring/kv/codec" "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/services" ) @@ -106,12 +105,12 @@ func (kvs *KVInitService) ServeHTTP(w http.ResponseWriter, req *http.Request) { if err := req.ParseForm(); err == nil { if req.Form[downloadKeyParam] != nil { - downloadKey(w, kv.storeCopy(), req.Form[downloadKeyParam][0]) // Use first value, ignore the rest. + downloadKey(w, kv, kv.storeCopy(), req.Form[downloadKeyParam][0]) // Use first value, ignore the rest. 
return } if req.Form[viewKeyParam] != nil { - viewKey(w, kv, kv.storeCopy(), req.Form[viewKeyParam][0], getFormat(req)) + viewKey(w, kv.storeCopy(), req.Form[viewKeyParam][0], getFormat(req)) return } @@ -179,30 +178,25 @@ func viewMessage(w http.ResponseWriter, kv *KV, msg message, format string) { return } - formatValue(w, c, msg.Pair.Value, format) + val, err := c.Decode(msg.Pair.Value) + if err != nil { + http.Error(w, fmt.Sprintf("failed to decode: %v", err), http.StatusInternalServerError) + return + } + + formatValue(w, val, format) } -func viewKey(w http.ResponseWriter, kv *KV, store map[string]valueDesc, key string, format string) { +func viewKey(w http.ResponseWriter, store map[string]valueDesc, key string, format string) { if store[key].value == nil { http.Error(w, "value not found", http.StatusNotFound) return } - c := kv.GetCodec(store[key].codecID) - if c == nil { - http.Error(w, "codec not found", http.StatusNotFound) - return - } - - formatValue(w, c, store[key].value, format) + formatValue(w, store[key].value, format) } -func formatValue(w http.ResponseWriter, codec codec.Codec, value []byte, format string) { - val, err := codec.Decode(value) - if err != nil { - http.Error(w, fmt.Sprintf("failed to decode: %v", err), http.StatusInternalServerError) - return - } +func formatValue(w http.ResponseWriter, val interface{}, format string) { w.WriteHeader(200) w.Header().Add("content-type", "text/plain") @@ -214,7 +208,7 @@ func formatValue(w http.ResponseWriter, codec codec.Codec, value []byte, format enc.SetIndent("", " ") } - err = enc.Encode(val) + err := enc.Encode(val) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) } @@ -224,7 +218,7 @@ func formatValue(w http.ResponseWriter, codec codec.Codec, value []byte, format } } -func downloadKey(w http.ResponseWriter, store map[string]valueDesc, key string) { +func downloadKey(w http.ResponseWriter, kv *KV, store map[string]valueDesc, key string) { if store[key].value == nil { http.Error(w, "value not found", http.StatusNotFound) return @@ -232,14 +226,26 @@ func downloadKey(w http.ResponseWriter, store map[string]valueDesc, key string) val := store[key] + c := kv.GetCodec(store[key].codecID) + if c == nil { + http.Error(w, "codec not found", http.StatusNotFound) + return + } + + encoded, err := c.Encode(val.value) + if err != nil { + http.Error(w, fmt.Sprintf("failed to encode: %v", err), http.StatusInternalServerError) + return + } + w.Header().Add("content-type", "application/octet-stream") // Set content-length so that client knows whether it has received full response or not. - w.Header().Add("content-length", strconv.Itoa(len(val.value))) + w.Header().Add("content-length", strconv.Itoa(len(encoded))) w.Header().Add("content-disposition", fmt.Sprintf("attachment; filename=%d-%s", val.version, key)) w.WriteHeader(200) // Ignore errors, we cannot do anything about them. 
- _, _ = w.Write(val.value) + _, _ = w.Write(encoded) } type pageData struct { diff --git a/pkg/ring/kv/memberlist/kv_init_service_test.go b/pkg/ring/kv/memberlist/kv_init_service_test.go index a6e7001f0d..221da9a66f 100644 --- a/pkg/ring/kv/memberlist/kv_init_service_test.go +++ b/pkg/ring/kv/memberlist/kv_init_service_test.go @@ -7,6 +7,8 @@ import ( "github.com/hashicorp/memberlist" "github.com/stretchr/testify/require" + + "github.com/cortexproject/cortex/pkg/util/flagext" ) func TestPage(t *testing.T) { @@ -53,6 +55,7 @@ func TestPage(t *testing.T) { func TestStop(t *testing.T) { var cfg KVConfig + flagext.DefaultValues(&cfg) kvinit := NewKVInitService(&cfg, nil) require.NoError(t, kvinit.stopping(nil)) } diff --git a/pkg/ring/kv/memberlist/memberlist_client.go b/pkg/ring/kv/memberlist/memberlist_client.go index e2d9ede504..79432d4668 100644 --- a/pkg/ring/kv/memberlist/memberlist_client.go +++ b/pkg/ring/kv/memberlist/memberlist_client.go @@ -136,6 +136,7 @@ type KVConfig struct { GossipNodes int `yaml:"gossip_nodes"` GossipToTheDeadTime time.Duration `yaml:"gossip_to_dead_nodes_time"` DeadNodeReclaimTime time.Duration `yaml:"dead_node_reclaim_time"` + EnableCompression bool `yaml:"compression_enabled"` // List of members to join JoinMembers flagext.StringSlice `yaml:"join_members"` @@ -165,12 +166,14 @@ type KVConfig struct { } // RegisterFlags registers flags. -func (cfg *KVConfig) RegisterFlags(f *flag.FlagSet, prefix string) { +func (cfg *KVConfig) RegisterFlagsWithPrefix(f *flag.FlagSet, prefix string) { + mlDefaults := defaultMemberlistConfig() + // "Defaults to hostname" -- memberlist sets it to hostname by default. f.StringVar(&cfg.NodeName, prefix+"memberlist.nodename", "", "Name of the node in memberlist cluster. Defaults to hostname.") // memberlist.DefaultLANConfig will put hostname here. f.BoolVar(&cfg.RandomizeNodeName, prefix+"memberlist.randomize-node-name", true, "Add random suffix to the node name.") - f.DurationVar(&cfg.StreamTimeout, prefix+"memberlist.stream-timeout", 0, "The timeout for establishing a connection with a remote node, and for read/write operations. Uses memberlist LAN defaults if 0.") - f.IntVar(&cfg.RetransmitMult, prefix+"memberlist.retransmit-factor", 0, "Multiplication factor used when sending out messages (factor * log(N+1)).") + f.DurationVar(&cfg.StreamTimeout, prefix+"memberlist.stream-timeout", mlDefaults.TCPTimeout, "The timeout for establishing a connection with a remote node, and for read/write operations.") + f.IntVar(&cfg.RetransmitMult, prefix+"memberlist.retransmit-factor", mlDefaults.RetransmitMult, "Multiplication factor used when sending out messages (factor * log(N+1)).") f.Var(&cfg.JoinMembers, prefix+"memberlist.join", "Other cluster members to join. Can be specified multiple times. It can be an IP, hostname or an entry specified in the DNS Service Discovery format (see https://cortexmetrics.io/docs/configuration/arguments/#dns-service-discovery for more details).") f.DurationVar(&cfg.MinJoinBackoff, prefix+"memberlist.min-join-backoff", 1*time.Second, "Min backoff duration to join other cluster members.") f.DurationVar(&cfg.MaxJoinBackoff, prefix+"memberlist.max-join-backoff", 1*time.Minute, "Max backoff duration to join other cluster members.") @@ -179,16 +182,21 @@ func (cfg *KVConfig) RegisterFlags(f *flag.FlagSet, prefix string) { f.DurationVar(&cfg.RejoinInterval, prefix+"memberlist.rejoin-interval", 0, "If not 0, how often to rejoin the cluster. 
Occasional rejoin can help to fix the cluster split issue, and is harmless otherwise. For example, when using only a few components as seed nodes (via -memberlist.join), it's recommended to use rejoin. If -memberlist.join points to a dynamic service that resolves to all gossiping nodes (e.g. a Kubernetes headless service), then rejoin is not needed.")
 	f.DurationVar(&cfg.LeftIngestersTimeout, prefix+"memberlist.left-ingesters-timeout", 5*time.Minute, "How long to keep LEFT ingesters in the ring.")
 	f.DurationVar(&cfg.LeaveTimeout, prefix+"memberlist.leave-timeout", 5*time.Second, "Timeout for leaving memberlist cluster.")
-	f.DurationVar(&cfg.GossipInterval, prefix+"memberlist.gossip-interval", 0, "How often to gossip. Uses memberlist LAN defaults if 0.")
-	f.IntVar(&cfg.GossipNodes, prefix+"memberlist.gossip-nodes", 0, "How many nodes to gossip to. Uses memberlist LAN defaults if 0.")
-	f.DurationVar(&cfg.PushPullInterval, prefix+"memberlist.pullpush-interval", 0, "How often to use pull/push sync. Uses memberlist LAN defaults if 0.")
-	f.DurationVar(&cfg.GossipToTheDeadTime, prefix+"memberlist.gossip-to-dead-nodes-time", 0, "How long to keep gossiping to dead nodes, to give them chance to refute their death. Uses memberlist LAN defaults if 0.")
-	f.DurationVar(&cfg.DeadNodeReclaimTime, prefix+"memberlist.dead-node-reclaim-time", 0, "How soon can dead node's name be reclaimed with new address. Defaults to 0, which is disabled.")
+	f.DurationVar(&cfg.GossipInterval, prefix+"memberlist.gossip-interval", mlDefaults.GossipInterval, "How often to gossip.")
+	f.IntVar(&cfg.GossipNodes, prefix+"memberlist.gossip-nodes", mlDefaults.GossipNodes, "How many nodes to gossip to.")
+	f.DurationVar(&cfg.PushPullInterval, prefix+"memberlist.pullpush-interval", mlDefaults.PushPullInterval, "How often to use pull/push sync.")
+	f.DurationVar(&cfg.GossipToTheDeadTime, prefix+"memberlist.gossip-to-dead-nodes-time", mlDefaults.GossipToTheDeadTime, "How long to keep gossiping to dead nodes, to give them a chance to refute their death.")
+	f.DurationVar(&cfg.DeadNodeReclaimTime, prefix+"memberlist.dead-node-reclaim-time", mlDefaults.DeadNodeReclaimTime, "How soon a dead node's name can be reclaimed with a new address. 0 to disable.")
 	f.IntVar(&cfg.MessageHistoryBufferBytes, prefix+"memberlist.message-history-buffer-bytes", 0, "How much space to use for keeping received and sent messages in memory for troubleshooting (two buffers). 0 to disable.")
+	f.BoolVar(&cfg.EnableCompression, prefix+"memberlist.compression-enabled", mlDefaults.EnableCompression, "Enable message compression. This can be used to reduce bandwidth usage at the cost of slightly more CPU utilization.")
 
 	cfg.TCPTransport.RegisterFlags(f, prefix)
 }
 
+func (cfg *KVConfig) RegisterFlags(f *flag.FlagSet) {
+	cfg.RegisterFlagsWithPrefix(f, "")
+}
+
 func generateRandomSuffix() string {
 	suffix := make([]byte, 4)
 	_, err := rand.Read(suffix)
@@ -259,7 +267,6 @@ type KV struct {
 	watchPrefixDroppedNotifications *prometheus.CounterVec
 
 	storeValuesDesc *prometheus.Desc
-	storeSizesDesc  *prometheus.Desc
 
 	storeTombstones        *prometheus.GaugeVec
 	storeRemovedTombstones *prometheus.CounterVec
@@ -286,9 +293,11 @@ type message struct {
 }
 
 type valueDesc struct {
-	// We store bytes here. Reason is that clients calling CAS function will modify the object in place,
-	// but unless CAS succeeds, we don't want those modifications to be visible.
-	value []byte
+	// We store the decoded value here to prevent decoding the entire state for every
+	// update we receive. Whilst the updates are small and fast to decode,
+	// the total state can be quite large.
+	// The CAS function is passed a deep copy because it modifies the value in place.
+	value Mergeable
 
 	// version (local only) is used to keep track of what we're gossiping about, and invalidate old messages
 	version uint
@@ -297,8 +306,16 @@ type valueDesc struct {
 	codecID string
 }
 
+func (v valueDesc) Clone() (result valueDesc) {
+	result = v
+	if v.value != nil {
+		result.value = v.value.Clone()
+	}
+	return
+}
+
 func (v valueDesc) String() string {
-	return fmt.Sprintf("size: %d, version: %d, codec: %s", len(v.value), v.version, v.codecID)
+	return fmt.Sprintf("version: %d, codec: %s", v.version, v.codecID)
 }
 
 var (
@@ -345,36 +362,28 @@ func NewKV(cfg KVConfig, logger log.Logger) *KV {
 	return mlkv
 }
 
+func defaultMemberlistConfig() *memberlist.Config {
+	return memberlist.DefaultLANConfig()
+}
+
 func (m *KV) buildMemberlistConfig() (*memberlist.Config, error) {
 	tr, err := NewTCPTransport(m.cfg.TCPTransport, m.logger)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create transport: %v", err)
 	}
 
-	mlCfg := memberlist.DefaultLANConfig()
+	mlCfg := defaultMemberlistConfig()
 	mlCfg.Delegate = m
 
-	if m.cfg.StreamTimeout != 0 {
-		mlCfg.TCPTimeout = m.cfg.StreamTimeout
-	}
-	if m.cfg.RetransmitMult != 0 {
-		mlCfg.RetransmitMult = m.cfg.RetransmitMult
-	}
-	if m.cfg.PushPullInterval != 0 {
-		mlCfg.PushPullInterval = m.cfg.PushPullInterval
-	}
-	if m.cfg.GossipInterval != 0 {
-		mlCfg.GossipInterval = m.cfg.GossipInterval
-	}
-	if m.cfg.GossipNodes != 0 {
-		mlCfg.GossipNodes = m.cfg.GossipNodes
-	}
-	if m.cfg.GossipToTheDeadTime > 0 {
-		mlCfg.GossipToTheDeadTime = m.cfg.GossipToTheDeadTime
-	}
-	if m.cfg.DeadNodeReclaimTime > 0 {
-		mlCfg.DeadNodeReclaimTime = m.cfg.DeadNodeReclaimTime
-	}
+	mlCfg.TCPTimeout = m.cfg.StreamTimeout
+	mlCfg.RetransmitMult = m.cfg.RetransmitMult
+	mlCfg.PushPullInterval = m.cfg.PushPullInterval
+	mlCfg.GossipInterval = m.cfg.GossipInterval
+	mlCfg.GossipNodes = m.cfg.GossipNodes
+	mlCfg.GossipToTheDeadTime = m.cfg.GossipToTheDeadTime
+	mlCfg.DeadNodeReclaimTime = m.cfg.DeadNodeReclaimTime
+	mlCfg.EnableCompression = m.cfg.EnableCompression
+
 	if m.cfg.NodeName != "" {
 		mlCfg.Name = m.cfg.NodeName
 	}
@@ -615,24 +624,16 @@ func (m *KV) Get(key string, codec codec.Codec) (interface{}, error) {
 // Returns current value with removed tombstones.
 func (m *KV) get(key string, codec codec.Codec) (out interface{}, version uint, err error) {
 	m.storeMu.Lock()
-	v := m.store[key]
+	v := m.store[key].Clone()
 	m.storeMu.Unlock()
 
-	out = nil
 	if v.value != nil {
-		out, err = codec.Decode(v.value)
-		if err != nil {
-			return nil, 0, err
-		}
-
-		if mr, ok := out.(Mergeable); ok {
-			// remove ALL tombstones before returning to client.
-			// No need for clients to see them.
-			_, _ = mr.RemoveTombstones(time.Time{})
-		}
+		// remove ALL tombstones before returning to client.
+		// No need for clients to see them.
+		_, _ = v.value.RemoveTombstones(time.Time{})
 	}
 
-	return out, v.version, nil
+	return v.value, v.version, nil
 }
 
 // WatchKey watches for value changes for given key. When value changes, 'f' function is called with the
@@ -1046,9 +1047,21 @@ func (m *KV) LocalState(join bool) []byte {
 			continue
 		}
 
+		codec := m.GetCodec(val.codecID)
+		if codec == nil {
+			level.Error(m.logger).Log("msg", "failed to encode remote state: unknown codec for key", "codec", val.codecID, "key", key)
+			continue
+		}
+
+		encoded, err := codec.Encode(val.value)
+		if err != nil {
+			level.Error(m.logger).Log("msg", "failed to encode remote state", "err", err)
+			continue
+		}
+
 		kvPair.Reset()
 		kvPair.Key = key
-		kvPair.Value = val.value
+		kvPair.Value = encoded
 		kvPair.Codec = val.codecID
 
 		ser, err := kvPair.Marshal()
@@ -1058,7 +1071,7 @@ func (m *KV) LocalState(join bool) []byte {
 		}
 
 		if uint(len(ser)) > math.MaxUint32 {
-			level.Error(m.logger).Log("msg", "value too long", "key", key, "value_length", len(val.value))
+			level.Error(m.logger).Log("msg", "value too long", "key", key, "value_length", len(encoded))
 			continue
 		}
@@ -1180,12 +1193,12 @@ func (m *KV) mergeValueForKey(key string, incomingValue Mergeable, casVersion ui
 	m.storeMu.Lock()
 	defer m.storeMu.Unlock()
 
-	curr := m.store[key]
+	curr := m.store[key].Clone()
 	// if casVersion is 0, then there was no previous value, so we will just do normal merge, without localCAS flag set.
 	if casVersion > 0 && curr.version != casVersion {
 		return nil, 0, errVersionMismatch
 	}
-	result, change, err := computeNewValue(incomingValue, curr.value, codec, casVersion > 0)
+	result, change, err := computeNewValue(incomingValue, curr.value, casVersion > 0)
 	if err != nil {
 		return nil, 0, err
 	}
@@ -1202,14 +1215,9 @@ func (m *KV) mergeValueForKey(key string, incomingValue Mergeable, casVersion ui
 		m.storeRemovedTombstones.WithLabelValues(key).Add(float64(removed))
 	}
 
-	encoded, err := codec.Encode(result)
-	if err != nil {
-		return nil, 0, fmt.Errorf("failed to encode merged result: %v", err)
-	}
-
 	newVersion := curr.version + 1
 	m.store[key] = valueDesc{
-		value:   encoded,
+		value:   result,
 		version: newVersion,
 		codecID: codec.CodecID(),
 	}
@@ -1218,25 +1226,7 @@ func (m *KV) mergeValueForKey(key string, incomingValue Mergeable, casVersion ui
 }
 
 // returns [result, change, error]
-func computeNewValue(incoming Mergeable, stored []byte, c codec.Codec, cas bool) (Mergeable, Mergeable, error) {
-	if len(stored) == 0 {
-		return incoming, incoming, nil
-	}
-
-	old, err := c.Decode(stored)
-	if err != nil {
-		return incoming, incoming, fmt.Errorf("failed to decode stored value: %v", err)
-	}
-
-	if old == nil {
-		return incoming, incoming, nil
-	}
-
-	oldVal, ok := old.(Mergeable)
-	if !ok {
-		return incoming, incoming, fmt.Errorf("stored value is not Mergeable, got %T", old)
-	}
-
+func computeNewValue(incoming Mergeable, oldVal Mergeable, cas bool) (Mergeable, Mergeable, error) {
 	if oldVal == nil {
 		return incoming, incoming, nil
 	}
@@ -1252,7 +1242,7 @@ func (m *KV) storeCopy() map[string]valueDesc {
 	result := make(map[string]valueDesc, len(m.store))
 	for k, v := range m.store {
-		result[k] = v
+		result[k] = v.Clone()
 	}
 	return result
 }
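The memberlist KV change above stores each key's value as a decoded Mergeable rather than encoded bytes, which is why Clone() now appears on every path that hands a value out: a caller (for example, a CAS callback) mutates its copy in place, and without a deep copy those mutations would leak into the shared store before the CAS succeeds. A minimal sketch of this clone-on-read discipline, with a toy `counters` type standing in for a real Mergeable (all names here are illustrative, not part of this patch):

```go
package main

import "fmt"

// Mergeable is reduced to the one method this sketch needs.
type Mergeable interface {
	Clone() Mergeable
}

// counters is a toy Mergeable; Clone must deep-copy the map, since callers
// are allowed to mutate whatever the store hands back.
type counters map[string]int

func (c counters) Clone() Mergeable {
	out := make(counters, len(c))
	for k, v := range c {
		out[k] = v
	}
	return out
}

type store struct {
	values map[string]Mergeable
}

// get returns a deep copy, so in-place edits by the caller (e.g. a CAS
// callback) never become visible in the store unless explicitly written back.
func (s *store) get(key string) Mergeable {
	if v, ok := s.values[key]; ok {
		return v.Clone()
	}
	return nil
}

func main() {
	s := &store{values: map[string]Mergeable{"ring": counters{"tokens": 128}}}

	c := s.get("ring").(counters)
	c["tokens"] = 0 // caller mutates its copy...

	fmt.Println(s.values["ring"].(counters)["tokens"]) // ...store still prints 128
}
```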
"github.com/cortexproject/cortex/pkg/util/services" "github.com/cortexproject/cortex/pkg/util/test" ) @@ -88,6 +89,26 @@ func (d *data) RemoveTombstones(limit time.Time) (_, _ int) { return } +func (m member) clone() member { + out := member{ + Timestamp: m.Timestamp, + Tokens: make([]uint32, len(m.Tokens)), + State: m.State, + } + copy(out.Tokens, m.Tokens) + return out +} + +func (d *data) Clone() Mergeable { + out := &data{ + Members: make(map[string]member, len(d.Members)), + } + for k, v := range d.Members { + out.Members[k] = v.clone() + } + return out +} + func (d *data) getAllTokens() []uint32 { out := []uint32(nil) for _, m := range d.Members { @@ -207,12 +228,12 @@ func TestBasicGetAndCas(t *testing.T) { c := dataCodec{} name := "Ing 1" - cfg := KVConfig{ - TCPTransport: TCPTransportConfig{ - BindAddrs: []string{"localhost"}, - }, - Codecs: []codec.Codec{c}, + var cfg KVConfig + flagext.DefaultValues(&cfg) + cfg.TCPTransport = TCPTransportConfig{ + BindAddrs: []string{"localhost"}, } + cfg.Codecs = []codec.Codec{c} mkv := NewKV(cfg, log.NewNopLogger()) require.NoError(t, services.StartAndAwaitRunning(context.Background(), mkv)) @@ -265,10 +286,10 @@ func withFixtures(t *testing.T, testFN func(t *testing.T, kv *Client)) { c := dataCodec{} - cfg := KVConfig{ - TCPTransport: TCPTransportConfig{}, - Codecs: []codec.Codec{c}, - } + var cfg KVConfig + flagext.DefaultValues(&cfg) + cfg.TCPTransport = TCPTransportConfig{} + cfg.Codecs = []codec.Codec{c} mkv := NewKV(cfg, log.NewNopLogger()) require.NoError(t, services.StartAndAwaitRunning(context.Background(), mkv)) @@ -410,9 +431,9 @@ func TestCASFailedBecauseOfVersionChanges(t *testing.T) { func TestMultipleCAS(t *testing.T) { c := dataCodec{} - cfg := KVConfig{ - Codecs: []codec.Codec{c}, - } + var cfg KVConfig + flagext.DefaultValues(&cfg) + cfg.Codecs = []codec.Codec{c} mkv := NewKV(cfg, log.NewNopLogger()) mkv.maxCasRetries = 20 @@ -501,26 +522,21 @@ func TestMultipleClients(t *testing.T) { for i := 0; i < members; i++ { id := fmt.Sprintf("Member-%d", i) - cfg := KVConfig{ - NodeName: id, - - // some useful parameters when playing with higher number of members - // RetransmitMult: 2, - GossipInterval: 100 * time.Millisecond, - GossipNodes: 3, - PushPullInterval: 5 * time.Second, - // PacketDialTimeout: 5 * time.Second, - // StreamTimeout: 5 * time.Second, - // PacketWriteTimeout: 2 * time.Second, - - TCPTransport: TCPTransportConfig{ - BindAddrs: []string{"localhost"}, - BindPort: 0, // randomize ports - }, - - Codecs: []codec.Codec{c}, + var cfg KVConfig + flagext.DefaultValues(&cfg) + cfg.NodeName = id + + cfg.GossipInterval = 100 * time.Millisecond + cfg.GossipNodes = 3 + cfg.PushPullInterval = 5 * time.Second + + cfg.TCPTransport = TCPTransportConfig{ + BindAddrs: []string{"localhost"}, + BindPort: 0, // randomize ports } + cfg.Codecs = []codec.Codec{c} + mkv := NewKV(cfg, log.NewNopLogger()) require.NoError(t, services.StartAndAwaitRunning(context.Background(), mkv)) @@ -645,26 +661,26 @@ func TestJoinMembersWithRetryBackoff(t *testing.T) { for i, port := range ports { id := fmt.Sprintf("Member-%d", i) - cfg := KVConfig{ - NodeName: id, + var cfg KVConfig + flagext.DefaultValues(&cfg) + cfg.NodeName = id - GossipInterval: 100 * time.Millisecond, - GossipNodes: 3, - PushPullInterval: 5 * time.Second, + cfg.GossipInterval = 100 * time.Millisecond + cfg.GossipNodes = 3 + cfg.PushPullInterval = 5 * time.Second - MinJoinBackoff: 100 * time.Millisecond, - MaxJoinBackoff: 1 * time.Minute, - 
MaxJoinRetries: 10, - AbortIfJoinFails: true, + cfg.MinJoinBackoff = 100 * time.Millisecond + cfg.MaxJoinBackoff = 1 * time.Minute + cfg.MaxJoinRetries = 10 + cfg.AbortIfJoinFails = true - TCPTransport: TCPTransportConfig{ - BindAddrs: []string{"localhost"}, - BindPort: port, - }, - - Codecs: []codec.Codec{c}, + cfg.TCPTransport = TCPTransportConfig{ + BindAddrs: []string{"localhost"}, + BindPort: port, } + cfg.Codecs = []codec.Codec{c} + if i == 0 { // Add members to first KV config to join immediately on initialization. // This will enforce backoff as each next members listener is not open yet. @@ -732,21 +748,21 @@ func TestMemberlistFailsToJoin(t *testing.T) { ports, err := getFreePorts(1) require.NoError(t, err) - cfg := KVConfig{ - MinJoinBackoff: 100 * time.Millisecond, - MaxJoinBackoff: 100 * time.Millisecond, - MaxJoinRetries: 2, - AbortIfJoinFails: true, + var cfg KVConfig + flagext.DefaultValues(&cfg) + cfg.MinJoinBackoff = 100 * time.Millisecond + cfg.MaxJoinBackoff = 100 * time.Millisecond + cfg.MaxJoinRetries = 2 + cfg.AbortIfJoinFails = true - TCPTransport: TCPTransportConfig{ - BindAddrs: []string{"localhost"}, - BindPort: 0, - }, + cfg.TCPTransport = TCPTransportConfig{ + BindAddrs: []string{"localhost"}, + BindPort: 0, + } - JoinMembers: []string{fmt.Sprintf("127.0.0.1:%d", ports[0])}, + cfg.JoinMembers = []string{fmt.Sprintf("127.0.0.1:%d", ports[0])} - Codecs: []codec.Codec{c}, - } + cfg.Codecs = []codec.Codec{c} mkv := NewKV(cfg, log.NewNopLogger()) require.NoError(t, services.StartAndAwaitRunning(context.Background(), mkv)) @@ -876,6 +892,14 @@ func (dc distributedCounter) RemoveTombstones(limit time.Time) (_, _ int) { return } +func (dc distributedCounter) Clone() Mergeable { + out := make(distributedCounter, len(dc)) + for k, v := range dc { + out[k] = v + } + return out +} + type distributedCounterCodec struct{} func (d distributedCounterCodec) CodecID() string { @@ -899,16 +923,16 @@ func (d distributedCounterCodec) Encode(val interface{}) ([]byte, error) { var _ codec.Codec = &distributedCounterCodec{} func TestMultipleCodecs(t *testing.T) { - cfg := KVConfig{ - TCPTransport: TCPTransportConfig{ - BindAddrs: []string{"localhost"}, - BindPort: 0, // randomize - }, + var cfg KVConfig + flagext.DefaultValues(&cfg) + cfg.TCPTransport = TCPTransportConfig{ + BindAddrs: []string{"localhost"}, + BindPort: 0, // randomize + } - Codecs: []codec.Codec{ - dataCodec{}, - distributedCounterCodec{}, - }, + cfg.Codecs = []codec.Codec{ + dataCodec{}, + distributedCounterCodec{}, } mkv1 := NewKV(cfg, log.NewNopLogger()) @@ -990,17 +1014,17 @@ func TestRejoin(t *testing.T) { ports, err := getFreePorts(2) require.NoError(t, err) - cfg1 := KVConfig{ - TCPTransport: TCPTransportConfig{ - BindAddrs: []string{"localhost"}, - BindPort: ports[0], - }, - - RandomizeNodeName: true, - Codecs: []codec.Codec{dataCodec{}}, - AbortIfJoinFails: false, + var cfg1 KVConfig + flagext.DefaultValues(&cfg1) + cfg1.TCPTransport = TCPTransportConfig{ + BindAddrs: []string{"localhost"}, + BindPort: ports[0], } + cfg1.RandomizeNodeName = true + cfg1.Codecs = []codec.Codec{dataCodec{}} + cfg1.AbortIfJoinFails = false + cfg2 := cfg1 cfg2.TCPTransport.BindPort = ports[1] cfg2.JoinMembers = []string{fmt.Sprintf("localhost:%d", ports[0])} diff --git a/pkg/ring/kv/memberlist/mergeable.go b/pkg/ring/kv/memberlist/mergeable.go index 0cb9964f1f..a013e34988 100644 --- a/pkg/ring/kv/memberlist/mergeable.go +++ b/pkg/ring/kv/memberlist/mergeable.go @@ -40,4 +40,7 @@ type Mergeable interface { // time when client is 
accessing value from the store. It can be used to hide tombstones from the clients. // Returns the total number of tombstones present and the number of removed tombstones by this invocation. RemoveTombstones(limit time.Time) (total, removed int) + + // Clone should return a deep copy of the state. + Clone() Mergeable } diff --git a/pkg/ring/kv/memberlist/metrics.go b/pkg/ring/kv/memberlist/metrics.go index ff729723da..e6e7d9bb4d 100644 --- a/pkg/ring/kv/memberlist/metrics.go +++ b/pkg/ring/kv/memberlist/metrics.go @@ -117,11 +117,6 @@ func (m *KV) createAndRegisterMetrics() { "Number of values in KV Store", nil, nil) - m.storeSizesDesc = prometheus.NewDesc( - prometheus.BuildFQName(m.cfg.MetricsNamespace, subsystem, "kv_store_value_bytes"), // gauge - "Sizes of values in KV Store in bytes", - []string{"key"}, nil) - m.storeTombstones = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: m.cfg.MetricsNamespace, Subsystem: subsystem, @@ -222,7 +217,6 @@ func (m *KV) createAndRegisterMetrics() { // Describe returns prometheus descriptions via supplied channel func (m *KV) Describe(ch chan<- *prometheus.Desc) { ch <- m.storeValuesDesc - ch <- m.storeSizesDesc } // Collect returns extra metrics via supplied channel @@ -231,8 +225,4 @@ func (m *KV) Collect(ch chan<- prometheus.Metric) { defer m.storeMu.Unlock() ch <- prometheus.MustNewConstMetric(m.storeValuesDesc, prometheus.GaugeValue, float64(len(m.store))) - - for k, v := range m.store { - ch <- prometheus.MustNewConstMetric(m.storeSizesDesc, prometheus.GaugeValue, float64(len(v.value)), k) - } } diff --git a/pkg/ring/lifecycler.go b/pkg/ring/lifecycler.go index 73584ea0a2..37897b70ba 100644 --- a/pkg/ring/lifecycler.go +++ b/pkg/ring/lifecycler.go @@ -17,6 +17,7 @@ import ( "go.uber.org/atomic" "github.com/cortexproject/cortex/pkg/ring/kv" + "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/flagext" "github.com/cortexproject/cortex/pkg/util/log" "github.com/cortexproject/cortex/pkg/util/services" @@ -83,7 +84,7 @@ func (cfg *LifecyclerConfig) RegisterFlagsWithPrefix(prefix string, f *flag.Flag } f.IntVar(&cfg.NumTokens, prefix+"num-tokens", 128, "Number of tokens for each ingester.") - f.DurationVar(&cfg.HeartbeatPeriod, prefix+"heartbeat-period", 5*time.Second, "Period at which to heartbeat to consul.") + f.DurationVar(&cfg.HeartbeatPeriod, prefix+"heartbeat-period", 5*time.Second, "Period at which to heartbeat to consul. 0 = disabled.") f.DurationVar(&cfg.JoinAfter, prefix+"join-after", 0*time.Second, "Period to wait for a claim from another member; will join automatically after this.") f.DurationVar(&cfg.ObservePeriod, prefix+"observe-period", 0*time.Second, "Observe tokens after generating to resolve collisions. Useful when using gossiping ring.") f.DurationVar(&cfg.MinReadyDuration, prefix+"min-ready-duration", 1*time.Minute, "Minimum duration to wait before becoming ready. 
This is to work around race conditions with ingesters exiting and updating the ring.") @@ -392,8 +393,8 @@ func (i *Lifecycler) loop(ctx context.Context) error { autoJoinAfter := time.After(i.cfg.JoinAfter) var observeChan <-chan time.Time = nil - heartbeatTicker := time.NewTicker(i.cfg.HeartbeatPeriod) - defer heartbeatTicker.Stop() + heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(i.cfg.HeartbeatPeriod) + defer heartbeatTickerStop() for { select { @@ -442,7 +443,7 @@ func (i *Lifecycler) loop(ctx context.Context) error { observeChan = time.After(i.cfg.ObservePeriod) } - case <-heartbeatTicker.C: + case <-heartbeatTickerChan: consulHeartbeats.WithLabelValues(i.RingName).Inc() if err := i.updateConsul(context.Background()); err != nil { level.Error(log.Logger).Log("msg", "failed to write to the KV store, sleeping", "ring", i.RingName, "err", err) @@ -469,8 +470,8 @@ func (i *Lifecycler) stopping(runningError error) error { return nil } - heartbeatTicker := time.NewTicker(i.cfg.HeartbeatPeriod) - defer heartbeatTicker.Stop() + heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(i.cfg.HeartbeatPeriod) + defer heartbeatTickerStop() // Mark ourselved as Leaving so no more samples are send to us. err := i.changeState(context.Background(), LEAVING) @@ -489,7 +490,7 @@ func (i *Lifecycler) stopping(runningError error) error { heartbeatLoop: for { select { - case <-heartbeatTicker.C: + case <-heartbeatTickerChan: consulHeartbeats.WithLabelValues(i.RingName).Inc() if err := i.updateConsul(context.Background()); err != nil { level.Error(log.Logger).Log("msg", "failed to write to the KV store, sleeping", "ring", i.RingName, "err", err) diff --git a/pkg/ring/model.go b/pkg/ring/model.go index c10281080e..3a257c3952 100644 --- a/pkg/ring/model.go +++ b/pkg/ring/model.go @@ -101,7 +101,7 @@ func (d *Desc) FindIngestersByState(state InstanceState) []InstanceDesc { func (d *Desc) Ready(now time.Time, heartbeatTimeout time.Duration) error { numTokens := 0 for id, ingester := range d.Ingesters { - if now.Sub(time.Unix(ingester.Timestamp, 0)) > heartbeatTimeout { + if !ingester.IsHeartbeatHealthy(heartbeatTimeout, now) { return fmt.Errorf("instance %s past heartbeat timeout", id) } else if ingester.State != ACTIVE { return fmt.Errorf("instance %s in state %v", id, ingester.State) @@ -136,7 +136,16 @@ func (i *InstanceDesc) GetRegisteredAt() time.Time { func (i *InstanceDesc) IsHealthy(op Operation, heartbeatTimeout time.Duration, now time.Time) bool { healthy := op.IsInstanceInStateHealthy(i.State) - return healthy && now.Unix()-i.Timestamp <= heartbeatTimeout.Milliseconds()/1000 + return healthy && i.IsHeartbeatHealthy(heartbeatTimeout, now) +} + +// IsHeartbeatHealthy returns whether the heartbeat timestamp for the ingester is within the +// specified timeout period. A timeout of zero disables the timeout; the heartbeat is ignored. +func (i *InstanceDesc) IsHeartbeatHealthy(heartbeatTimeout time.Duration, now time.Time) bool { + if heartbeatTimeout == 0 { + return true + } + return now.Sub(time.Unix(i.Timestamp, 0)) <= heartbeatTimeout } // Merge merges other ring into this one. Returns sub-ring that represents the change, @@ -389,6 +398,11 @@ func (d *Desc) RemoveTombstones(limit time.Time) (total, removed int) { return } +// Clone returns a deep copy of the ring state. 
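The lifecycler hunks above swap the unconditional `time.NewTicker` for `util.NewDisableableTicker`, so a zero heartbeat period yields a channel that never fires rather than the panic `time.NewTicker(0)` would raise. A minimal sketch of the resulting loop shape; the `run` function and `period` value are illustrative, and the helper body is copied from the pkg/util/time.go hunk further down in this diff:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// NewDisableableTicker mirrors the helper added in pkg/util/time.go: a zero
// interval returns a nil channel, which blocks forever in a select.
func NewDisableableTicker(interval time.Duration) (func(), <-chan time.Time) {
	if interval == 0 {
		return func() {}, nil
	}
	t := time.NewTicker(interval)
	return func() { t.Stop() }, t.C
}

func run(ctx context.Context, period time.Duration) {
	stop, tick := NewDisableableTicker(period)
	defer stop()

	for {
		select {
		case <-tick: // never fires when period == 0
			fmt.Println("heartbeat")
		case <-ctx.Done():
			return
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond)
	defer cancel()
	run(ctx, 100*time.Millisecond) // pass 0 instead to disable heartbeats entirely
}
```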
diff --git a/pkg/ring/model.go b/pkg/ring/model.go
index c10281080e..3a257c3952 100644
--- a/pkg/ring/model.go
+++ b/pkg/ring/model.go
@@ -101,7 +101,7 @@ func (d *Desc) FindIngestersByState(state InstanceState) []InstanceDesc {
 func (d *Desc) Ready(now time.Time, heartbeatTimeout time.Duration) error {
 	numTokens := 0
 	for id, ingester := range d.Ingesters {
-		if now.Sub(time.Unix(ingester.Timestamp, 0)) > heartbeatTimeout {
+		if !ingester.IsHeartbeatHealthy(heartbeatTimeout, now) {
 			return fmt.Errorf("instance %s past heartbeat timeout", id)
 		} else if ingester.State != ACTIVE {
 			return fmt.Errorf("instance %s in state %v", id, ingester.State)
@@ -136,7 +136,16 @@ func (i *InstanceDesc) GetRegisteredAt() time.Time {
 
 func (i *InstanceDesc) IsHealthy(op Operation, heartbeatTimeout time.Duration, now time.Time) bool {
 	healthy := op.IsInstanceInStateHealthy(i.State)
 
-	return healthy && now.Unix()-i.Timestamp <= heartbeatTimeout.Milliseconds()/1000
+	return healthy && i.IsHeartbeatHealthy(heartbeatTimeout, now)
+}
+
+// IsHeartbeatHealthy returns whether the heartbeat timestamp for the ingester is within the
+// specified timeout period. A timeout of zero disables the check; the heartbeat timestamp is ignored.
+func (i *InstanceDesc) IsHeartbeatHealthy(heartbeatTimeout time.Duration, now time.Time) bool {
+	if heartbeatTimeout == 0 {
+		return true
+	}
+	return now.Sub(time.Unix(i.Timestamp, 0)) <= heartbeatTimeout
 }
 
 // Merge merges other ring into this one. Returns sub-ring that represents the change,
@@ -389,6 +398,11 @@ func (d *Desc) RemoveTombstones(limit time.Time) (total, removed int) {
 	return
 }
 
+// Clone returns a deep copy of the ring state.
+func (d *Desc) Clone() memberlist.Mergeable {
+	return proto.Clone(d).(*Desc)
+}
+
 func (d *Desc) getTokensInfo() map[uint32]instanceInfo {
 	out := map[uint32]instanceInfo{}
 
diff --git a/pkg/ring/model_test.go b/pkg/ring/model_test.go
index 9570abf17d..1d73e6f98b 100644
--- a/pkg/ring/model_test.go
+++ b/pkg/ring/model_test.go
@@ -136,10 +136,18 @@ func TestDesc_Ready(t *testing.T) {
 		t.Fatal("expected ready, got", err)
 	}
 
+	if err := r.Ready(now, 0); err != nil {
+		t.Fatal("expected ready, got", err)
+	}
+
 	if err := r.Ready(now.Add(5*time.Minute), 10*time.Second); err == nil {
 		t.Fatal("expected !ready (no heartbeat from active ingester), but got no error")
 	}
 
+	if err := r.Ready(now.Add(5*time.Minute), 0); err != nil {
+		t.Fatal("expected ready (no heartbeat but timeout disabled), got", err)
+	}
+
 	r = &Desc{
 		Ingesters: map[string]InstanceDesc{
 			"ing1": {
diff --git a/pkg/ring/ring.go b/pkg/ring/ring.go
index 539ec65e75..72165ed369 100644
--- a/pkg/ring/ring.go
+++ b/pkg/ring/ring.go
@@ -71,6 +71,10 @@ type ReadRing interface {
 	// and size (number of instances).
 	ShuffleShard(identifier string, size int) ReadRing
 
+	// GetInstanceState returns the current state of an instance, or an error if the
+	// instance does not exist in the ring.
+	GetInstanceState(instanceID string) (InstanceState, error)
+
 	// ShuffleShardWithLookback is like ShuffleShard() but the returned subring includes
 	// all instances that have been part of the identifier's shard since "now - lookbackPeriod".
 	ShuffleShardWithLookback(identifier string, size int, lookbackPeriod time.Duration, now time.Time) ReadRing
@@ -143,7 +147,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
 	cfg.KVStore.RegisterFlagsWithPrefix(prefix, "collectors/", f)
 
-	f.DurationVar(&cfg.HeartbeatTimeout, prefix+"ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which ingesters are skipped for reads/writes.")
+	f.DurationVar(&cfg.HeartbeatTimeout, prefix+"ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which ingesters are skipped for reads/writes. 0 = never (timeout disabled).")
 	f.IntVar(&cfg.ReplicationFactor, prefix+"distributor.replication-factor", 3, "The number of ingesters to write to and read from.")
 	f.BoolVar(&cfg.ZoneAwarenessEnabled, prefix+"distributor.zone-awareness-enabled", false, "True to enable the zone-awareness and replicate ingested samples across different availability zones.")
 }
diff --git a/pkg/ring/ring_test.go b/pkg/ring/ring_test.go
index f1d038d291..c4898e939c 100644
--- a/pkg/ring/ring_test.go
+++ b/pkg/ring/ring_test.go
@@ -390,11 +390,11 @@ func TestRing_GetAllHealthy(t *testing.T) {
 }
 
 func TestRing_GetReplicationSetForOperation(t *testing.T) {
-	const heartbeatTimeout = time.Minute
 	now := time.Now()
 
 	tests := map[string]struct {
 		ringInstances           map[string]InstanceDesc
+		ringHeartbeatTimeout    time.Duration
 		ringReplicationFactor   int
 		expectedErrForRead      error
 		expectedSetForRead      []string
@@ -405,6 +405,7 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 	}{
 		"should return error on empty ring": {
 			ringInstances:           nil,
+			ringHeartbeatTimeout:    time.Minute,
 			ringReplicationFactor:   1,
 			expectedErrForRead:      ErrEmptyRing,
 			expectedErrForWrite:     ErrEmptyRing,
@@ -418,6 +419,21 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)},
 				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-40 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)},
 			},
+			ringHeartbeatTimeout:    time.Minute,
+			ringReplicationFactor:   1,
+			expectedSetForRead:      []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
+			expectedSetForWrite:     []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
+			expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
+		},
+		"should succeed on instances with old timestamps but heartbeat timeout disabled": {
+			ringInstances: map[string]InstanceDesc{
+				"instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
+				"instance-2": {Addr: "127.0.0.2", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
+				"instance-3": {Addr: "127.0.0.3", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
+				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
+				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
+			},
+			ringHeartbeatTimeout:    0,
 			ringReplicationFactor:   1,
 			expectedSetForRead:      []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
 			expectedSetForWrite:     []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
 			expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
@@ -431,6 +447,7 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)},
 				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
 			},
+			ringHeartbeatTimeout:  time.Minute,
 			ringReplicationFactor: 1,
 			expectedErrForRead:    ErrTooManyUnhealthyInstances,
 			expectedErrForWrite:   ErrTooManyUnhealthyInstances,
@@ -444,6 +461,7 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)},
 				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
 			},
+			ringHeartbeatTimeout:  time.Minute,
 			ringReplicationFactor: 3,
 			expectedSetForRead:    []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4"},
 			expectedSetForWrite:   []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4"},
@@ -457,6 +475,7 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
 				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
 			},
+			ringHeartbeatTimeout:  time.Minute,
 			ringReplicationFactor: 3,
 			expectedErrForRead:    ErrTooManyUnhealthyInstances,
 			expectedErrForWrite:   ErrTooManyUnhealthyInstances,
@@ -474,7 +493,7 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 
 			ring := Ring{
 				cfg: Config{
-					HeartbeatTimeout:  heartbeatTimeout,
+					HeartbeatTimeout:  testData.ringHeartbeatTimeout,
 					ReplicationFactor: testData.ringReplicationFactor,
 				},
 				ringDesc: ringDesc,
diff --git a/pkg/ring/util.go b/pkg/ring/util.go
index ac5c27388c..06e053dd26 100644
--- a/pkg/ring/util.go
+++ b/pkg/ring/util.go
@@ -69,7 +69,7 @@ func GetInstancePort(configPort, listenPort int) int {
 
 // WaitInstanceState waits until the input instanceID is registered within the
 // ring matching the provided state. A timeout should be provided within the context.
-func WaitInstanceState(ctx context.Context, r *Ring, instanceID string, state InstanceState) error {
+func WaitInstanceState(ctx context.Context, r ReadRing, instanceID string, state InstanceState) error {
 	backoff := util.NewBackoff(ctx, util.BackoffConfig{
 		MinBackoff: 100 * time.Millisecond,
 		MaxBackoff: time.Second,
diff --git a/pkg/ring/util_test.go b/pkg/ring/util_test.go
index c3a3e037e9..9a4a69c690 100644
--- a/pkg/ring/util_test.go
+++ b/pkg/ring/util_test.go
@@ -6,10 +6,65 @@ import (
 	"testing"
 	"time"
 
+	"github.com/pkg/errors"
+	"github.com/prometheus/client_golang/prometheus"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
 )
 
+type RingMock struct {
+	mock.Mock
+}
+
+func (r *RingMock) Collect(ch chan<- prometheus.Metric) {}
+
+func (r *RingMock) Describe(ch chan<- *prometheus.Desc) {}
+
+func (r *RingMock) Get(key uint32, op Operation, bufDescs []InstanceDesc, bufHosts, bufZones []string) (ReplicationSet, error) {
+	args := r.Called(key, op, bufDescs, bufHosts, bufZones)
+	return args.Get(0).(ReplicationSet), args.Error(1)
+}
+
+func (r *RingMock) GetAllHealthy(op Operation) (ReplicationSet, error) {
+	args := r.Called(op)
+	return args.Get(0).(ReplicationSet), args.Error(1)
+}
+
+func (r *RingMock) GetReplicationSetForOperation(op Operation) (ReplicationSet, error) {
+	args := r.Called(op)
+	return args.Get(0).(ReplicationSet), args.Error(1)
+}
+
+func (r *RingMock) ReplicationFactor() int {
+	return 0
+}
+
+func (r *RingMock) InstancesCount() int {
+	return 0
+}
+
+func (r *RingMock) ShuffleShard(identifier string, size int) ReadRing {
+	args := r.Called(identifier, size)
+	return args.Get(0).(ReadRing)
+}
+
+func (r *RingMock) GetInstanceState(instanceID string) (InstanceState, error) {
+	args := r.Called(instanceID)
+	return args.Get(0).(InstanceState), args.Error(1)
+}
+
+func (r *RingMock) ShuffleShardWithLookback(identifier string, size int, lookbackPeriod time.Duration, now time.Time) ReadRing {
+	args := r.Called(identifier, size, lookbackPeriod, now)
+	return args.Get(0).(ReadRing)
+}
+
+func (r *RingMock) HasInstance(instanceID string) bool {
+	return true
+}
+
+func (r *RingMock) CleanupShuffleShardCache(identifier string) {}
+
 func TestGenerateTokens(t *testing.T) {
 	tokens := GenerateTokens(1000000, nil)
 
@@ -184,3 +239,63 @@ func TestWaitRingStabilityShouldReturnErrorIfMaxWaitingIsReached(t *testing.T) {
 
 	assert.InDelta(t, maxWaiting, elapsedTime, float64(2*time.Second))
 }
+
+func TestWaitInstanceStateTimeout(t *testing.T) {
+	t.Parallel()
+
+	const (
+		instanceID      = "test"
+		timeoutDuration = time.Second
+	)
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeoutDuration)
+	defer cancel()
+
+	ring := &RingMock{}
+	ring.On("GetInstanceState", mock.Anything, mock.Anything).Return(ACTIVE, nil)
+
+	err := WaitInstanceState(ctx, ring, instanceID, PENDING)
+
+	assert.Equal(t, context.DeadlineExceeded, err)
+	ring.AssertCalled(t, "GetInstanceState", instanceID)
+}
+
+func TestWaitInstanceStateTimeoutOnError(t *testing.T) {
+	t.Parallel()
+
+	const (
+		instanceID      = "test"
+		timeoutDuration = time.Second
+	)
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeoutDuration)
+	defer cancel()
+
+	ring := &RingMock{}
+	ring.On("GetInstanceState", mock.Anything, mock.Anything).Return(PENDING, errors.New("instance not found in the ring"))
+
+	err := WaitInstanceState(ctx, ring, instanceID, ACTIVE)
+
+	assert.Equal(t, context.DeadlineExceeded, err)
+	ring.AssertCalled(t, "GetInstanceState", instanceID)
+}
+
+func TestWaitInstanceStateExitsAfterActualStateEqualsState(t *testing.T) {
+	t.Parallel()
+
+	const (
+		instanceID      = "test"
+		timeoutDuration = time.Second
+	)
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeoutDuration)
+	defer cancel()
+
+	ring := &RingMock{}
+	ring.On("GetInstanceState", mock.Anything, mock.Anything).Return(ACTIVE, nil)
+
+	err := WaitInstanceState(ctx, ring, instanceID, ACTIVE)
+
+	assert.Nil(t, err)
+	ring.AssertNumberOfCalls(t, "GetInstanceState", 1)
+}
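WaitInstanceState needs only the narrow slice of ReadRing mocked above: it repeatedly asks GetInstanceState under a capped backoff until the state matches or the context expires. A condensed sketch of that polling shape; `waitInstanceState`, `stateReader`, and `fakeRing` are illustrative stand-ins, not the production implementation:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

type InstanceState int

const (
	PENDING InstanceState = iota
	ACTIVE
)

// stateReader is the narrow slice of ReadRing that the wait helper needs.
type stateReader interface {
	GetInstanceState(instanceID string) (InstanceState, error)
}

// waitInstanceState polls r until instanceID reports the wanted state,
// sleeping between attempts, and gives up when the context is done.
func waitInstanceState(ctx context.Context, r stateReader, instanceID string, state InstanceState) error {
	for backoff := 100 * time.Millisecond; ; {
		if s, err := r.GetInstanceState(instanceID); err == nil && s == state {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(backoff):
		}
		if backoff *= 2; backoff > time.Second {
			backoff = time.Second // cap, mirroring the util.BackoffConfig above
		}
	}
}

type fakeRing struct{ state InstanceState }

func (f *fakeRing) GetInstanceState(string) (InstanceState, error) {
	return f.state, nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	// Succeeds immediately because the fake already reports ACTIVE.
	fmt.Println(waitInstanceState(ctx, &fakeRing{state: ACTIVE}, "compactor-1", ACTIVE))
}
```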
diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go
index 3c02695a9c..c95cfcd944 100644
--- a/pkg/ruler/compat.go
+++ b/pkg/ruler/compat.go
@@ -6,6 +6,7 @@ import (
 	"time"
 
 	"github.com/go-kit/kit/log"
+	"github.com/go-kit/kit/log/level"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 	"github.com/prometheus/prometheus/notifier"
@@ -20,6 +21,7 @@ import (
 
 	"github.com/cortexproject/cortex/pkg/cortexpb"
 	"github.com/cortexproject/cortex/pkg/querier"
+	util_log "github.com/cortexproject/cortex/pkg/util/log"
 )
 
 // Pusher is an ingester server that accepts pushes.
@@ -146,9 +148,7 @@ func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides Rules
 func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc {
 	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
 		queries.Inc()
-
 		result, err := qf(ctx, qs, t)
-
 		// We rely on TranslateToPromqlApiError to do its job here... it returns nil if err is nil.
 		// It returns promql.ErrStorage if the error should be reported back as 500.
 		// Other errors it returns are either for canceled or timed-out queries (we're not reporting those as failures),
@@ -163,6 +163,33 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun
 	}
 }
 
+func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc {
+	if queryTime == nil {
+		return qf
+	}
+
+	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
+		// If we've been passed a counter, we want to record the wall time spent executing this request.
+		timer := prometheus.NewTimer(nil)
+		defer func() {
+			querySeconds := timer.ObserveDuration().Seconds()
+			queryTime.Add(querySeconds)
+
+			// Log ruler query stats.
+			logMessage := []interface{}{
+				"msg", "query stats",
+				"component", "ruler",
+				"cortex_ruler_query_seconds_total", querySeconds,
+				"query", qs,
+			}
+			level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...)
+		}()
+
+		result, err := qf(ctx, qs, t)
+		return result, err
+	}
+}
+
 // This interface mimics the rules.Manager API. The interface is used to simplify tests.
 type RulesManager interface {
 	// Starts rules manager. Blocks until Stop is called.
@@ -199,12 +226,24 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
 		Name: "cortex_ruler_queries_failed_total",
 		Help: "Number of failed queries by ruler.",
 	})
+	var rulerQuerySeconds *prometheus.CounterVec
+	if cfg.EnableQueryStats {
+		rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
+			Name: "cortex_ruler_query_seconds_total",
+			Help: "Total amount of wall clock time spent processing queries by the ruler.",
+		}, []string{"user"})
+	}
 
 	return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager {
+		var queryTime prometheus.Counter = nil
+		if rulerQuerySeconds != nil {
+			queryTime = rulerQuerySeconds.WithLabelValues(userID)
+		}
+
 		return rules.NewManager(&rules.ManagerOptions{
 			Appendable:      NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites),
 			Queryable:       q,
-			QueryFunc:       MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries),
+			QueryFunc:       RecordAndReportRuleQueryMetrics(MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries), queryTime, logger),
 			Context:         user.InjectOrgID(ctx, userID),
 			ExternalURL:     cfg.ExternalURL.URL,
 			NotifyFunc:      SendAlerts(notifier, cfg.ExternalURL.URL.String()),
diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go
index 1c5a9fe17b..dfcb251803 100644
--- a/pkg/ruler/compat_test.go
+++ b/pkg/ruler/compat_test.go
@@ -8,6 +8,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/go-kit/kit/log"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/prometheus/common/model"
@@ -230,7 +231,6 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
 			mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
 				return promql.Vector{}, tc.returnedError
 			}
-
 			qf := MetricsQueryFunc(mockFunc, queries, failures)
 
 			_, err := qf(context.Background(), "test", time.Now())
@@ -241,3 +241,16 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
 		})
 	}
 }
+
+func TestRecordAndReportRuleQueryMetrics(t *testing.T) {
+	queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"})
+
+	mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
+		time.Sleep(1 * time.Second)
+		return promql.Vector{}, nil
+	}
+	qf := RecordAndReportRuleQueryMetrics(mockFunc, queryTime.WithLabelValues("userID"), log.NewNopLogger())
+	_, _ = qf(context.Background(), "test", time.Now())
+
+	require.GreaterOrEqual(t, testutil.ToFloat64(queryTime.WithLabelValues("userID")), float64(1))
+}
diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go
index 25061a8d35..57b51ed103 100644
--- a/pkg/ruler/ruler.go
+++ b/pkg/ruler/ruler.go
@@ -115,6 +115,8 @@ type Config struct {
 	DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`
 
 	RingCheckPeriod time.Duration `yaml:"-"`
+
+	EnableQueryStats bool `yaml:"query_stats_enabled"`
 }
 
 // Validate config and returns error on failure
@@ -173,6 +175,8 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 	f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.")
 	f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.")
 
+	f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per-user metric and as an info-level log message.")
+
 	cfg.RingCheckPeriod = 5 * time.Second
 }
diff --git a/pkg/ruler/ruler_ring.go b/pkg/ruler/ruler_ring.go
index 20d2e900ab..e8577e8289 100644
--- a/pkg/ruler/ruler_ring.go
+++ b/pkg/ruler/ruler_ring.go
@@ -56,8 +56,8 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
 
 	// Ring flags
 	cfg.KVStore.RegisterFlagsWithPrefix("ruler.ring.", "rulers/", f)
-	f.DurationVar(&cfg.HeartbeatPeriod, "ruler.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.")
-	f.DurationVar(&cfg.HeartbeatTimeout, "ruler.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which rulers are considered unhealthy within the ring.")
+	f.DurationVar(&cfg.HeartbeatPeriod, "ruler.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
+	f.DurationVar(&cfg.HeartbeatTimeout, "ruler.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which rulers are considered unhealthy within the ring. 0 = never (timeout disabled).")
 
 	// Instance flags
 	cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go
index a6e89918b6..503474afd8 100644
--- a/pkg/ruler/ruler_test.go
+++ b/pkg/ruler/ruler_test.go
@@ -60,6 +60,7 @@ func defaultRulerConfig(store rulestore.RuleStore) (Config, func()) {
 	cfg.Ring.ListenPort = 0
 	cfg.Ring.InstanceAddr = "localhost"
 	cfg.Ring.InstanceID = "localhost"
+	cfg.EnableQueryStats = false
 
 	// Create a cleanup function that will be called at the end of the test
 	cleanup := func() {
diff --git a/pkg/storegateway/gateway_ring.go b/pkg/storegateway/gateway_ring.go
index 4ca5ed08a7..c868c2b789 100644
--- a/pkg/storegateway/gateway_ring.go
+++ b/pkg/storegateway/gateway_ring.go
@@ -94,8 +94,8 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
 
 	// Ring flags
 	cfg.KVStore.RegisterFlagsWithPrefix(ringFlagsPrefix, "collectors/", f)
-	f.DurationVar(&cfg.HeartbeatPeriod, ringFlagsPrefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring.")
-	f.DurationVar(&cfg.HeartbeatTimeout, ringFlagsPrefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which store gateways are considered unhealthy within the ring."+sharedOptionWithQuerier)
+	f.DurationVar(&cfg.HeartbeatPeriod, ringFlagsPrefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
+	f.DurationVar(&cfg.HeartbeatTimeout, ringFlagsPrefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which store gateways are considered unhealthy within the ring. 0 = never (timeout disabled)."+sharedOptionWithQuerier)
 	f.IntVar(&cfg.ReplicationFactor, ringFlagsPrefix+"replication-factor", 3, "The replication factor to use when sharding blocks."+sharedOptionWithQuerier)
 	f.StringVar(&cfg.TokensFilePath, ringFlagsPrefix+"tokens-file-path", "", "File path where tokens are stored. If empty, tokens are not stored at shutdown and restored at startup.")
 	f.BoolVar(&cfg.ZoneAwarenessEnabled, ringFlagsPrefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate blocks across different availability zones.")
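All of these heartbeat flags lean on the same small helper, added next in pkg/util/time.go. It exploits the Go rule that a receive from a nil channel blocks forever, so handing back a nil channel is enough to make the heartbeat case of a select never fire. A tiny self-contained illustration of that rule:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	var tick <-chan time.Time // nil: a receive from it blocks forever

	select {
	case <-tick:
		fmt.Println("tick") // never happens
	case <-time.After(100 * time.Millisecond):
		fmt.Println("no tick: a nil channel never fires")
	}
}
```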
diff --git a/pkg/util/time.go b/pkg/util/time.go
index 58f951a8b3..8816b1d7d2 100644
--- a/pkg/util/time.go
+++ b/pkg/util/time.go
@@ -73,3 +73,14 @@ func DurationWithPositiveJitter(input time.Duration, variancePerc float64) time.
 
 	return input + time.Duration(jitter)
 }
+
+// NewDisableableTicker wraps time.NewTicker but allows the ticker to be disabled by passing
+// a zero duration as the interval. Returns a function for stopping the ticker, and the ticker channel.
+func NewDisableableTicker(interval time.Duration) (func(), <-chan time.Time) {
+	if interval == 0 {
+		return func() {}, nil
+	}
+
+	tick := time.NewTicker(interval)
+	return func() { tick.Stop() }, tick.C
+}
diff --git a/pkg/util/time_test.go b/pkg/util/time_test.go
index 700fe55fab..ab1da4a85b 100644
--- a/pkg/util/time_test.go
+++ b/pkg/util/time_test.go
@@ -103,3 +103,31 @@ func TestParseTime(t *testing.T) {
 		assert.Equal(t, TimeToMillis(test.result), ts)
 	}
 }
+
+func TestNewDisableableTicker_Enabled(t *testing.T) {
+	stop, ch := NewDisableableTicker(10 * time.Millisecond)
+	defer stop()
+
+	time.Sleep(100 * time.Millisecond)
+
+	select {
+	case <-ch:
+		break
+	default:
+		t.Error("ticker should have ticked when enabled")
+	}
+}
+
+func TestNewDisableableTicker_Disabled(t *testing.T) {
+	stop, ch := NewDisableableTicker(0)
+	defer stop()
+
+	time.Sleep(100 * time.Millisecond)
+
+	select {
+	case <-ch:
+		t.Error("ticker should not have ticked when disabled")
+	default:
+		break
+	}
+}
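Tying the earlier ruler change together: the wall-time accounting behind `-ruler.query-stats-enabled` is just a wrapped function, a `prometheus.Timer` with a nil observer, and a deferred Add on a per-user counter. A self-contained sketch of the same pattern; `withQueryWallTime`, `evalQuery`, and the `user1` label are illustrative, not part of this patch:

```go
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

type queryFunc func(qs string) (string, error)

// withQueryWallTime mirrors the RecordAndReportRuleQueryMetrics pattern above:
// the deferred callback runs after the wrapped call returns, adding the
// elapsed wall clock seconds to the supplied counter.
func withQueryWallTime(qf queryFunc, queryTime prometheus.Counter) queryFunc {
	if queryTime == nil {
		return qf // stats disabled: no wrapping overhead
	}
	return func(qs string) (string, error) {
		timer := prometheus.NewTimer(nil) // nil observer: we only read the duration
		defer func() {
			queryTime.Add(timer.ObserveDuration().Seconds())
		}()
		return qf(qs)
	}
}

func main() {
	querySeconds := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "cortex_ruler_query_seconds_total", Help: "Wall clock time spent on ruler queries."},
		[]string{"user"},
	)

	evalQuery := func(qs string) (string, error) { // stand-in for the real engine call
		time.Sleep(50 * time.Millisecond)
		return "vector(1)", nil
	}

	qf := withQueryWallTime(evalQuery, querySeconds.WithLabelValues("user1"))
	_, _ = qf("up")

	fmt.Printf("user1 seconds: %.3f\n", testutil.ToFloat64(querySeconds.WithLabelValues("user1")))
}
```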