Skip to content

Commit

Permalink
prombench: Added support for --bench.version flag.
Browse files Browse the repository at this point in the history
See prometheus/proposals#41 for rationale.

Signed-off-by: bwplotka <bwplotka@gmail.com>
  • Loading branch information
bwplotka committed Dec 17, 2024
1 parent c037ee0 commit f979141
Show file tree
Hide file tree
Showing 8 changed files with 217 additions and 120 deletions.
71 changes: 55 additions & 16 deletions prombench/Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
INFRA_CMD ?= ../infra/infra

PROVIDER ?= gke

.PHONY: deploy clean
deploy: node_create resource_apply
# GCP sometimes takes longer than 30 tries when trying to delete nodes
# if k8s resources are not already cleared
clean: resource_delete node_delete
INFRA_CMD ?= ../infra/infra
PROVIDER ?= gke

cluster_create:
${INFRA_CMD} ${PROVIDER} cluster create -a ${AUTH_FILE} \
Expand Down Expand Up @@ -37,50 +30,96 @@ cluster_delete:
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/cluster_${PROVIDER}.yaml

# Directory (relative to PROMBENCH_DIR) holding the benchmark scenario manifests.
# Set by `/prombench <...> --bench.directory`; falls back to manifests/prombench
# when BENCHMARK_DIRECTORY is unset or empty.
BENCHMARK_DIRECTORY := $(if $(BENCHMARK_DIRECTORY),$(BENCHMARK_DIRECTORY),manifests/prombench)
# Branch name or @<commit SHA> of the prombench repository to take manifests from.
# Set by `/prombench <...> --bench.version`; falls back to master when
# BENCHMARK_VERSION is unset or empty.
BENCHMARK_VERSION := $(if $(BENCHMARK_VERSION),$(BENCHMARK_VERSION),master)
# Repository cloned by maybe_pull_custom_version when BENCHMARK_VERSION is not master.
PROMBENCH_GIT_REPOSITORY ?= git@github.com:prometheus/test-infra.git
# Root against which BENCHMARK_DIRECTORY is resolved. maybe_pull_custom_version
# re-points it at the temporary checkout when a custom version is requested.
PROMBENCH_DIR ?= .

# maybe_pull_custom_version allows custom benchmarking as designed in
# https://github.com/prometheus/proposals/pull/41. It allows calling
# /prombench <release> --bench.version=<@commit or branch> which will cause
# prombench GH job on Prometheus repo to call infra CLI with the non-master BENCHMARK_VERSION.
# In such a case we clone the prombench repository into a temporary directory,
# reset it to the given branch or commit and adjust PROMBENCH_DIR. As a result
# the `make deploy` and `make clean` jobs will use the custom manifests
# (or even node pools) from that checkout instead of the ones in the docker image.
#
# The ifeq/else/endif lines are evaluated when the Makefile is parsed, so only
# the recipe lines of the selected branch become part of the recipe.
# The $(eval ...) and $(shell ...) calls are expanded by make itself (not run
# by the recipe shell); they record the temporary directory in
# maybe_pull_custom_version_TMP_DIR so that clean_tmp_dir can remove it later.
.PHONY: maybe_pull_custom_version
maybe_pull_custom_version:
ifeq (${BENCHMARK_VERSION},master)
	@echo ">> Using standard benchmark configuration, from the docker image"
else
	@echo ">> Git pulling custom benchmark configuration from the ${BENCHMARK_VERSION}"
	@# The template needs trailing X's: GNU and BusyBox mktemp reject a template without them.
	@$(eval $@_TMP_DIR=$(shell mktemp -d -t "prombench.XXXXXX"))
	@# Clone into an explicit directory name so the paths below do not depend on the repository URL.
	cd ${$@_TMP_DIR} && git clone ${PROMBENCH_GIT_REPOSITORY} test-infra
ifeq ($(subst @,,${BENCHMARK_VERSION}),${BENCHMARK_VERSION})
	@# No '@' in the value: treat it as a branch name.
	@echo ">> --bench.version is a branch, reseting to origin/${BENCHMARK_VERSION}"
	cd ${$@_TMP_DIR}/test-infra && git reset --hard origin/${BENCHMARK_VERSION}
else
	@# Value contains '@': strip it and treat the rest as a commit SHA.
	@echo ">> --bench.version is a commit SHA, reseting to $(subst @,,${BENCHMARK_VERSION})"
	cd ${$@_TMP_DIR}/test-infra && git reset --hard $(subst @,,${BENCHMARK_VERSION})
endif
	$(eval PROMBENCH_DIR=${$@_TMP_DIR}/test-infra/prombench)
endif
	@echo ">> Using following files in ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}"
	@ls -lR ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}

# clean_tmp_dir removes the temporary checkout created by
# maybe_pull_custom_version. It does nothing when no checkout was made,
# i.e. when maybe_pull_custom_version_TMP_DIR is empty.
.PHONY: clean_tmp_dir
clean_tmp_dir: # Clean after maybe_pull_custom_version
	@# Quote the variable so the emptiness test and the rm operate on exactly one word.
	[ -z "${maybe_pull_custom_version_TMP_DIR}" ] || rm -rf "${maybe_pull_custom_version_TMP_DIR}"

# deploy creates the benchmark nodes and applies the benchmark resources using
# the manifests selected by maybe_pull_custom_version, then removes the
# temporary checkout (if any) via clean_tmp_dir.
# NOTE(review): this relies on the prerequisites running left to right, which
# holds for a serial make run but is not guaranteed under `make -j`.
.PHONY: deploy
deploy: maybe_pull_custom_version node_create resource_apply clean_tmp_dir

# clean deletes the benchmark resources and then the nodes, using the manifests
# selected by maybe_pull_custom_version, and finally removes the temporary
# checkout (if any) via clean_tmp_dir.
# GCP sometimes takes longer than 30 tries when trying to delete nodes
# if k8s resources are not already cleared
.PHONY: clean
clean: maybe_pull_custom_version resource_delete node_delete clean_tmp_dir

node_create:
${INFRA_CMD} ${PROVIDER} nodes create -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \
-v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} \
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/prombench/nodes_${PROVIDER}.yaml
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml

resource_apply:
$(INFRA_CMD) ${PROVIDER} resource apply -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v CLUSTER_NAME:${CLUSTER_NAME} \
-v PR_NUMBER:${PR_NUMBER} -v RELEASE:${RELEASE} -v DOMAIN_NAME:${DOMAIN_NAME} \
-v GITHUB_ORG:${GITHUB_ORG} -v GITHUB_REPO:${GITHUB_REPO} \
-f manifests/prombench/benchmark
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/benchmark

# Required because namespace and cluster-role are not part of the created nodes
resource_delete:
$(INFRA_CMD) ${PROVIDER} resource delete -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/prombench/benchmark/1c_cluster-role-binding.yaml \
-f manifests/prombench/benchmark/1a_namespace.yaml
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/benchmark/1c_cluster-role-binding.yaml \
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/benchmark/1a_namespace.yaml

node_delete:
$(INFRA_CMD) ${PROVIDER} nodes delete -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \
-v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} \
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/prombench/nodes_${PROVIDER}.yaml
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml

all_nodes_running:
$(INFRA_CMD) ${PROVIDER} nodes check-running -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \
-v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} -v SEPARATOR:${SEPARATOR} \
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/prombench/nodes_${PROVIDER}.yaml
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml

all_nodes_deleted:
$(INFRA_CMD) ${PROVIDER} nodes check-deleted -a ${AUTH_FILE} \
-v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \
-v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \
-v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} -v SEPARATOR:${SEPARATOR} \
-v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \
-f manifests/prombench/nodes_${PROVIDER}.yaml
-f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml
31 changes: 23 additions & 8 deletions prombench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,24 @@

This setup leverages **GitHub Actions** and **Google Kubernetes Engine (GKE)**, but is designed to be extendable to other Kubernetes providers.

## Overview of Manifest Files
## Configuration Files

The `/manifest` directory contains Kubernetes manifest files:
The `./manifests` directory contains the following configuration files:

- **`cluster_gke.yaml`**: Creates the Main Node in GKE.
- **`cluster_eks.yaml`**: Creates the Main Node in EKS.
- **`cluster-infra/`**: Contains persistent components of the Main Node.
- **`prombench/`**: Resources created and destroyed for each Prombench test.
- **`./manifests/cluster_gke.yaml`**: Creates the Main Node in GKE.
- **`./manifests/cluster_eks.yaml`**: Creates the Main Node in EKS.
- **`./manifests/cluster-infra/`**: Contains persistent components of the Main Node.
- **`./manifests/prombench/`**: Resources created and destroyed for each Prombench test. See [its `README.md`](./manifests/prombench/README.md) for details.

## Setup and Running Prombench
## Prombench Setup

Prombench can be run on different providers. Follow these instructions based on your provider:

- [Google Kubernetes Engine (GKE)](docs/gke.md)
- [Kubernetes In Docker (KIND)](docs/kind.md)
- [Elastic Kubernetes Service (EKS)](docs/eks.md)

## Setting Up GitHub Actions
### Setting Up GitHub Actions

1. Place a workflow file in the `.github` directory of your repository. Refer to the [Prometheus GitHub repository](https://github.com/prometheus/prometheus) for an example.

Expand All @@ -30,22 +30,34 @@ Prombench can be run on different providers. Follow these instructions based on
```bash
cat $AUTH_FILE | base64 -w 0
```

3. Configure webhook to cluster's comment-monitor as described [here](../tools/comment-monitor/README.md#setting-up-the-github-webhook).

## Prombench Usage

### Triggering Tests via GitHub Comment

**Starting Tests:**

- `/prombench main` or `/prombench master` - Compare PR with the main/master branch.
- `/prombench v2.4.0` - Compare PR with a specific release version (e.g., from [quay.io/prometheus/prometheus:releaseVersion](https://quay.io/prometheus/prometheus:releaseVersion)).
- `/prombench v2.4.0 --bench.version=@aca1803ccf5d795eee4b0848707eab26d05965cc` - Compare with 2.4.0 release, but use a specific `aca1803ccf5d795eee4b0848707eab26d05965cc` commit on this repository for `./manifests/prombench` resources.
- `/prombench v2.4.0 --bench.version=mybranch` - Compare with 2.4.0 release, but use a specific `mybranch` on this repository for `./manifests/prombench` resources.
- `/prombench v2.4.0 --bench.directory=manifests/prombench-agent-mode` - Compare with 2.4.0 release, but use a specific resource directory on `master` branch for this repository. Currently there is only `./manifests/prombench` available (default), we might add more modes in the future.

**Restarting Tests:**

- `/prombench restart <release_version>`
- `/prombench restart <release_version> --bench.version=... --bench.directory=...`

**Stopping Tests:**

- `/prombench cancel`

**Printing available commands:**

- `/prombench help`

### Building the Docker Image

Build the Docker image with:
Expand All @@ -54,3 +66,6 @@ Build the Docker image with:
docker build -t prominfra/prombench:master .
```




45 changes: 45 additions & 0 deletions prombench/manifests/prombench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
## Prombench Benchmark Scenario Configuration

This directory contains resources that are applied (and cleaned) on every benchmark request
via `infra` CLI using [`make deploy`](../../Makefile) and cleaned using [`make clean`](../../Makefile).

It assumes the running cluster is managed via the `infra` CLI using `make cluster_create` and `make cluster_delete`.

### Variables

It expects the following templated variables:

* `.PR_NUMBER`: The PR number from which `/prombench` was triggered. This PR number also tells what commit to use for the `prometheus-test-pr-{{ .PR_NUMBER }}` Prometheus image building (in the init container).
* `.RELEASE`: The argument provided by `/prombench` caller representing the Prometheus version (docker image tag for `quay.io/prometheus/prometheus:{{ .RELEASE }}`) to compare with, deployed as the `prometheus-test-{{ .RELEASE }}`.
* `.DOMAIN_NAME`
* `.LOADGEN_SCALE_UP_REPLICAS`
* `.GITHUB_ORG`
* `.GITHUB_REPO`

### Customizations

> NOTE: See https://github.com/prometheus/proposals/pull/41 for design.
On the `master` branch, in this directory, we maintain the standard, single benchmarking scenario used
as an acceptance validation for Prometheus. It's important to ensure it represents common Prometheus configuration.

The only user related parameter for the standard scenario is `RELEASE` version.

However, it's possible to create fully custom benchmarking scenarios for `/prombench` via the `--bench.version=<branch|@commit>` flag.

Here are example steps:

1. Create a new branch on https://github.com/prometheus/test-infra e.g. `benchmark/scenario1`.
2. Modify this directory to your liking, e.g. by changing the query load, the metric load or advanced Prometheus configuration. It's also possible to make the Prometheus deployments and versions exactly the same, but vary a single configuration flag, for feature benchmarking.

> WARN: When customizing this directory, don't change `1a_namespace.yaml` or `1c_cluster-role-binding.yaml` filenames as they are used for cleanup routine. Or, if you change it, know what you're doing in relation to [`make clean` job](../../Makefile).
3. Push changes to the new branch.
4. From the Prometheus PR comment, call prombench as `/prombench <release> --bench.version=benchmark/scenario1` or `/prombench <release> --bench.version=@<relevant commit SHA from the benchmark/scenario1>` to use configuration files from this custom branch.

Other details:

* Custom branch modifications outside this directory (e.g. to the infra CLI or makefiles) do not affect prombench.
* `--bench.version` is designed for a short-term or even one-off benchmark scenario configurations. It's not designed for long-term, well maintained scenarios. For the latter reason we can later e.g. maintain multiple `manifests/prombench` directories and introduce a new `--bench.directory` flag.
* Non-maintainers can follow a similar process, but they will need to ask a maintainer for a new branch and a PR review. We can consider extending `--bench.version` to support remote repositories if this becomes a problem.
* Custom benchmarking logic is implemented in the [`maybe_pull_custom_version` make job](../../Makefile) and invoked by the prombench GH job on Prometheus repo on `deploy` and `clean`.
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ spec:
name: prometheus-test
- name: instance-ssd
hostPath:
path: /mnt/disks/ssd0 #gke ssds
# /mnt is where GKE keeps its SSD.
path: /mnt/disks/ssd0
- name: prometheus-executable
emptyDir: {}
terminationGracePeriodSeconds: 300
Expand All @@ -113,91 +114,3 @@ spec:
selector:
app: prometheus
prometheus: test-pr-{{ .PR_NUMBER }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus-test-{{ normalise .RELEASE }}
namespace: prombench-{{ .PR_NUMBER }}
labels:
app: prometheus
prometheus: test-{{ normalise .RELEASE }}
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
prometheus: test-{{ normalise .RELEASE }}
template:
metadata:
namespace: prombench-{{ .PR_NUMBER }}
labels:
app: prometheus
prometheus: test-{{ normalise .RELEASE }}
spec:
serviceAccountName: prometheus
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- topologyKey: kubernetes.io/hostname
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- prometheus
securityContext:
runAsUser: 0
containers:
- name: prometheus
image: quay.io/prometheus/prometheus:{{ .RELEASE }}
imagePullPolicy: Always
command: [ "/bin/prometheus" ]
args: [
"--web.external-url=http://{{ .DOMAIN_NAME }}/{{ .PR_NUMBER }}/prometheus-release",
"--storage.tsdb.path=/prometheus",
"--config.file=/etc/prometheus/prometheus.yml",
"--log.level=debug"
]
resources:
requests:
cpu: 2
memory: 20Gi
volumeMounts:
- name: config-volume
mountPath: /etc/prometheus
- name: instance-ssd
mountPath: /prometheus
ports:
- name: prom-web
containerPort: 9090
volumes:
- name: config-volume
configMap:
name: prometheus-test
- name: instance-ssd
hostPath:
# /mnt is where GKE keeps it's SSD
# don't change this if you want Prometheus to take advantage of these local SSDs
path: /mnt/disks/ssd0
terminationGracePeriodSeconds: 300
nodeSelector:
node-name: prometheus-{{ .PR_NUMBER }}
isolation: prometheus
---
apiVersion: v1
kind: Service
metadata:
name: prometheus-test-{{ normalise .RELEASE }}
namespace: prombench-{{ .PR_NUMBER }}
labels:
app: prometheus
prometheus: test-{{ normalise .RELEASE }}
spec:
ports:
- name: prom-web
port: 80
targetPort: prom-web
selector:
app: prometheus
prometheus: test-{{ normalise .RELEASE }}
Loading

0 comments on commit f979141

Please sign in to comment.