diff --git a/prombench/Makefile b/prombench/Makefile index 7462d80a6..e969473c0 100644 --- a/prombench/Makefile +++ b/prombench/Makefile @@ -1,12 +1,5 @@ -INFRA_CMD ?= ../infra/infra - -PROVIDER ?= gke - -.PHONY: deploy clean -deploy: node_create resource_apply -# GCP sometimes takes longer than 30 tries when trying to delete nodes -# if k8s resources are not already cleared -clean: resource_delete node_delete +INFRA_CMD ?= ../infra/infra +PROVIDER ?= gke cluster_create: ${INFRA_CMD} ${PROVIDER} cluster create -a ${AUTH_FILE} \ @@ -37,13 +30,59 @@ cluster_delete: -v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \ -f manifests/cluster_${PROVIDER}.yaml +# /prombench <...> --bench.directory +BENCHMARK_DIRECTORY := $(if $(BENCHMARK_DIRECTORY),$(BENCHMARK_DIRECTORY),manifests/prombench) +# /prombench <...> --bench.version +BENCHMARK_VERSION := $(if $(BENCHMARK_VERSION),$(BENCHMARK_VERSION),master) +PROMBENCH_GIT_REPOSITORY ?= git@github.com:prometheus/test-infra.git +PROMBENCH_DIR ?= . + +# maybe_pull_custom_version allows custom benchmarking as designed in +# https://github.com/prometheus/proposals/pull/41. It allows calling +# /prombench --bench.version=<@commit or branch>, which causes the +# prombench GH job on the Prometheus repo to call the infra CLI with a non-master BENCHMARK_VERSION. +# In such a case we clone the prombench repository at the given branch or commit +# and adjust PROMBENCH_DIR. As a result the `make deploy` and `make clean` jobs +# will use the custom manifests (or even node pools) from that checkout.
+.PHONY: maybe_pull_custom_version +maybe_pull_custom_version: +ifeq (${BENCHMARK_VERSION},master) + @echo ">> Using standard benchmark configuration, from the docker image" +else + @echo ">> Git pulling custom benchmark configuration from the ${BENCHMARK_VERSION}" + @$(eval $@_TMP_DIR=$(shell mktemp -d -t "prombench.XXXXXX")) + cd ${$@_TMP_DIR} && git clone ${PROMBENCH_GIT_REPOSITORY} +ifeq ($(subst @,,${BENCHMARK_VERSION}),${BENCHMARK_VERSION}) + @echo ">> --bench.version is a branch, reseting to origin/${BENCHMARK_VERSION}" + cd ${$@_TMP_DIR}/test-infra && git reset --hard origin/${BENCHMARK_VERSION} +else + @echo ">> --bench.version is a commit SHA, reseting to $(subst @,,${BENCHMARK_VERSION})" + cd ${$@_TMP_DIR}/test-infra && git reset --hard $(subst @,,${BENCHMARK_VERSION}) +endif + $(eval PROMBENCH_DIR=${$@_TMP_DIR}/test-infra/prombench) +endif + @echo ">> Using following files in ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}" + @ls -lR ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY} + +.PHONY: clean_tmp_dir +clean_tmp_dir: # Clean after maybe_pull_custom_version + [ -z "${maybe_pull_custom_version_TMP_DIR}" ] || rm -rf "${maybe_pull_custom_version_TMP_DIR}" + +.PHONY: deploy +deploy: maybe_pull_custom_version node_create resource_apply clean_tmp_dir + +.PHONY: clean +# GCP sometimes takes longer than 30 tries when trying to delete nodes +# if k8s resources are not already cleared +clean: maybe_pull_custom_version resource_delete node_delete clean_tmp_dir + node_create: ${INFRA_CMD} ${PROVIDER} nodes create -a ${AUTH_FILE} \ -v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \ -v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \ -v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} \ -v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \ - -f manifests/prombench/nodes_${PROVIDER}.yaml + -f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml resource_apply: $(INFRA_CMD) ${PROVIDER} resource apply -a ${AUTH_FILE} \ @@ -51,15 +90,15 @@
resource_apply: -v CLUSTER_NAME:${CLUSTER_NAME} \ -v PR_NUMBER:${PR_NUMBER} -v RELEASE:${RELEASE} -v DOMAIN_NAME:${DOMAIN_NAME} \ -v GITHUB_ORG:${GITHUB_ORG} -v GITHUB_REPO:${GITHUB_REPO} \ - -f manifests/prombench/benchmark + -f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/benchmark # Required because namespace and cluster-role are not part of the created nodes resource_delete: $(INFRA_CMD) ${PROVIDER} resource delete -a ${AUTH_FILE} \ -v ZONE:${ZONE} -v GKE_PROJECT_ID:${GKE_PROJECT_ID} \ -v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \ - -f manifests/prombench/benchmark/1c_cluster-role-binding.yaml \ - -f manifests/prombench/benchmark/1a_namespace.yaml + -f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/benchmark/1c_cluster-role-binding.yaml \ + -f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/benchmark/1a_namespace.yaml node_delete: $(INFRA_CMD) ${PROVIDER} nodes delete -a ${AUTH_FILE} \ @@ -67,7 +106,7 @@ node_delete: -v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \ -v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} \ -v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \ - -f manifests/prombench/nodes_${PROVIDER}.yaml + -f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml all_nodes_running: $(INFRA_CMD) ${PROVIDER} nodes check-running -a ${AUTH_FILE} \ @@ -75,7 +114,7 @@ all_nodes_running: -v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \ -v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} -v SEPARATOR:${SEPARATOR} \ -v CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \ - -f manifests/prombench/nodes_${PROVIDER}.yaml + -f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml all_nodes_deleted: $(INFRA_CMD) ${PROVIDER} nodes check-deleted -a ${AUTH_FILE} \ @@ -83,4 +122,4 @@ all_nodes_deleted: -v EKS_WORKER_ROLE_ARN:${EKS_WORKER_ROLE_ARN} -v EKS_CLUSTER_ROLE_ARN:${EKS_CLUSTER_ROLE_ARN} \ -v EKS_SUBNET_IDS:${EKS_SUBNET_IDS} -v SEPARATOR:${SEPARATOR} \ -v 
CLUSTER_NAME:${CLUSTER_NAME} -v PR_NUMBER:${PR_NUMBER} \ - -f manifests/prombench/nodes_${PROVIDER}.yaml + -f ${PROMBENCH_DIR}/${BENCHMARK_DIRECTORY}/nodes_${PROVIDER}.yaml diff --git a/prombench/README.md b/prombench/README.md index 19caf88e9..fbf891055 100644 --- a/prombench/README.md +++ b/prombench/README.md @@ -4,16 +4,16 @@ This setup leverages **GitHub Actions** and **Google Kubernetes Engine (GKE)**, but is designed to be extendable to other Kubernetes providers. -## Overview of Manifest Files +## Configuration Files -The `/manifest` directory contains Kubernetes manifest files: +The `./manifests` directory contains configuration files. Notable entries: -- **`cluster_gke.yaml`**: Creates the Main Node in GKE. -- **`cluster_eks.yaml`**: Creates the Main Node in EKS. -- **`cluster-infra/`**: Contains persistent components of the Main Node. -- **`prombench/`**: Resources created and destroyed for each Prombench test. +- **`./manifests/cluster_gke.yaml`**: Creates the Main Node in GKE. +- **`./manifests/cluster_eks.yaml`**: Creates the Main Node in EKS. +- **`./manifests/cluster-infra/`**: Contains persistent components of the Main Node. +- **`./manifests/prombench/`**: Resources created and destroyed for each Prombench test. See [its `README.md`](./manifests/prombench/README.md) for details. -## Setup and Running Prombench +## Prombench Setup Prombench can be run on different providers. Follow these instructions based on your provider: @@ -21,7 +21,7 @@ Prombench can be run on different providers. Follow these instructions based on - [Kubernetes In Docker (KIND)](docs/kind.md) - [Elastic Kubernetes Service (EKS)](docs/eks.md) -## Setting Up GitHub Actions +### Setting Up GitHub Actions 1. Place a workflow file in the `.github` directory of your repository. Refer to the [Prometheus GitHub repository](https://github.com/prometheus/prometheus) for an example. @@ -30,6 +30,10 @@ Prombench can be run on different providers.
Follow these instructions based on ```bash cat $AUTH_FILE | base64 -w 0 ``` + +3. Configure a webhook to the cluster's comment-monitor as described [here](../tools/comment-monitor/README.md#setting-up-the-github-webhook). + +## Prombench Usage ### Triggering Tests via GitHub Comment @@ -37,15 +41,23 @@ Prombench can be run on different providers. Follow these instructions based on - `/prombench main` or `/prombench master` - Compare PR with the main/master branch. - `/prombench v2.4.0` - Compare PR with a specific release version (e.g., from [quay.io/prometheus/prometheus:releaseVersion](https://quay.io/prometheus/prometheus:releaseVersion)). +- `/prombench v2.4.0 --bench.version=@aca1803ccf5d795eee4b0848707eab26d05965cc` - Compare with the 2.4.0 release, but use the specific `aca1803ccf5d795eee4b0848707eab26d05965cc` commit of this repository for `./manifests/prombench` resources. +- `/prombench v2.4.0 --bench.version=mybranch` - Compare with the 2.4.0 release, but use the `mybranch` branch of this repository for `./manifests/prombench` resources. +- `/prombench v2.4.0 --bench.directory=manifests/prombench-agent-mode` - Compare with the 2.4.0 release, but use a specific resource directory on the `master` branch of this repository. Currently only `./manifests/prombench` is available (the default); we might add more modes in the future. **Restarting Tests:** - `/prombench restart ` +- `/prombench restart --bench.version=... --bench.directory=...` **Stopping Tests:** - `/prombench cancel` +**Printing available commands:** + +- `/prombench help` + ### Building the Docker Image Build the Docker image with: @@ -54,3 +66,6 @@ Build the Docker image with: docker build -t prominfra/prombench:master .
``` + + + diff --git a/prombench/manifests/prombench/README.md b/prombench/manifests/prombench/README.md new file mode 100644 index 000000000..73857b237 --- /dev/null +++ b/prombench/manifests/prombench/README.md @@ -0,0 +1,45 @@ +## Prombench Benchmark Scenario Configuration + +This directory contains resources that are applied on every benchmark request +via the `infra` CLI using [`make deploy`](../../Makefile) and cleaned up using [`make clean`](../../Makefile). + +It assumes a running cluster that was created via the `infra` CLI using `make cluster_create` (and is removed using `make cluster_delete`). + +### Variables + +It expects the following templated variables: + +* `.PR_NUMBER`: The PR number from which `/prombench` was triggered. This PR number also tells what commit to use for the `prometheus-test-pr-{{ .PR_NUMBER }}` Prometheus image building (in the init container). +* `.RELEASE`: The argument provided by the `/prombench` caller representing the Prometheus version (docker image tag for `quay.io/prometheus/prometheus:{{ .RELEASE }}`) to compare with, deployed as `prometheus-test-{{ .RELEASE }}`. +* `.DOMAIN_NAME` +* `.LOADGEN_SCALE_UP_REPLICAS` +* `.GITHUB_ORG` +* `.GITHUB_REPO` + +### Customizations + +> NOTE: See https://github.com/prometheus/proposals/pull/41 for design. + +On the `master` branch, in this directory, we maintain the standard, single benchmarking scenario used +as an acceptance validation for Prometheus. It's important to ensure it represents a common Prometheus configuration. + +The only user-facing parameter for the standard scenario is the `RELEASE` version. + +However, it's possible to create fully custom benchmarking scenarios for `/prombench` via the `--bench.version=` flag. + +Here are example steps: + +1. Create a new branch on https://github.com/prometheus/test-infra e.g. `benchmark/scenario1`. +2. Modify this directory to your liking e.g. changing query load, metric load or advanced Prometheus configuration.
It's also possible to make Prometheus deployments and versions exactly the same, but vary in a single configuration flag, for feature benchmarking. + + > WARN: When customizing this directory, don't change the `1a_namespace.yaml` or `1c_cluster-role-binding.yaml` filenames as they are used by the cleanup routine. If you do change them, make sure you understand the impact on the [`make clean` job](../../Makefile). + +3. Push changes to the new branch. +4. From the Prometheus PR comment, call prombench as `/prombench --bench.version=benchmark/scenario1` or `/prombench --bench.version=@<commit SHA>` to use configuration files from this custom branch. + +Other details: + +* Custom branch modifications outside this directory (e.g. to the infra CLI or makefiles) do not affect prombench. +* `--bench.version` is designed for short-term or even one-off benchmark scenario configurations. It's not designed for long-term, well-maintained scenarios. For the latter we can later, e.g., maintain multiple `manifests/prombench` directories and introduce a new `--bench.directory` flag. +* Non-maintainers can follow a similar process, but they will need to ask a maintainer for a new branch and PR review. We can consider extending `--bench.version` to support remote repositories if this becomes a problem. +* Custom benchmarking logic is implemented in the [`maybe_pull_custom_version` make job](../../Makefile) and invoked by the prombench GH job on the Prometheus repo on `deploy` and `clean`.
diff --git a/prombench/manifests/prombench/benchmark/3b_prometheus-test_deployment.yaml b/prombench/manifests/prombench/benchmark/3b_prometheus-test-pr_deployment.yaml similarity index 57% rename from prombench/manifests/prombench/benchmark/3b_prometheus-test_deployment.yaml rename to prombench/manifests/prombench/benchmark/3b_prometheus-test-pr_deployment.yaml index 2af21aaa4..871773807 100644 --- a/prombench/manifests/prombench/benchmark/3b_prometheus-test_deployment.yaml +++ b/prombench/manifests/prombench/benchmark/3b_prometheus-test-pr_deployment.yaml @@ -89,7 +89,8 @@ spec: name: prometheus-test - name: instance-ssd hostPath: - path: /mnt/disks/ssd0 #gke ssds + # /mnt is where GKE keeps it's SSD. + path: /mnt/disks/ssd0 - name: prometheus-executable emptyDir: {} terminationGracePeriodSeconds: 300 @@ -113,91 +114,3 @@ spec: selector: app: prometheus prometheus: test-pr-{{ .PR_NUMBER }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prometheus-test-{{ normalise .RELEASE }} - namespace: prombench-{{ .PR_NUMBER }} - labels: - app: prometheus - prometheus: test-{{ normalise .RELEASE }} -spec: - replicas: 1 - selector: - matchLabels: - app: prometheus - prometheus: test-{{ normalise .RELEASE }} - template: - metadata: - namespace: prombench-{{ .PR_NUMBER }} - labels: - app: prometheus - prometheus: test-{{ normalise .RELEASE }} - spec: - serviceAccountName: prometheus - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - topologyKey: kubernetes.io/hostname - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - prometheus - securityContext: - runAsUser: 0 - containers: - - name: prometheus - image: quay.io/prometheus/prometheus:{{ .RELEASE }} - imagePullPolicy: Always - command: [ "/bin/prometheus" ] - args: [ - "--web.external-url=http://{{ .DOMAIN_NAME }}/{{ .PR_NUMBER }}/prometheus-release", - "--storage.tsdb.path=/prometheus", - "--config.file=/etc/prometheus/prometheus.yml", - 
"--log.level=debug" - ] - resources: - requests: - cpu: 2 - memory: 20Gi - volumeMounts: - - name: config-volume - mountPath: /etc/prometheus - - name: instance-ssd - mountPath: /prometheus - ports: - - name: prom-web - containerPort: 9090 - volumes: - - name: config-volume - configMap: - name: prometheus-test - - name: instance-ssd - hostPath: - # /mnt is where GKE keeps it's SSD - # don't change this if you want Prometheus to take advantage of these local SSDs - path: /mnt/disks/ssd0 - terminationGracePeriodSeconds: 300 - nodeSelector: - node-name: prometheus-{{ .PR_NUMBER }} - isolation: prometheus ---- -apiVersion: v1 -kind: Service -metadata: - name: prometheus-test-{{ normalise .RELEASE }} - namespace: prombench-{{ .PR_NUMBER }} - labels: - app: prometheus - prometheus: test-{{ normalise .RELEASE }} -spec: - ports: - - name: prom-web - port: 80 - targetPort: prom-web - selector: - app: prometheus - prometheus: test-{{ normalise .RELEASE }} diff --git a/prombench/manifests/prombench/benchmark/3b_prometheus-test-release_deployment.yaml b/prombench/manifests/prombench/benchmark/3b_prometheus-test-release_deployment.yaml new file mode 100644 index 000000000..f643ad57d --- /dev/null +++ b/prombench/manifests/prombench/benchmark/3b_prometheus-test-release_deployment.yaml @@ -0,0 +1,86 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-test-{{ normalise .RELEASE }} + namespace: prombench-{{ .PR_NUMBER }} + labels: + app: prometheus + prometheus: test-{{ normalise .RELEASE }} +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + prometheus: test-{{ normalise .RELEASE }} + template: + metadata: + namespace: prombench-{{ .PR_NUMBER }} + labels: + app: prometheus + prometheus: test-{{ normalise .RELEASE }} + spec: + serviceAccountName: prometheus + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - topologyKey: kubernetes.io/hostname + labelSelector: + matchExpressions: + - key: app + operator: In + 
values: + - prometheus + securityContext: + runAsUser: 0 + containers: + - name: prometheus + image: quay.io/prometheus/prometheus:{{ .RELEASE }} + imagePullPolicy: Always + command: [ "/bin/prometheus" ] + args: [ + "--web.external-url=http://{{ .DOMAIN_NAME }}/{{ .PR_NUMBER }}/prometheus-release", + "--storage.tsdb.path=/prometheus", + "--config.file=/etc/prometheus/prometheus.yml", + "--log.level=debug" + ] + resources: + requests: + cpu: 2 + memory: 20Gi + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: instance-ssd + mountPath: /prometheus + ports: + - name: prom-web + containerPort: 9090 + volumes: + - name: config-volume + configMap: + name: prometheus-test + - name: instance-ssd + hostPath: + # /mnt is where GKE keeps it's SSD. + path: /mnt/disks/ssd0 + terminationGracePeriodSeconds: 300 + nodeSelector: + node-name: prometheus-{{ .PR_NUMBER }} + isolation: prometheus +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus-test-{{ normalise .RELEASE }} + namespace: prombench-{{ .PR_NUMBER }} + labels: + app: prometheus + prometheus: test-{{ normalise .RELEASE }} +spec: + ports: + - name: prom-web + port: 80 + targetPort: prom-web + selector: + app: prometheus + prometheus: test-{{ normalise .RELEASE }} diff --git a/prombench/manifests/prombench/nodes_gke.yaml b/prombench/manifests/prombench/nodes_gke.yaml index 6e37a5266..dc2dbeb8d 100644 --- a/prombench/manifests/prombench/nodes_gke.yaml +++ b/prombench/manifests/prombench/nodes_gke.yaml @@ -3,18 +3,18 @@ projectid: {{ .GKE_PROJECT_ID }} cluster: name: {{ .CLUSTER_NAME }} nodepools: - # These node-pools will be deployed on triggering benchmark - - name: prometheus-{{ .PR_NUMBER }} + # These node-pools will be deployed on triggered benchmark. + - name: prometheus-{{ .PR_NUMBER }} # Each for single Prometheus. 
initialnodecount: 2 config: machinetype: n1-highmem-8 imagetype: COS_CONTAINERD disksizegb: 100 - localssdcount: 1 #SSD is used to give fast-lookup to Prometheus servers being benchmarked + localssdcount: 1 #SSD is used to give fast-lookup to Prometheus servers being benchmarked. labels: isolation: prometheus node-name: prometheus-{{ .PR_NUMBER }} - - name: nodes-{{ .PR_NUMBER }} + - name: nodes-{{ .PR_NUMBER }} # For fake-webservers, loadgen and sink. initialnodecount: 1 config: machinetype: n1-highcpu-16 @@ -23,4 +23,4 @@ cluster: localssdcount: 0 #use standard HDD. SSD not needed for fake-webservers. labels: isolation: none - node-name: nodes-{{ .PR_NUMBER }} \ No newline at end of file + node-name: nodes-{{ .PR_NUMBER }} diff --git a/tools/comment-monitor/internal/command.go b/tools/comment-monitor/internal/command.go index eb2511e93..42f4d41b0 100644 --- a/tools/comment-monitor/internal/command.go +++ b/tools/comment-monitor/internal/command.go @@ -148,7 +148,7 @@ func ParseCommand(cfg *Config, comment string) (_ *Command, ok bool, err *Comman cmdLine := comment[:i] rest := cmdLine[len(prefix.Prefix):] - // Is it help? + // Is it a help command? if hasExactPrefix(rest, " help") { return &Command{ Args: map[string]string{}, diff --git a/tools/comment-monitor/internal/command_test.go b/tools/comment-monitor/internal/command_test.go index 59af29104..c774fd12c 100644 --- a/tools/comment-monitor/internal/command_test.go +++ b/tools/comment-monitor/internal/command_test.go @@ -106,7 +106,6 @@ func testParseCommand(t *testing.T, c *Config, cases []parseCommandCase) { } func TestParseCommand(t *testing.T) { - c, err := ParseConfig("./testconfig.yaml") if err != nil { t.Fatal(err)