From 814e417b3fbcd19ef298d522779f2ac6e682ccfe Mon Sep 17 00:00:00 2001 From: williamlardier Date: Tue, 3 Dec 2024 14:52:31 +0100 Subject: [PATCH] Ensure count items is not already running A race condition where a count items was running when the new one started at the beggining of the quota tests, and the old one completing after the new one, would lead to an override of the infostore collection, in turn leading to missing metrics for the quota buckets/accounts, making the quota tests fail till a new count items starts. Use a file locking mechanism to ensure no concurrent job runs - Watching the resource or using kube api is prone to race condition, as their is a delay between the request to create a job, and its run, plus, the logic is not easy to centralize if we consider job already completed or not yet started, as we delete the old jobs upon completion - Using alock mechanism is similar to what we already have for quotas, plus, it ensures no race condition and tests being able to restart as soon as possible. Issue: ZENKO-4941 --- tests/ctst/steps/utils/kubernetes.ts | 70 +++++++++++++++++++++------- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/tests/ctst/steps/utils/kubernetes.ts b/tests/ctst/steps/utils/kubernetes.ts index 32e9aceb91..147cf099b9 100644 --- a/tests/ctst/steps/utils/kubernetes.ts +++ b/tests/ctst/steps/utils/kubernetes.ts @@ -1,3 +1,6 @@ +import fs from 'fs'; +import * as path from 'path'; +import lockFile from 'proper-lockfile'; import { KubernetesHelper, Utils } from 'cli-testing'; import Zenko from 'world/Zenko'; import { @@ -71,12 +74,40 @@ export function createKubeCustomObjectClient(world: Zenko): CustomObjectsApi { return KubernetesHelper.customObject; } -export async function createJobAndWaitForCompletion(world: Zenko, jobName: string, customMetadata?: string) { +export async function createJobAndWaitForCompletion( + world: Zenko, + jobName: string, + customMetadata?: string +) { const batchClient = createKubeBatchClient(world); const watchClient = createKubeWatchClient(world); + + const lockFilePath = path.join('/tmp', `${jobName}.lock`); + let releaseLock: (() => Promise) | false = false; + + if (!fs.existsSync(lockFilePath)) { + fs.writeFileSync(lockFilePath, ''); + } + try { + releaseLock = await lockFile.lock(lockFilePath, { + // Expect the job does not take more than 2 minutes to complete + stale: 2 * 60 * 1000, + // use a non-exponential backoff strategy + // try once per second for 2min 10s + retries: { + retries: 130, + factor: 1, + minTimeout: 1000, + maxTimeout: 1000, + }, + }); + world.logger.debug(`Acquired lock for job: ${jobName}`); + + // Read the cron job and prepare the job spec const cronJob = await batchClient.readNamespacedCronJob(jobName, 'default'); const cronJobSpec = cronJob.body.spec?.jobTemplate.spec; + const job = new V1Job(); const metadata = new V1ObjectMeta(); job.apiVersion = 'batch/v1'; @@ -87,50 +118,57 @@ export async function createJobAndWaitForCompletion(world: Zenko, jobName: strin 'cronjob.kubernetes.io/instantiate': 'ctst', }; if (customMetadata) { - metadata.annotations = { - custom: customMetadata, - }; + metadata.annotations.custom = customMetadata; } job.metadata = metadata; + // Create the job const response = await batchClient.createNamespacedJob('default', job); - world.logger.debug('job created', { - job: response.body.metadata, - }); + world.logger.debug('Job created', { job: response.body.metadata }); const expectedJobName = response.body.metadata?.name; + // Watch for job completion await new Promise((resolve, reject) => { void watchClient.watch( '/apis/batch/v1/namespaces/default/jobs', {}, (type: string, apiObj, watchObj) => { - if (job.metadata?.name && expectedJobName && - (watchObj.object?.metadata?.name as string)?.startsWith?.(expectedJobName)) { + if ( + expectedJobName && + (watchObj.object?.metadata?.name as string)?.startsWith?.(expectedJobName) + ) { if (watchObj.object?.status?.succeeded) { - world.logger.debug('job succeeded', { - job: job.metadata, - }); + world.logger.debug('Job succeeded', { job: job.metadata }); resolve(); } else if (watchObj.object?.status?.failed) { - world.logger.debug('job failed', { + world.logger.debug('Job failed', { job: job.metadata, object: watchObj.object, }); - reject(new Error('job failed')); + reject(new Error('Job failed')); } } - }, reject); + }, + reject + ); }); } catch (err: unknown) { - world.logger.error('error creating job', { + world.logger.error('Error creating or waiting for job completion', { jobName, err, }); throw err; + } finally { + // Ensure the lock is released + if (releaseLock) { + await releaseLock(); + world.logger.debug(`Released lock for job: ${jobName}`); + } } } + export async function waitForZenkoToStabilize( world: Zenko, needsReconciliation = false, timeout = 15 * 60 * 1000, namespace = 'default') { // ZKOP pulls the overlay configuration from Pensieve every 5 seconds