weekly-tests #46

Workflow file for this run

.github/workflows/weekly_test.yaml at 330c03a

	# Weekly test workflow: started manually (workflow_dispatch) or by the cron
	# schedule below.
	name: weekly-tests
	on:
	  workflow_dispatch:
	  schedule:
	    # minute 56, hour 18, any day-of-month, any month, day-of-week 5
	    - cron: '56 18 * * 5'
	env:
	  # Stored as literal text, not evaluated here. Steps splice it into their
	  # `run` scripts via ${{ env.WORKSPACE_PREFIX }}, where the shell expands the
	  # command substitution (first four '/'-separated fields of the workspace path).
	  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
	  # Slurm partition passed to every `srun -p` in the jobs of this workflow.
	  SLURM_PARTITION: llm_s

	jobs:
	# Runs the pytest cases marked `training_4GPU` in tests/test_training/test_loss.py
	# through srun: 4 tasks (4 per node), 4 CPUs and 1 GPU per task, with --exclusive.
	training_4GPU:
	  runs-on: [t_cluster]
	  timeout-minutes: 15
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_4GPU
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=$jobname -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_8GPU_4DP2TP` in
	# tests/test_training/test_loss.py through srun: 8 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_8GPU_4DP2TP:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_8GPU_4DP2TP
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_8GPU_4DP2TPSP` in
	# tests/test_training/test_loss.py through srun: 8 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_8GPU_4DP2TPSP:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_8GPU_4DP2TPSP
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_8GPU_4DP2TPSP_optimizer_v2` in
	# tests/test_training/test_loss.py through srun: 8 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_8GPU_4DP2TPSP_optimizer_v2:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_8GPU_4DP2TPSP_optimizer_v2
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP_optimizer_v2" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_8GPU_4DP2PP` in
	# tests/test_training/test_loss.py through srun: 8 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_8GPU_4DP2PP:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_8GPU_4DP2PP
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_8GPU_4DP2PP_optimizer_v2` in
	# tests/test_training/test_loss.py through srun: 8 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_8GPU_4DP2PP_optimizer_v2:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_8GPU_4DP2PP_optimizer_v2
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_optimizer_v2" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_8GPU_4DP2PP_InterleavedOverlap` in
	# tests/test_training/test_loss.py through srun: 8 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_8GPU_4DP2PP_InterleavedOverlap:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_8GPU_4DP2PP_InterleavedOverlap
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_16GPU_4DP2TP2PP_MTP` in
	# tests/test_training/test_loss.py through srun: 16 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_16GPU_4DP2TP2PP_MTP:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_16GPU_4DP2TP2PP_MTP
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_16GPU_4DP2TP2PP_MSP` in
	# tests/test_training/test_loss.py through srun: 16 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_16GPU_4DP2TP2PP_MSP:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_16GPU_4DP2TP2PP_MSP
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_16GPU_4DP2TP2PP_MSP_optimizer_v2` in
	# tests/test_training/test_loss.py through srun: 16 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_16GPU_4DP2TP2PP_MSP_optimizer_v2:
	  runs-on: [t_cluster]
	  timeout-minutes: 15
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_16GPU_4DP2TP2PP_MSP_optimizer_v2
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP_optimizer_v2" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_16GPU_4DP2TP2PP_FSP` in
	# tests/test_training/test_loss.py through srun: 16 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_16GPU_4DP2TP2PP_FSP:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_16GPU_4DP2TP2PP_FSP
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs the pytest cases marked `training_8GPU_ISP` in
	# tests/test_training/test_loss.py through srun: 8 tasks (8 per node),
	# 4 CPUs and 1 GPU per task.
	training_8GPU_ISP:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: training_8GPU_ISP
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Two srun invocations in one step, both against tests/test_training/test_loss.py
	# with 8 tasks (8 per node), 4 CPUs and 1 GPU per task: first the
	# `training_8GPU_ISP_SAVE_CKPT` marker, then `training_8GPU_ISP_LOAD_CKPT`
	# under a job name prefixed with `LOAD_`.
	training_8GPU_ISP_CKPT:
	  runs-on: [t_cluster]
	  timeout-minutes: 20
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check scripts (and the LOAD phase) would only run after success.
	    # Confirm the intent.
	    - name: training_8GPU_ISP_CKPT
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	        jobname=LOAD_${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs tests/test_solver/test_optimizer.py with `python -m pytest` through srun
	# as a single task on one node, requesting --gres=gpu:8.
	unit_test_optimizer:
	  runs-on: [t_cluster]
	  timeout-minutes: 35
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: test_optimizer
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Runs tests/test_training/test_load_ckpt_loss.py with `python -m pytest`
	# through srun as a single task on one node, requesting --gres=gpu:8.
	load_ckpt_then_assert_loss:
	  runs-on: [t_cluster]
	  timeout-minutes: 10
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # Scheduled runs check out `develop`; on manual dispatch the expression
	    # evaluates to an empty ref.
	    - uses: actions/checkout@v3
	      with:
	        ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

	    # NOTE(review): no `shell:` is set; if the runner applies GitHub's default
	    # `bash -e`, a failing srun ends the script before exit_code is captured,
	    # so the check script would only run after success. Confirm the intent.
	    - name: test_ckpt_loss
	      run: |
	        source activate ${evo_env_torch21_flash2}
	        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
	        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_load_ckpt_loss.py
	        exit_code=$?
	        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

	# Failure notification. Runs only when the workflow was not cancelled, at
	# least one job listed in `needs` reported `failure`, and the ref name is
	# `develop` or `main`; then POSTs a JSON message linking to this run to the
	# webhook stored in secrets.WEBHOOK_URL.
	notify_to_feishu:
	  if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
	  # `needs.*.result` only covers jobs listed here, so every test job of this
	  # workflow must appear; the three *_optimizer_v2 jobs were previously
	  # missing, so their failures could not trigger the notification.
	  needs:
	    - training_4GPU
	    - training_8GPU_4DP2TP
	    - training_8GPU_4DP2TPSP
	    - training_8GPU_4DP2TPSP_optimizer_v2
	    - training_8GPU_4DP2PP
	    - training_8GPU_4DP2PP_optimizer_v2
	    - training_8GPU_4DP2PP_InterleavedOverlap
	    - training_16GPU_4DP2TP2PP_MTP
	    - training_16GPU_4DP2TP2PP_MSP
	    - training_16GPU_4DP2TP2PP_MSP_optimizer_v2
	    - training_16GPU_4DP2TP2PP_FSP
	    - training_8GPU_ISP
	    - training_8GPU_ISP_CKPT
	    - unit_test_optimizer
	    - load_ckpt_then_assert_loss
	  runs-on: [t_cluster]
	  steps:
	    # Registers the workspace prefix and $path_prefix as masked log values.
	    # $path_prefix is not defined in this workflow; presumably it comes from
	    # the runner environment (TODO confirm).
	    - name: mask env
	      run: |
	        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
	        echo "::add-mask::$path_prefix"
	    # The payload is a single-quoted JSON string; the quotes are closed and
	    # reopened around each ${{ ... }} / ${GITHUB_RUN_ID} so those values are
	    # spliced into the repository title, the run URL and the user mention.
	    - name: notify
	      run: |
	        curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'${{ github.repository }}' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

weekly-tests #46

Workflow file

weekly-tests #46

Jobs

Run details

Workflow file for this run