Add an additional step to the pipeline to generate a metrics report
Signed-off-by: Michael Clifford <mcliffor@redhat.com>
MichaelClifford committed Dec 19, 2024
1 parent cd8bce7 commit 8a6be72
Showing 4 changed files with 177 additions and 46 deletions.
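For context, the new step relies on KFP v2's Metrics artifact: a lightweight Python component takes an Output[Metrics] parameter and calls log_metric(name, value) for each scalar it wants surfaced on the run, which is also what produces the new generate-metrics-report-op-metrics output in the compiled pipeline spec below. A minimal sketch of that pattern (the component name, base image, metric names, and values here are illustrative, not the ones added by this commit):

from kfp import dsl
from kfp.dsl import Metrics, Output


@dsl.component(base_image="python:3.11")  # illustrative image, not the pipeline's PYTHON_IMAGE
def log_example_scores_op(metrics: Output[Metrics]):
    # Each log_metric call becomes a key/value pair on the system.Metrics artifact
    # and shows up in the KFP UI for the run.
    metrics.log_metric("example_overall_score", 7.8)
    metrics.log_metric("example_error_rate", 0.02)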
4 changes: 2 additions & 2 deletions eval/final/__init__.py
@@ -1,5 +1,5 @@
from .components import run_final_eval_op
from .components import generate_metrics_report_op, run_final_eval_op

# from . import faked

__all__ = ["run_final_eval_op"]
__all__ = ["run_final_eval_op", "generate_metrics_report_op"]
53 changes: 47 additions & 6 deletions eval/final/components.py
@@ -1,15 +1,13 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error

from kfp.dsl import Artifact, Output, component
from kfp.dsl import Artifact, Input, Metrics, Output, component

from utils.consts import RHELAI_IMAGE
from utils.consts import PYTHON_IMAGE, RHELAI_IMAGE


@component(base_image=RHELAI_IMAGE, install_kfp_package=False)
def run_final_eval_op(
mmlu_branch_output: Output[Artifact],
mt_bench_branch_output: Output[Artifact],
base_model_dir: str,
base_branch: str,
candidate_branch: str,
@@ -20,6 +18,8 @@ def run_final_eval_op(
candidate_model: str = None,
taxonomy_path: str = "/input/taxonomy",
sdg_path: str = "/input/sdg",
mmlu_branch_output_path: str = "/output/mmlu_branch",
mt_bench_branch_output_path: str = "/output/mt_bench_branch",
):
import json
import os
@@ -336,8 +336,13 @@ def find_node_dataset_directories(base_dir: str):
"summary": summary,
}

with open(mmlu_branch_output.path, "w", encoding="utf-8") as f:
if not os.path.exists(mmlu_branch_output_path):
os.makedirs(mmlu_branch_output_path)
with open(
f"{mmlu_branch_output_path}/mmlu_branch_data.json", "w", encoding="utf-8"
) as f:
json.dump(mmlu_branch_data, f, indent=4)

else:
print("No MMLU tasks directories found, skipping MMLU_branch evaluation.")

@@ -478,5 +483,41 @@ def find_node_dataset_directories(base_dir: str):
"summary": summary,
}

with open(mt_bench_branch_output.path, "w", encoding="utf-8") as f:
if not os.path.exists(mt_bench_branch_output_path):
os.makedirs(mt_bench_branch_output_path)
with open(
f"{mt_bench_branch_output_path}/mt_bench_branch_data.json",
"w",
encoding="utf-8",
) as f:
json.dump(mt_bench_branch_data, f, indent=4)


@component(base_image=PYTHON_IMAGE, install_kfp_package=False)
def generate_metrics_report_op(
metrics: Output[Metrics],
):
import ast
import json

with open("/output/mt_bench_data.json", "r") as f:
mt_bench_data = f.read()
mt_bench_data = ast.literal_eval(mt_bench_data)[0]

metrics.log_metric("mt_bench_best_model", mt_bench_data["model"])
metrics.log_metric("mt_bench_best_score", mt_bench_data["overall_score"])
metrics.log_metric("mt_bench_best_model_error_rate", mt_bench_data["error_rate"])

with open("/output/mt_bench_branch/mt_bench_branch_data.json", "r") as f:
mt_bench_branch_data = json.loads(f.read())

metrics.log_metric("mt_bench_branch_score", mt_bench_branch_data["overall_score"])
metrics.log_metric(
"mt_bench_branch_base_score", mt_bench_branch_data["base_overall_score"]
)

with open("/output/mmlu_branch/mmlu_branch_data.json", "r") as f:
mmlu_branch_data = json.loads(f.read())

metrics.log_metric("mmlu_branch_score", mmlu_branch_data["model_score"])
metrics.log_metric("mmlu_branch_base_score", mmlu_branch_data["base_model_score"])
24 changes: 18 additions & 6 deletions pipeline.py
@@ -73,7 +73,7 @@ def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]):
)

# Imports for evaluation
from eval.final import run_final_eval_op
from eval.final import generate_metrics_report_op, run_final_eval_op
from eval.mt_bench import run_mt_bench_op

@dsl.pipeline(
@@ -417,17 +417,29 @@ def pipeline(
mount_path="/output",
)

output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output)
output_pvc_delete_task.after(
output_model_task, output_mt_bench_task, final_eval_task
)

sdg_pvc_delete_task = DeletePVC(pvc_name=sdg_input_pvc_task.output)
sdg_pvc_delete_task.after(final_eval_task)

model_pvc_delete_task = DeletePVC(pvc_name=model_pvc_task.output)
model_pvc_delete_task.after(final_eval_task)

generate_metrics_report_task = generate_metrics_report_op()
generate_metrics_report_task.after(output_mt_bench_task, final_eval_task)
generate_metrics_report_task.set_caching_options(False)
mount_pvc(
task=generate_metrics_report_task,
pvc_name=output_pvc_task.output,
mount_path="/output",
)

output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output)
output_pvc_delete_task.after(
output_model_task,
output_mt_bench_task,
final_eval_task,
generate_metrics_report_task,
)

return

return pipeline
142 changes: 110 additions & 32 deletions pipeline.yaml
@@ -30,6 +30,8 @@
# train_num_warmup_steps_phase_2: int [Default: 1000.0]
# train_save_samples: int [Default: 250000.0]
# train_seed: int [Default: 42.0]
# Outputs:
# generate-metrics-report-op-metrics: system.Metrics
components:
comp-createpvc:
executorLabel: exec-createpvc
@@ -266,6 +268,14 @@ components:
description: Name of the PVC to delete. Supports passing a runtime-generated
name, such as a name provided by ``kubernetes.CreatePvcOp().outputs['name']``.
parameterType: STRING
comp-generate-metrics-report-op:
executorLabel: exec-generate-metrics-report-op
outputDefinitions:
artifacts:
metrics:
artifactType:
schemaTitle: system.Metrics
schemaVersion: 0.0.1
comp-git-clone-op:
executorLabel: exec-git-clone-op
inputDefinitions:
@@ -464,6 +474,14 @@
parameterType: STRING
merge_system_user_message:
parameterType: BOOLEAN
mmlu_branch_output_path:
defaultValue: /output/mmlu_branch
isOptional: true
parameterType: STRING
mt_bench_branch_output_path:
defaultValue: /output/mt_bench_branch
isOptional: true
parameterType: STRING
sdg_path:
defaultValue: /input/sdg
isOptional: true
@@ -472,16 +490,6 @@
defaultValue: /input/taxonomy
isOptional: true
parameterType: STRING
outputDefinitions:
artifacts:
mmlu_branch_output:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
mt_bench_branch_output:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
comp-run-mt-bench-op:
executorLabel: exec-run-mt-bench-op
inputDefinitions:
@@ -658,6 +666,41 @@ deploymentSpec:
exec-deletepvc-3:
container:
image: argostub/deletepvc
exec-generate-metrics-report-op:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- generate_metrics_report_op
command:
- sh
- -ec
- 'program_path=$(mktemp -d)
printf "%s" "$0" > "$program_path/ephemeral_component.py"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_metrics_report_op(\n metrics: Output[Metrics],\n\
):\n import ast\n import json\n\n with open(\"/output/mt_bench_data.json\"\
, \"r\") as f:\n mt_bench_data = f.read()\n mt_bench_data = ast.literal_eval(mt_bench_data)[0]\n\
\n metrics.log_metric(\"mt_bench_best_model\", mt_bench_data[\"model\"\
])\n metrics.log_metric(\"mt_bench_best_score\", mt_bench_data[\"overall_score\"\
])\n metrics.log_metric(\"mt_bench_best_model_error_rate\", mt_bench_data[\"\
error_rate\"])\n\n with open(\"/output/mt_bench_branch/mt_bench_branch_data.json\"\
, \"r\") as f:\n mt_bench_branch_data = json.loads(f.read())\n\n\
\ metrics.log_metric(\"mt_bench_branch_score\", mt_bench_branch_data[\"\
overall_score\"])\n metrics.log_metric(\n \"mt_bench_branch_base_score\"\
, mt_bench_branch_data[\"base_overall_score\"]\n )\n\n with open(\"\
/output/mmlu_branch/mmlu_branch_data.json\", \"r\") as f:\n mmlu_branch_data\
\ = json.loads(f.read())\n\n metrics.log_metric(\"mmlu_branch_score\"\
, mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"mmlu_branch_base_score\"\
, mmlu_branch_data[\"base_model_score\"])\n\n"
image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111
exec-git-clone-op:
container:
args:
@@ -1138,15 +1181,16 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef run_final_eval_op(\n mmlu_branch_output: Output[Artifact],\n\
\ mt_bench_branch_output: Output[Artifact],\n base_model_dir: str,\n\
\ base_branch: str,\n candidate_branch: str,\n max_workers: str,\n\
\ few_shots: int,\n batch_size: str,\n merge_system_user_message:\
\ bool,\n candidate_model: str = None,\n taxonomy_path: str = \"/input/taxonomy\"\
,\n sdg_path: str = \"/input/sdg\",\n):\n import json\n import\
\ os\n import subprocess\n\n import torch\n from instructlab.eval.mmlu\
\ import MMLUBranchEvaluator\n from instructlab.eval.mt_bench import\
\ MTBenchBranchEvaluator\n from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores,\
\ *\n\ndef run_final_eval_op(\n base_model_dir: str,\n base_branch:\
\ str,\n candidate_branch: str,\n max_workers: str,\n few_shots:\
\ int,\n batch_size: str,\n merge_system_user_message: bool,\n \
\ candidate_model: str = None,\n taxonomy_path: str = \"/input/taxonomy\"\
,\n sdg_path: str = \"/input/sdg\",\n mmlu_branch_output_path: str\
\ = \"/output/mmlu_branch\",\n mt_bench_branch_output_path: str = \"\
/output/mt_bench_branch\",\n):\n import json\n import os\n import\
\ subprocess\n\n import torch\n from instructlab.eval.mmlu import\
\ MMLUBranchEvaluator\n from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n\
\ from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores,\
\ sort_score\n\n if judge_ca_cert := os.getenv(\"JUDGE_CA_CERT_PATH\"\
):\n import httpx\n import openai\n\n # Create a custom\
\ HTTP client\n class CustomHttpClient(httpx.Client):\n \
@@ -1299,9 +1343,11 @@ deploymentSpec:
: candidate_model,\n \"model_score\": round(overall_score, 2),\n\
\ \"base_model\": base_model_dir,\n \"base_model_score\"\
: round(base_overall_score, 2),\n \"summary\": summary,\n \
\ }\n\n with open(mmlu_branch_output.path, \"w\", encoding=\"\
utf-8\") as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \
\ else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
\ }\n\n if not os.path.exists(mmlu_branch_output_path):\n \
\ os.makedirs(mmlu_branch_output_path)\n with open(\n \
\ f\"{mmlu_branch_output_path}/mmlu_branch_data.json\", \"w\", encoding=\"\
utf-8\"\n ) as f:\n json.dump(mmlu_branch_data, f, indent=4)\n\
\n else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
\ evaluation.\")\n\n # MT_BENCH_BRANCH\n\n print(\"Starting MT_BENCH_BRANCH\
\ ...\")\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n \
\ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"\
@@ -1366,8 +1412,11 @@ deploymentSpec:
\ \"judge_model\": judge_model_name,\n \"max_score\": \"10.0\"\
,\n \"overall_score\": overall_score,\n \"base_overall_score\"\
: base_overall_score,\n \"error_rate\": error_rate,\n \"summary\"\
: summary,\n }\n\n with open(mt_bench_branch_output.path, \"w\", encoding=\"\
utf-8\") as f:\n json.dump(mt_bench_branch_data, f, indent=4)\n\n"
: summary,\n }\n\n if not os.path.exists(mt_bench_branch_output_path):\n\
\ os.makedirs(mt_bench_branch_output_path)\n with open(\n \
\ f\"{mt_bench_branch_output_path}/mt_bench_branch_data.json\",\n \
\ \"w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\
\ f, indent=4)\n\n"
env:
- name: HOME
value: /tmp
@@ -1650,6 +1699,12 @@ pipelineInfo:
name: instructlab
root:
dag:
outputs:
artifacts:
generate-metrics-report-op-metrics:
artifactSelectors:
- outputArtifactKey: metrics
producerSubtask: generate-metrics-report-op
tasks:
createpvc:
cachingOptions:
@@ -1735,16 +1790,14 @@ root:
componentRef:
name: comp-deletepvc
dependentTasks:
- createpvc-3
- pvc-to-model-op
- pvc-to-mt-bench-op
- createpvc
- run-final-eval-op
inputs:
parameters:
pvc_name:
taskOutputParameter:
outputParameterKey: name
producerTask: createpvc-3
producerTask: createpvc
taskInfo:
name: deletepvc
deletepvc-2:
@@ -1753,14 +1806,14 @@ root:
componentRef:
name: comp-deletepvc-2
dependentTasks:
- createpvc
- createpvc-2
- run-final-eval-op
inputs:
parameters:
pvc_name:
taskOutputParameter:
outputParameterKey: name
producerTask: createpvc
producerTask: createpvc-2
taskInfo:
name: deletepvc-2
deletepvc-3:
@@ -1769,16 +1822,29 @@ root:
componentRef:
name: comp-deletepvc-3
dependentTasks:
- createpvc-2
- createpvc-3
- generate-metrics-report-op
- pvc-to-model-op
- pvc-to-mt-bench-op
- run-final-eval-op
inputs:
parameters:
pvc_name:
taskOutputParameter:
outputParameterKey: name
producerTask: createpvc-2
producerTask: createpvc-3
taskInfo:
name: deletepvc-3
generate-metrics-report-op:
cachingOptions: {}
componentRef:
name: comp-generate-metrics-report-op
dependentTasks:
- createpvc-3
- pvc-to-mt-bench-op
- run-final-eval-op
taskInfo:
name: generate-metrics-report-op
git-clone-op:
cachingOptions: {}
componentRef:
@@ -2232,6 +2298,12 @@ root:
description: Training parameter. Random seed for initializing training.
isOptional: true
parameterType: NUMBER_INTEGER
outputDefinitions:
artifacts:
generate-metrics-report-op-metrics:
artifactType:
schemaTitle: system.Metrics
schemaVersion: 0.0.1
schemaVersion: 2.1.0
sdkVersion: kfp-2.9.0
---
@@ -2249,6 +2321,12 @@ platforms:
taskOutputParameter:
outputParameterKey: name
producerTask: createpvc
exec-generate-metrics-report-op:
pvcMount:
- mountPath: /output
taskOutputParameter:
outputParameterKey: name
producerTask: createpvc-3
exec-git-clone-op:
pvcMount:
- mountPath: /data
