Hi, I was checking the greedy-decoding pass@1 scores of the Llama3-8B base model and the Llama3-8B-Instruct model against https://evalplus.github.io/leaderboard.html.
On that leaderboard, Llama3-8B-Instruct gets 59.5 pass@1 with greedy decoding on MBPP, while Llama3-8B gets 51.6.
However, when I use this code base I get 51.6 for Llama3-8B-Instruct and 45.4 for the Llama3-8B base model. I set max_length_generation to 2048 when evaluating MBPP. Strangely, I managed to reproduce the greedy-decoding pass@1 scores for HumanEval. Do you know what could be going wrong with the evaluation?
Here are the full results:
{
"mbpp": {
"pass@1": 0.454
},
"config": {
"prefix": "",
"do_sample": false,
"temperature": 0.2,
"top_k": 0,
"top_p": 0.95,
"n_samples": 1,
"eos": "<|endoftext|>",
"seed": 0,
"model": "meta-llama/Meta-Llama-3-8B",
"modeltype": "causal",
"peft_model": null,
"revision": null,
"use_auth_token": false,
"trust_remote_code": false,
"tasks": "mbpp",
"instruction_tokens": null,
"batch_size": 1,
"max_length_generation": 2048,
"precision": "fp32",
"load_in_8bit": false,
"load_in_4bit": false,
"left_padding": false,
"limit": null,
"limit_start": 0,
"save_every_k_tasks": -1,
"postprocess": true,
"allow_code_execution": true,
"generation_only": false,
"load_generations_path": null,
"load_data_path": null,
"metric_output_path": "evaluation_results.json",
"save_generations": false,
"load_generations_intermediate_paths": null,
"save_generations_path": "generations.json",
"save_references": false,
"save_references_path": "references.json",
"prompt": "prompt",
"max_memory_per_gpu": null,
"check_references": false,
}
}
{
"mbpp": {
"pass@1": 0.516
},
"config": {
"prefix": "",
"do_sample": false,
"temperature": 0.2,
"top_k": 0,
"top_p": 0.95,
"n_samples": 1,
"eos": "<|endoftext|>",
"seed": 0,
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"modeltype": "causal",
"peft_model": null,
"revision": null,
"use_auth_token": false,
"trust_remote_code": false,
"tasks": "mbpp",
"instruction_tokens": null,
"batch_size": 1,
"max_length_generation": 2048,
"precision": "fp32",
"load_in_8bit": false,
"load_in_4bit": false,
"left_padding": false,
"limit": null,
"limit_start": 0,
"save_every_k_tasks": -1,
"postprocess": true,
"allow_code_execution": true,
"generation_only": false,
"load_generations_path": null,
"load_data_path": null,
"metric_output_path": "evaluation_results.json",
"save_generations": false,
"load_generations_intermediate_paths": null,
"save_generations_path": "generations.json",
"save_references": false,
"save_references_path": "references.json",
"prompt": "prompt",
"max_memory_per_gpu": null,
"check_references": false,
}
}

Thank you for your attention.
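One note on the dumps above: with "do_sample": false the generation is greedy, so the temperature, top_k, and top_p values in the config are simply ignored. As a minimal sketch of what that corresponds to with plain transformers (the model name is taken from the config above; the prompt and max_new_tokens are only illustrative):

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3-8B"  # from the config above
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Illustrative MBPP-style prompt; the harness builds its own prompts per task.
prompt = "Write a python function to remove the first and last occurrence of a given character from a string."
inputs = tokenizer(prompt, return_tensors="pt")

# do_sample=False gives deterministic greedy decoding; temperature/top_p are not used.
output = model.generate(**inputs, do_sample=False, max_new_tokens=256)
print(tokenizer.decode(output[0], skip_special_tokens=True))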
My understanding is that the evalplus MBPP has been sanitized, so the prompts / answers are substantially different from the original: https://github.com/evalplus/evalplus/releases/tag/v0.2.0. The MBPPPlus task here might be closer to what's in evalplus.
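To see how different the two versions actually are, one option is to pull both sets from the Hugging Face Hub and compare a task side by side. This is only a sketch; it assumes the evalplus data is published under the evalplus/mbppplus dataset id with a test split (the original MBPP is the mbpp dataset):

from datasets import load_dataset

# Original MBPP ("full" config) versus the evalplus MBPP+ set.
# The "evalplus/mbppplus" id and its "test" split are assumptions.
mbpp = load_dataset("mbpp", "full", split="test")
mbpp_plus = load_dataset("evalplus/mbppplus", split="test")

print(len(mbpp), "original MBPP test tasks")
print(len(mbpp_plus), "MBPP+ tasks")

# Dump the first record of each to compare prompts and tests directly.
print(mbpp[0])
print(mbpp_plus[0])

Rerunning the harness with the MBPPPlus task (i.e. swapping the "tasks" value shown in the config above) should give numbers that are more directly comparable to the evalplus leaderboard.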