From f02aeba229846aedcb3cf41691caa3fd8f9b9862 Mon Sep 17 00:00:00 2001 From: Daran He Date: Thu, 27 Apr 2023 17:27:39 -0700 Subject: [PATCH] fix: Fix GPT-NeoX example copy collision during container startup (inspired by Liam). As Liam's investigation found, the GPT-NeoX demo currently hits a copy collision during container startup. This change mitigates it by reading the dataset, vocab, and merge files directly from the Docker image instead of copying them to the shared filesystem. We might still need to convert to a shared_fs approach in the future to take advantage of larger-scale data. [ghstack-poisoned] --- .../gpt_neox/gpt_neox_config/determined_cluster.yml | 11 ++++++++--- examples/deepspeed/gpt_neox/startup-hook.sh | 7 ++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/deepspeed/gpt_neox/gpt_neox_config/determined_cluster.yml b/examples/deepspeed/gpt_neox/gpt_neox_config/determined_cluster.yml index 1b2caf1ce25..0c91b9a3acf 100644 --- a/examples/deepspeed/gpt_neox/gpt_neox_config/determined_cluster.yml +++ b/examples/deepspeed/gpt_neox/gpt_neox_config/determined_cluster.yml @@ -1,6 +1,5 @@ # Data paths and options when using EleutherAI cluster { - "data-path": "/run/determined/workdir/shared_fs/data/enron/enron_text_document", # or for weighted datasets: # "train-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"], # "test-data-paths": ["/mnt/ssd-1/data/enron/enron_text_document", "/mnt/ssd-cluster/data/enron/enron_text_document"], @@ -9,8 +8,14 @@ # "test-data-weights": [2., 1.], # "valid-data-weights": [0.5, 0.4], - "vocab-file": "/run/determined/workdir/shared_fs/data/gpt2-vocab.json", - "merge-file": "/run/determined/workdir/shared_fs/data/gpt2-merges.txt", + # Use docker image data for now. + "data-path": "/gpt-neox/data/enron/enron_text_document", + "vocab-file": "/gpt-neox/data/gpt2-vocab.json", + "merge-file": "/gpt-neox/data/gpt2-merges.txt", + # TODO(daran.he|liam) Use shared fs data.
+ # "data-path": "/run/determined/workdir/shared_fs/data/enron/enron_text_document", + # "vocab-file": "/run/determined/workdir/shared_fs/data/gpt2-vocab.json", + # "merge-file": "/run/determined/workdir/shared_fs/data/gpt2-merges.txt", "save": "/tmp/checkpoints", "load": "/tmp/checkpoints", "log-dir": null, diff --git a/examples/deepspeed/gpt_neox/startup-hook.sh b/examples/deepspeed/gpt_neox/startup-hook.sh index 128f0e8504f..f7b4a60d99a 100644 --- a/examples/deepspeed/gpt_neox/startup-hook.sh +++ b/examples/deepspeed/gpt_neox/startup-hook.sh @@ -1,10 +1,7 @@ export PYTHONPATH=$PYTHONPATH:/gpt-neox -# Copy dataset from docker image to shared filesystem +# Use data in docker image and copy over needed configs. USER=$(whoami) mkdir /tmp/${USER} -mkdir -p /run/determined/workdir/shared_fs/data -cp -r -n /gpt-neox/data /run/determined/workdir/shared_fs/ - cd /run/determined/workdir -cp gpt_neox_config/determined_cluster.yml /gpt-neox/configs +cp gpt_neox_config/* /gpt-neox/configs