From 8fca0603f9d197eef5e6126b1b077574788150dd Mon Sep 17 00:00:00 2001
From: arslantu
Date: Sat, 9 Mar 2024 11:04:24 +0800
Subject: [PATCH] =?UTF-8?q?chore=F0=9F=8D=9F:?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 realign/run_log.txt |  53 -----------------
 train/run_log.txt   | 136 --------------------------------------------
 2 files changed, 189 deletions(-)
 delete mode 100644 realign/run_log.txt
 delete mode 100644 train/run_log.txt

diff --git a/realign/run_log.txt b/realign/run_log.txt
deleted file mode 100644
index 2ccfb7c..0000000
--- a/realign/run_log.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-WARNING:torch.distributed.run:
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
-/home/tushilong/anaconda3/envs/realign/bin/python: can't open file '/home/tushilong/code/realign/realign/train.py': [Errno 2] No such file or directory
-/home/tushilong/anaconda3/envs/realign/bin/python: can't open file '/home/tushilong/code/realign/realign/train.py': [Errno 2] No such file or directory
-/home/tushilong/anaconda3/envs/realign/bin/python: can't open file '/home/tushilong/code/realign/realign/train.py': [Errno 2] No such file or directory
-ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 44359) of binary: /home/tushilong/anaconda3/envs/realign/bin/python
-Traceback (most recent call last):
-  File "/home/tushilong/anaconda3/envs/realign/bin/torchrun", line 33, in <module>
-    sys.exit(load_entry_point('torch==2.0.1', 'console_scripts', 'torchrun')())
-             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
-    return f(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/run.py", line 794, in main
-    run(args)
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/run.py", line 785, in run
-    elastic_launch(
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time      : 2024-03-09_02:08:09
-  host      : ubuntu
-  rank      : 1 (local_rank: 1)
-  exitcode  : 2 (pid: 44360)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[2]:
-  time      : 2024-03-09_02:08:09
-  host      : ubuntu
-  rank      : 2 (local_rank: 2)
-  exitcode  : 2 (pid: 44361)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time      : 2024-03-09_02:08:09
-  host      : ubuntu
-  rank      : 0 (local_rank: 0)
-  exitcode  : 2 (pid: 44359)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
diff --git a/train/run_log.txt b/train/run_log.txt
deleted file mode 100644
index a2538c4..0000000
--- a/train/run_log.txt
+++ /dev/null
@@ -1,136 +0,0 @@
-WARNING:torch.distributed.run:
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-*****************************************
-Training model with params:
-base_model: /home/tushilong/hf/models/Llama-2-7b-hf
-output_dir: ../ckpts/stylish
-micro_batch_size: 2
-gradient_accumulation_steps: 1
-train_batch_size: 2
-gradient_checkpointing: True
-num_epochs: 1
-learning_rate: 2e-05
-weight_decay: 0.0001
-warmup_ratio: 0.06
-deepspeed_config: None
-fsdp: shard_grad_op auto_wrap offload
-fsdp_config: ./configs/fsdp/llama2_fsdp_config.json
-smart_embedding: False
-wandb_project:
-wandb_run_name:
-wandb_watch:
-wandb_log_model:
-resume_from_checkpoint: False
-
- Loading checkpoint shards:   0%|          | 0/2 [00:00
-    fire.Fire(train)
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/fire/core.py", line 141, in Fire
-    component_trace = _Fire(component, args, parsed_flag_args, context, name)
-                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/fire/core.py", line 475, in _Fire
-    component, remaining_args = _CallAndUpdateTrace(
-                                ^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
-    component = fn(*varargs, **kwargs)
-                ^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/code/realign/train/train.py", line 160, in train
-    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 1555, in train
-    return inner_training_loop(
-           ^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 1837, in _inner_training_loop
-    tr_loss_step = self.training_step(model, inputs)
-                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 2682, in training_step
-    loss = self.compute_loss(model, inputs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 2707, in compute_loss
-    outputs = model(**inputs)
-              ^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
-    return forward_call(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 659, in forward
-    return model_forward(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 647, in __call__
-    return convert_to_fp32(self.model_forward(*args, **kwargs))
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
-    return func(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 659, in forward
-    return model_forward(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 647, in __call__
-    return convert_to_fp32(self.model_forward(*args, **kwargs))
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
-    return func(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 748, in forward
-    output = self._fsdp_wrapped_module(*args, **kwargs)
-             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
-    return forward_call(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/code/realign/llama/rellama.py", line 129, in forward
-    assert torch.isnan(target_logits).sum() == 0, f"target_logits has nan: {torch.isnan(target_logits).sum()}"
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-AssertionError: target_logits has nan: 10752000
-WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 20205 closing signal SIGTERM
-WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 20206 closing signal SIGTERM
-ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 2 (pid: 20207) of binary: /home/tushilong/anaconda3/envs/realign/bin/python
-Traceback (most recent call last):
-  File "/home/tushilong/anaconda3/envs/realign/bin/torchrun", line 33, in <module>
-    sys.exit(load_entry_point('torch==2.0.1', 'console_scripts', 'torchrun')())
-             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
-    return f(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/run.py", line 794, in main
-    run(args)
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/run.py", line 785, in run
-    elastic_launch(
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-train.py FAILED
------------------------------------------------------------
-Failures:
-  <NO_OTHER_FAILURES>
------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time      : 2024-03-09_10:45:34
-  host      : ubuntu
-  rank      : 2 (local_rank: 2)
-  exitcode  : 1 (pid: 20207)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================