Compare commits
No commits in common. "8fca0603f9d197eef5e6126b1b077574788150dd" and "bc182d09e050d4c1e6729163ee45a75b6a37156a" have entirely different histories.
8fca0603f9 ... bc182d09e0
3  .gitignore  (vendored)
@@ -2,5 +2,4 @@ __pycache__/
 ckpts/
 data/
 outputs/
 .vscode/
-*/run_log.txt
@@ -133,10 +133,8 @@ class Method_1(ReLlamaForCausalLM):
         for i in range(predict_logits.size(0)):
             # iterate over the batch

-            # token [1] is the start of response (bos token)
             start_idx = torch.where(labels[i] == 1)[0].item()

-            # if [-100] in response, we should calculate kl_div loss for that position
             maintain_position: List[int] = []
             for idx in range(start_idx, labels[i].size(0)):
                 if labels[i][idx] == -100:
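The two deleted lines are comments only; the surviving context is the interesting part, since it shows how positions for the KL-divergence term are selected (the response start is found via the token labelled 1, then every position labelled -100 after it is collected). The sketch below restates that selection as standalone functions. Only the lines visible in the hunk come from the repository; the loop body, the helper name kl_at_masked_positions, and its use of target_logits are assumptions added for illustration.

from typing import List

import torch
import torch.nn.functional as F


def masked_response_positions(labels_i: torch.Tensor) -> List[int]:
    # token id 1 marks the start of the response (bos token), per the removed comment
    start_idx = torch.where(labels_i == 1)[0].item()
    # positions labelled -100 are excluded from cross-entropy; per the removed
    # comment they should instead receive a kl_div loss
    maintain_position: List[int] = []
    for idx in range(start_idx, labels_i.size(0)):
        if labels_i[idx] == -100:
            maintain_position.append(idx)  # assumed continuation of the truncated hunk
    return maintain_position


def kl_at_masked_positions(predict_logits_i: torch.Tensor,
                           target_logits_i: torch.Tensor,
                           positions: List[int]) -> torch.Tensor:
    # illustrative KL term over the collected positions (hypothetical helper,
    # not the repository's actual loss code)
    if not positions:
        return predict_logits_i.new_zeros(())
    idx = torch.tensor(positions, device=predict_logits_i.device)
    log_p = F.log_softmax(predict_logits_i[idx], dim=-1)
    q = F.softmax(target_logits_i[idx], dim=-1)
    return F.kl_div(log_p, q, reduction="batchmean")

Note that torch.where(labels_i == 1)[0].item() assumes exactly one token per sample carries label 1; with zero or several such tokens the .item() call raises, so the original code evidently relies on a single response-start marker per sequence.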
53  realign/run_log.txt  (new file)
@@ -0,0 +1,53 @@
WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
/home/tushilong/anaconda3/envs/realign/bin/python: can't open file '/home/tushilong/code/realign/realign/train.py': [Errno 2] No such file or directory
/home/tushilong/anaconda3/envs/realign/bin/python: can't open file '/home/tushilong/code/realign/realign/train.py': [Errno 2] No such file or directory
/home/tushilong/anaconda3/envs/realign/bin/python: can't open file '/home/tushilong/code/realign/realign/train.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 44359) of binary: /home/tushilong/anaconda3/envs/realign/bin/python
Traceback (most recent call last):
  File "/home/tushilong/anaconda3/envs/realign/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==2.0.1', 'console_scripts', 'torchrun')())
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2024-03-09_02:08:09
  host      : ubuntu
  rank      : 1 (local_rank: 1)
  exitcode  : 2 (pid: 44360)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
  time      : 2024-03-09_02:08:09
  host      : ubuntu
  rank      : 2 (local_rank: 2)
  exitcode  : 2 (pid: 44361)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-03-09_02:08:09
  host      : ubuntu
  rank      : 0 (local_rank: 0)
  exitcode  : 2 (pid: 44359)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
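All three ranks in this log fail identically before any training starts: the interpreter cannot open realign/train.py (exit code 2), and the script in fact lives at train/train.py, as the traceback in the next log shows. A preflight check along these lines, purely illustrative and not part of the repository, would surface the bad path before torchrun spawns workers:

import os

# path copied from the error lines above; this check is hypothetical, not repo code
ENTRY_SCRIPT = "/home/tushilong/code/realign/realign/train.py"

if not os.path.isfile(ENTRY_SCRIPT):
    raise FileNotFoundError(f"torchrun entry script is missing: {ENTRY_SCRIPT}")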
136  train/run_log.txt  (new file)
@@ -0,0 +1,136 @@
WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
Training model with params:
base_model: /home/tushilong/hf/models/Llama-2-7b-hf
output_dir: ../ckpts/stylish
micro_batch_size: 2
gradient_accumulation_steps: 1
train_batch_size: 2
gradient_checkpointing: True
num_epochs: 1
learning_rate: 2e-05
weight_decay: 0.0001
warmup_ratio: 0.06
deepspeed_config: None
fsdp: shard_grad_op auto_wrap offload
fsdp_config: ./configs/fsdp/llama2_fsdp_config.json
smart_embedding: False
wandb_project:
wandb_run_name:
wandb_watch:
wandb_log_model:
resume_from_checkpoint: False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:  50%|█████     | 1/2 [00:03<00:03, 3.52s/it]
Loading checkpoint shards:  50%|█████     | 1/2 [00:03<00:03, 3.51s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.20s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.40s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.20s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.39s/it]
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:  50%|█████     | 1/2 [00:03<00:03, 3.45s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.17s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.36s/it]
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
StateDictType.FULL_STATE_DICT FullStateDictConfig(offload_to_cpu=False, rank0_only=False)
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
StateDictType.FULL_STATE_DICT FullStateDictConfig(offload_to_cpu=False, rank0_only=False)
  0%|          | 0/167 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
StateDictType.FULL_STATE_DICT FullStateDictConfig(offload_to_cpu=False, rank0_only=False)
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00, 4.90it/s]
Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00, 4.20it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 6.97it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 6.55it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 6.33it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 5.88it/s]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Loading checkpoint shards:  50%|█████     | 1/2 [00:14<00:14, 14.71s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:19<00:00,  8.65s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:19<00:00,  9.55s/it]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Traceback (most recent call last):
  File "/home/tushilong/code/realign/train/train.py", line 167, in <module>
    fire.Fire(train)
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/fire/core.py", line 141, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/fire/core.py", line 475, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
                                ^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
                ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/code/realign/train/train.py", line 160, in train
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 1555, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 1837, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 2682, in training_step
    loss = self.compute_loss(model, inputs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 2707, in compute_loss
    outputs = model(**inputs)
              ^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 659, in forward
    return model_forward(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 647, in __call__
    return convert_to_fp32(self.model_forward(*args, **kwargs))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 659, in forward
    return model_forward(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 647, in __call__
    return convert_to_fp32(self.model_forward(*args, **kwargs))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 748, in forward
    output = self._fsdp_wrapped_module(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/code/realign/llama/rellama.py", line 129, in forward
    assert torch.isnan(target_logits).sum() == 0, f"target_logits has nan: {torch.isnan(target_logits).sum()}"
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: target_logits has nan: 10752000
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 20205 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 20206 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 2 (pid: 20207) of binary: /home/tushilong/anaconda3/envs/realign/bin/python
Traceback (most recent call last):
  File "/home/tushilong/anaconda3/envs/realign/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==2.0.1', 'console_scripts', 'torchrun')())
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-03-09_10:45:34
  host      : ubuntu
  rank      : 2 (local_rank: 2)
  exitcode  : 1 (pid: 20207)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
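This run gets as far as the first training step and is then killed by the assertion at llama/rellama.py:129. It is reproduced below as a standalone guard; only the assert line itself comes from the traceback, the wrapping function is illustrative.

import torch


def check_reference_logits(target_logits: torch.Tensor) -> None:
    # exact assertion from the traceback above; the enclosing function is hypothetical
    assert torch.isnan(target_logits).sum() == 0, f"target_logits has nan: {torch.isnan(target_logits).sum()}"

The reported count is suggestive: 10752000 / (2 × 32000) = 168, so with micro_batch_size 2 and the 32000-token Llama-2 vocabulary the message is consistent with every element of a 168-token batch of logits being NaN rather than a few stray positions. The call reaches the assert through accelerate's autocast wrappers and convert_to_fp32, which makes a mixed-precision issue in the reference forward pass a plausible, though unconfirmed, cause.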