Compare commits

3 Commits

bc182d09e0...8fca0603f9
| Author | SHA1 | Date |
|---|---|---|
| | 8fca0603f9 | |
| | 716155f5ba | |
| | 104f521f79 | |

.gitignore (vendored, 1 addition)
@@ -3,3 +3,4 @@ ckpts/
 data/
 outputs/
 .vscode/
+*/run_log.txt

@@ -133,8 +133,10 @@ class Method_1(ReLlamaForCausalLM):
             for i in range(predict_logits.size(0)):
                 # iterate over the batch
 
+                # token [1] is the start of response (bos token)
                 start_idx = torch.where(labels[i] == 1)[0].item()
 
+                # if [-100] in response, we should calculate kl_div loss for that position
                 maintain_position: List[int] = []
                 for idx in range(start_idx, labels[i].size(0)):
                     if labels[i][idx] == -100:
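The two added comments describe the intent of this block: token id 1 marks the start of the response, and response positions whose label is -100 are collected in `maintain_position` so that a KL-divergence term can still be computed for them against the reference (`target_logits`) distribution. The sketch below is only an illustration of that idea under stated assumptions: the helper name `masked_kl_div`, the tensor shapes, and the loss reduction are guesses; only the index bookkeeping mirrors the diff above.

```python
import torch
import torch.nn.functional as F
from typing import List


def masked_kl_div(predict_logits: torch.Tensor,
                  target_logits: torch.Tensor,
                  labels: torch.Tensor) -> torch.Tensor:
    """Sketch: KL divergence restricted to response positions labeled -100.

    Assumes (batch, seq, vocab) logits and (batch, seq) labels, with token id 1
    marking the start of the response, as in the loop shown in the diff.
    """
    losses = []
    for i in range(predict_logits.size(0)):
        # token [1] is the start of response (bos token)
        start_idx = torch.where(labels[i] == 1)[0].item()

        # if [-100] in response, we should calculate kl_div loss for that position
        maintain_position: List[int] = [
            idx for idx in range(start_idx, labels[i].size(0))
            if labels[i][idx] == -100
        ]
        if not maintain_position:
            continue

        pos = torch.tensor(maintain_position, device=predict_logits.device)
        log_p = F.log_softmax(predict_logits[i, pos], dim=-1)   # student log-probs
        q = F.softmax(target_logits[i, pos], dim=-1)            # reference probs
        losses.append(F.kl_div(log_p, q, reduction="batchmean"))

    return torch.stack(losses).mean() if losses else predict_logits.new_zeros(())
```

Note that the deleted run_log.txt further below shows this forward path can still fail earlier when `target_logits` contains NaNs, which is what the assert at rellama.py line 129 guards against.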

@@ -1,53 +0,0 @@
-WARNING:torch.distributed.run:
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-*****************************************
-/home/tushilong/anaconda3/envs/realign/bin/python: can't open file '/home/tushilong/code/realign/realign/train.py': [Errno 2] No such file or directory
-/home/tushilong/anaconda3/envs/realign/bin/python: can't open file '/home/tushilong/code/realign/realign/train.py': [Errno 2] No such file or directory
-/home/tushilong/anaconda3/envs/realign/bin/python: can't open file '/home/tushilong/code/realign/realign/train.py': [Errno 2] No such file or directory
-ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 44359) of binary: /home/tushilong/anaconda3/envs/realign/bin/python
-Traceback (most recent call last):
-  File "/home/tushilong/anaconda3/envs/realign/bin/torchrun", line 33, in <module>
-    sys.exit(load_entry_point('torch==2.0.1', 'console_scripts', 'torchrun')())
-             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
-    return f(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/run.py", line 794, in main
-    run(args)
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/run.py", line 785, in run
-    elastic_launch(
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
-============================================================
-train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time      : 2024-03-09_02:08:09
-  host      : ubuntu
-  rank      : 1 (local_rank: 1)
-  exitcode  : 2 (pid: 44360)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[2]:
-  time      : 2024-03-09_02:08:09
-  host      : ubuntu
-  rank      : 2 (local_rank: 2)
-  exitcode  : 2 (pid: 44361)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time      : 2024-03-09_02:08:09
-  host      : ubuntu
-  rank      : 0 (local_rank: 0)
-  exitcode  : 2 (pid: 44359)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================

@@ -1,136 +0,0 @@
-WARNING:torch.distributed.run:
-*****************************************
-Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-*****************************************
-Training model with params:
-base_model: /home/tushilong/hf/models/Llama-2-7b-hf
-output_dir: ../ckpts/stylish
-micro_batch_size: 2
-gradient_accumulation_steps: 1
-train_batch_size: 2
-gradient_checkpointing: True
-num_epochs: 1
-learning_rate: 2e-05
-weight_decay: 0.0001
-warmup_ratio: 0.06
-deepspeed_config: None
-fsdp: shard_grad_op auto_wrap offload
-fsdp_config: ./configs/fsdp/llama2_fsdp_config.json
-smart_embedding: False
-wandb_project: 
-wandb_run_name: 
-wandb_watch: 
-wandb_log_model: 
-resume_from_checkpoint: False
-
-
-Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
-Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
-Loading checkpoint shards:  50%|█████     | 1/2 [00:03<00:03,  3.52s/it]
-Loading checkpoint shards:  50%|█████     | 1/2 [00:03<00:03,  3.51s/it]
-Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.20s/it]
-Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.40s/it]
-
-Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.20s/it]
-Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.39s/it]
-
-Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
-Loading checkpoint shards:  50%|█████     | 1/2 [00:03<00:03,  3.45s/it]
-Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.17s/it]
-Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.36s/it]
-Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
-StateDictType.FULL_STATE_DICT FullStateDictConfig(offload_to_cpu=False, rank0_only=False)
-You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
-StateDictType.FULL_STATE_DICT FullStateDictConfig(offload_to_cpu=False, rank0_only=False)
-
-  0%|          | 0/167 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
-StateDictType.FULL_STATE_DICT FullStateDictConfig(offload_to_cpu=False, rank0_only=False)
-You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
-
-Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
-
-Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A
-Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
-Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  4.90it/s]
-Loading checkpoint shards:  50%|█████     | 1/2 [00:00<00:00,  4.20it/s]
-Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  6.97it/s]
-Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  6.55it/s]
-
-Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  6.33it/s]
-Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  5.88it/s]
-`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
-`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
-
-
-Loading checkpoint shards:  50%|█████     | 1/2 [00:14<00:14, 14.71s/it][A
-
-Loading checkpoint shards: 100%|██████████| 2/2 [00:19<00:00,  8.65s/it][A
-Loading checkpoint shards: 100%|██████████| 2/2 [00:19<00:00,  9.55s/it]
-`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
-Traceback (most recent call last):
-  File "/home/tushilong/code/realign/train/train.py", line 167, in <module>
-    fire.Fire(train)
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/fire/core.py", line 141, in Fire
-    component_trace = _Fire(component, args, parsed_flag_args, context, name)
-                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/fire/core.py", line 475, in _Fire
-    component, remaining_args = _CallAndUpdateTrace(
-                                ^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
-    component = fn(*varargs, **kwargs)
-                ^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/code/realign/train/train.py", line 160, in train
-    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 1555, in train
-    return inner_training_loop(
-           ^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 1837, in _inner_training_loop
-    tr_loss_step = self.training_step(model, inputs)
-                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 2682, in training_step
-    loss = self.compute_loss(model, inputs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/transformers/trainer.py", line 2707, in compute_loss
-    outputs = model(**inputs)
-              ^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
-    return forward_call(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 659, in forward
-    return model_forward(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 647, in __call__
-    return convert_to_fp32(self.model_forward(*args, **kwargs))
-                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
-    return func(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 659, in forward
-    return model_forward(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/accelerate/utils/operations.py", line 647, in __call__
-    return convert_to_fp32(self.model_forward(*args, **kwargs))
-                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
-    return func(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 748, in forward
-    output = self._fsdp_wrapped_module(*args, **kwargs)
-             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
-    return forward_call(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/code/realign/llama/rellama.py", line 129, in forward
-    assert torch.isnan(target_logits).sum() == 0, f"target_logits has nan: {torch.isnan(target_logits).sum()}"
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-AssertionError: target_logits has nan: 10752000
-WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 20205 closing signal SIGTERM
-WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 20206 closing signal SIGTERM
-ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 2 (pid: 20207) of binary: /home/tushilong/anaconda3/envs/realign/bin/python
-Traceback (most recent call last):
-  File "/home/tushilong/anaconda3/envs/realign/bin/torchrun", line 33, in <module>
-    sys.exit(load_entry_point('torch==2.0.1', 'console_scripts', 'torchrun')())
-             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/home/tushilong/anaconda3/envs/realign/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
-    return f(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^
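The "Training model with params" block in this deleted log, together with the traceback's `fire.Fire(train)` entry point, suggests the hyperparameters are plain keyword arguments of a `train(...)` function. The following is a hypothetical reconstruction for illustration only: the parameter names and values are copied from the log above, but the signature, defaults, and body are assumptions, not the project's actual train.py.

```python
# Hypothetical sketch of the train.py entry point implied by the deleted log.
# Parameter names/values come from the "Training model with params" block;
# everything else (defaults, body) is assumed for illustration.
import fire


def train(
    base_model: str = "/home/tushilong/hf/models/Llama-2-7b-hf",
    output_dir: str = "../ckpts/stylish",
    micro_batch_size: int = 2,
    gradient_accumulation_steps: int = 1,
    gradient_checkpointing: bool = True,
    num_epochs: int = 1,
    learning_rate: float = 2e-5,
    weight_decay: float = 1e-4,
    warmup_ratio: float = 0.06,
    deepspeed_config: str | None = None,
    fsdp: str = "shard_grad_op auto_wrap offload",
    fsdp_config: str = "./configs/fsdp/llama2_fsdp_config.json",
    smart_embedding: bool = False,
    wandb_project: str = "",
    wandb_run_name: str = "",
    wandb_watch: str = "",
    wandb_log_model: str = "",
    resume_from_checkpoint: bool = False,
):
    # train_batch_size in the log looks derived: micro_batch_size * gradient_accumulation_steps
    train_batch_size = micro_batch_size * gradient_accumulation_steps
    print("Training model with params:")
    for name, value in dict(locals()).items():
        print(f"{name}: {value}")
    # ... model/tokenizer loading, FSDP setup, and finally
    # trainer.train(resume_from_checkpoint=resume_from_checkpoint), per the traceback.


if __name__ == "__main__":
    fire.Fire(train)
```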