import os
import sys

import transformers
from transformers.training_args import *  # noqa: F401,F403 -- brings in logger, warnings, IntervalStrategy, OptimizerNames, etc. used below

from .utils import ExtendedFSDPOption


@dataclass
class FSDPTrainingArguments(transformers.TrainingArguments):
    # data-efficient sampler options
    use_ffd_sampler: bool = False
    model_avg_context: int = 2048

    # saving options
    # a checkpoint that is not saved with FSDP must not be loaded with FSDP
    save_with_fsdp: bool = False

    def __post_init__(self):
        # expand paths; otherwise os.makedirs("~/bar") will make a directory in the
        # current working directory instead of the actual home
        # see https://github.com/huggingface/transformers/issues/10628
        if self.output_dir is not None:
            self.output_dir = os.path.expanduser(self.output_dir)
        if self.logging_dir is None and self.output_dir is not None:
            self.logging_dir = os.path.join(self.output_dir, default_logdir())
        if self.logging_dir is not None:
            self.logging_dir = os.path.expanduser(self.logging_dir)

        if self.disable_tqdm is None:
            self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN

        if isinstance(self.evaluation_strategy, EvaluationStrategy):
            warnings.warn(
                "using `EvaluationStrategy` for `evaluation_strategy` is deprecated and will be removed in version 5"
                " of 🤗 Transformers. Use `IntervalStrategy` instead",
                FutureWarning,
            )
            # Go back to the underlying string or we won't be able to instantiate `IntervalStrategy` on it.
            self.evaluation_strategy = self.evaluation_strategy.value

        # if self.xpu_backend is not None:
        #     warnings.warn(
        #         "using `xpu_backend` is deprecated and will be removed in version 4.31"
        #         " of 🤗 Transformers. Use `ddp_backend` instead",
        #         FutureWarning,
        #     )
        #     self.ddp_backend = self.xpu_backend

        self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy)
        self.logging_strategy = IntervalStrategy(self.logging_strategy)
        self.save_strategy = IntervalStrategy(self.save_strategy)
        self.hub_strategy = HubStrategy(self.hub_strategy)

        self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type)
        if self.do_eval is False and self.evaluation_strategy != IntervalStrategy.NO:
            self.do_eval = True

        # eval_steps has to be defined and non-zero; falls back to logging_steps if the latter is non-zero
        if self.evaluation_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0):
            if self.logging_steps > 0:
                logger.info(f"using `logging_steps` to initialize `eval_steps` to {self.logging_steps}")
                self.eval_steps = self.logging_steps
            else:
                raise ValueError(
                    f"evaluation strategy {self.evaluation_strategy} requires either non-zero --eval_steps or"
                    " --logging_steps"
                )

        # logging_steps must be non-zero for any logging_strategy other than 'no'
        if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0:
            raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps")

        if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps > 1:
            if self.logging_steps != int(self.logging_steps):
                raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}")
            self.logging_steps = int(self.logging_steps)
        if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1:
            if self.eval_steps != int(self.eval_steps):
                raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}")
            self.eval_steps = int(self.eval_steps)
        if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1:
            if self.save_steps != int(self.save_steps):
                raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}")
            self.save_steps = int(self.save_steps)

        # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible.
        if self.load_best_model_at_end:
            if self.evaluation_strategy != self.save_strategy:
                raise ValueError(
                    "--load_best_model_at_end requires the save and eval strategy to match, but found\n- Evaluation "
                    f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}"
                )
            if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
                if self.eval_steps < 1 or self.save_steps < 1:
                    if not (self.eval_steps < 1 and self.save_steps < 1):
                        raise ValueError(
                            "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation "
                            "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps "
                            f"{self.save_steps} and eval_steps {self.eval_steps}."
                        )
                    # Work around floating point precision issues
                    LARGE_MULTIPLIER = 1_000_000
                    if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0:
                        raise ValueError(
                            "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation "
                            f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}."
                        )
                raise ValueError(
                    "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation "
                    f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."
                )

        safetensors_available = is_safetensors_available()
        if self.save_safetensors and not safetensors_available:
            raise ValueError(f"--save_safetensors={self.save_safetensors} requires safetensors to be installed!")
        if not self.save_safetensors and safetensors_available:
            logger.info(
                f"Found safetensors installation, but --save_safetensors={self.save_safetensors}. "
                "Safetensors should be a preferred weights saving format due to security and performance reasons. "
                "If your model cannot be saved by safetensors please feel free to open an issue at "
                "https://github.com/huggingface/safetensors!"
            )

        if (
            self.load_best_model_at_end or self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU
        ) and self.metric_for_best_model is None:
            self.metric_for_best_model = "loss"
        if self.greater_is_better is None and self.metric_for_best_model is not None:
            self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"]
        if self.run_name is None:
            self.run_name = self.output_dir
        if self.framework == "pt" and is_torch_available():
            if self.fp16_backend and self.fp16_backend != "auto":
                warnings.warn(
                    "`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use"
                    " `half_precision_backend` instead",
                    FutureWarning,
                )
                self.half_precision_backend = self.fp16_backend

            if self.bf16 or self.bf16_full_eval:
                if self.no_cuda and not is_torch_bf16_cpu_available() and not is_torch_tpu_available():
                    # cpu
                    raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). You need torch>=1.10")
                elif not self.no_cuda and torch.cuda.is_available() and not is_torch_bf16_gpu_available():
                    # gpu
                    raise ValueError(
                        "Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0"
                    )

        if self.fp16 and self.bf16:
            raise ValueError("At most one of fp16 and bf16 can be True, but not both")

        if self.fp16_full_eval and self.bf16_full_eval:
            raise ValueError("At most one of fp16 and bf16 can be True for full eval, but not both")

        if self.bf16:
            if self.half_precision_backend == "apex":
                raise ValueError(
                    " `--half_precision_backend apex`: GPU bf16 is not supported by apex. Use"
                    " `--half_precision_backend cuda_amp` instead"
                )
            if not (self.sharded_ddp == "" or not self.sharded_ddp):
                raise ValueError("sharded_ddp is not supported with bf16")

        if self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU:
            if self.evaluation_strategy == IntervalStrategy.NO:
                raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires an eval strategy")
            if not is_torch_available():
                raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires torch>=0.2.0")

        self.optim = OptimizerNames(self.optim)
        if self.adafactor:
            warnings.warn(
                "`--adafactor` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--optim"
                " adafactor` instead",
                FutureWarning,
            )
            self.optim = OptimizerNames.ADAFACTOR
        if self.optim == OptimizerNames.ADAMW_TORCH_FUSED and is_torch_available():
            if version.parse(version.parse(torch.__version__).base_version) < version.parse("2.0.0"):
                raise ValueError("--optim adamw_torch_fused requires PyTorch 2.0 or higher")
            # there is a bug in fp16/AMP in pt-2.0.0
            if version.parse(version.parse(torch.__version__).base_version) == version.parse("2.0.0") and self.fp16:
                raise ValueError("--optim adamw_torch_fused with --fp16 requires PyTorch>2.0")

        if (
            self.framework == "pt"
            and is_torch_available()
            and (self.device.type != "cuda")
            and (get_xla_device_type(self.device) != "GPU")
            and (self.fp16 or self.fp16_full_eval)
        ):
            raise ValueError(
                "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
                " (`--fp16_full_eval`) can only be used on CUDA devices."
            )

        if (
            self.framework == "pt"
            and is_torch_available()
            and (self.device.type != "cuda")
            and (get_xla_device_type(self.device) != "GPU")
            and (get_xla_device_type(self.device) != "TPU")
            and (self.device.type != "cpu")
            and (self.bf16 or self.bf16_full_eval)
        ):
            raise ValueError(
                "BF16 Mixed precision training with AMP (`--bf16`) and BF16 half precision evaluation"
                " (`--bf16_full_eval`) can only be used on CUDA or CPU/TPU/NeuronCore devices."
            )

        if self.torchdynamo is not None:
            warnings.warn(
                "`torchdynamo` is deprecated and will be removed in version 5 of 🤗 Transformers. Use"
                " `torch_compile_backend` instead",
                FutureWarning,
            )
            self.torch_compile_backend = self.torchdynamo
        if (self.torch_compile_mode is not None or self.torch_compile_backend is not None) and not self.torch_compile:
            self.torch_compile = True
        if self.torch_compile and self.torch_compile_backend is None:
            self.torch_compile_backend = "inductor"

        # accelerate integration for torch compile
        if self.torch_compile:
            # set env vars for accelerate
            prefix = "ACCELERATE_DYNAMO_"
            os.environ[prefix + "BACKEND"] = self.torch_compile_backend
            if self.torch_compile_mode is not None:
                os.environ[prefix + "MODE"] = self.torch_compile_mode

        if self.framework == "pt" and is_torch_available() and self.torch_compile:
            if is_torch_tf32_available():
                if self.tf32 is None and not self.fp16 or self.bf16:
                    logger.info(
                        "Setting TF32 in CUDA backends to speed up torch compile, you won't see any improvement"
                        " otherwise."
                    )
                    torch.backends.cuda.matmul.allow_tf32 = True
            else:
                logger.warning(
                    "The speedups for torchdynamo mostly come with GPU Ampere or higher, which was not detected here."
                )
        if self.framework == "pt" and is_torch_available() and self.tf32 is not None:
            if self.tf32:
                if is_torch_tf32_available():
                    torch.backends.cuda.matmul.allow_tf32 = True
                else:
                    raise ValueError("--tf32 requires Ampere or a newer GPU arch, cuda>=11 and torch>=1.7")
            else:
                if is_torch_tf32_available():
                    torch.backends.cuda.matmul.allow_tf32 = False
                # no need to assert on else

        if self.report_to is None:
            logger.info(
                "The default value for the training argument `--report_to` will change in v5 (from all installed "
                "integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as "
                "now. You should start updating your code and make this info disappear :-)."
            )
            self.report_to = "all"
        if self.report_to == "all" or self.report_to == ["all"]:
            # Import at runtime to avoid a circular import.
            from transformers.integrations import get_available_reporting_integrations

            self.report_to = get_available_reporting_integrations()
        elif self.report_to == "none" or self.report_to == ["none"]:
            self.report_to = []
        elif not isinstance(self.report_to, list):
            self.report_to = [self.report_to]

        if self.warmup_ratio < 0 or self.warmup_ratio > 1:
            raise ValueError("warmup_ratio must lie in range [0,1]")
        elif self.warmup_ratio > 0 and self.warmup_steps > 0:
            logger.info(
                "Both warmup_ratio and warmup_steps given, warmup_steps will override any effect of warmup_ratio"
                " during training"
            )

        if not (self.sharded_ddp == "" or not self.sharded_ddp):
            warnings.warn(
                "using `sharded_ddp` is deprecated and will be removed in version 4.33"
                " of 🤗 Transformers. Use `fsdp` instead",
                FutureWarning,
            )
        if isinstance(self.sharded_ddp, bool):
            self.sharded_ddp = "simple" if self.sharded_ddp else ""
        if isinstance(self.sharded_ddp, str):
            self.sharded_ddp = [ShardedDDPOption(s) for s in self.sharded_ddp.split()]
        if self.sharded_ddp == [ShardedDDPOption.OFFLOAD]:
            raise ValueError(
                "`--sharded_ddp offload` can't work on its own. It needs to be added to `--sharded_ddp zero_dp_2` or "
                '`--sharded_ddp zero_dp_3`. For example, `--sharded_ddp "zero_dp_2 offload"`.'
            )
        elif len(self.sharded_ddp) > 1 and ShardedDDPOption.SIMPLE in self.sharded_ddp:
            raise ValueError("`--sharded_ddp simple` is not compatible with any other option.")
        elif ShardedDDPOption.ZERO_DP_2 in self.sharded_ddp and ShardedDDPOption.ZERO_DP_3 in self.sharded_ddp:
            raise ValueError("`--sharded_ddp zero_dp_2` is not compatible with `--sharded_ddp zero_dp_3`.")

        if isinstance(self.fsdp, bool):
            self.fsdp = "full_shard" if self.fsdp else ""
        if isinstance(self.fsdp, str):
            self.fsdp = [ExtendedFSDPOption(s) for s in self.fsdp.split()]
        if self.fsdp == [ExtendedFSDPOption.OFFLOAD]:
            raise ValueError(
                "`--fsdp offload` can't work on its own. It needs to be added to `--fsdp full_shard` or "
                '`--fsdp shard_grad_op`. For example, `--fsdp "full_shard offload"`.'
            )
        elif ExtendedFSDPOption.FULL_SHARD in self.fsdp and ExtendedFSDPOption.SHARD_GRAD_OP in self.fsdp:
            raise ValueError("`--fsdp full_shard` is not compatible with `--fsdp shard_grad_op`.")

        if self.fsdp_config is None:
            self.fsdp_config = {}

        if isinstance(self.fsdp_config, str):
            with io.open(self.fsdp_config, "r", encoding="utf-8") as f:
                self.fsdp_config = json.load(f)

        if self.fsdp_min_num_params > 0:
            warnings.warn("using `--fsdp_min_num_params` is deprecated. Use fsdp_config instead ", FutureWarning)

        self.fsdp_config["fsdp_min_num_params"] = max(
            self.fsdp_config.get("fsdp_min_num_params", 0), self.fsdp_min_num_params
        )

        # if fsdp_config["fsdp_transformer_layer_cls_to_wrap"] is specified as a string, convert it to a list with a single object
        if isinstance(self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None), str):
            self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = [
                self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"]
            ]

        if self.fsdp_transformer_layer_cls_to_wrap is not None:
            warnings.warn(
                "using `--fsdp_transformer_layer_cls_to_wrap` is deprecated. Use fsdp_config instead ", FutureWarning
            )
            self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = self.fsdp_config.get(
                "fsdp_transformer_layer_cls_to_wrap", []
            ) + [self.fsdp_transformer_layer_cls_to_wrap]

        if len(self.fsdp) == 0 and self.fsdp_config["fsdp_min_num_params"] > 0:
            warnings.warn("`--fsdp_min_num_params` is useful only when `--fsdp` is specified.")

        if len(self.fsdp) == 0 and self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None:
            warnings.warn("`--fsdp_transformer_layer_cls_to_wrap` is useful only when `--fsdp` is specified.")

        if (
            len(self.fsdp) > 0
            and self.fsdp_config["fsdp_min_num_params"] > 0
            and self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None
        ):
            raise ValueError(
                "`--fsdp_min_num_params` and `--fsdp_transformer_layer_cls_to_wrap` are mutually exclusive."
            )
        self.fsdp_config["xla"] = self.fsdp_config.get("xla", False)
        self.fsdp_config["xla_fsdp_grad_ckpt"] = self.fsdp_config.get("xla_fsdp_grad_ckpt", False)
        if self.fsdp_config["xla"]:
            if len(self.fsdp) > 0:
                # store XLA fsdp configuration parameters into a dictionary
                self.xla_fsdp_config = self.fsdp_config.get("xla_fsdp_settings", {})
                # apply appropriate string to torch.dtype conversions for parameters
                if "compute_dtype" in self.xla_fsdp_config:
                    self.xla_fsdp_config["compute_dtype"] = getattr(torch, self.xla_fsdp_config["compute_dtype"])
                if "buffer_dtype" in self.xla_fsdp_config:
                    self.xla_fsdp_config["buffer_dtype"] = getattr(torch, self.xla_fsdp_config["buffer_dtype"])
            else:
                warnings.warn("XLA FSDP can be used only when `--fsdp` is specified.")
        else:
            if self.fsdp_config["xla_fsdp_grad_ckpt"]:
                warnings.warn("`--xla_fsdp_grad_ckpt` is useful only when `--xla` is set to true.")

        # accelerate integration for FSDP
        if len(self.fsdp) > 0 and not self.fsdp_config["xla"]:
            os.environ["ACCELERATE_USE_FSDP"] = "true"
            from accelerate.utils.constants import (
                FSDP_AUTO_WRAP_POLICY,
                FSDP_SHARDING_STRATEGY,
            )

            for fsdp_option in self.fsdp:
                if fsdp_option.upper() in FSDP_SHARDING_STRATEGY:
                    # set environment variable for FSDP sharding strategy
                    os.environ["FSDP_SHARDING_STRATEGY"] = str(FSDP_SHARDING_STRATEGY.index(fsdp_option.upper()) + 1)
                elif fsdp_option == FSDPOption.OFFLOAD:
                    os.environ["FSDP_OFFLOAD_PARAMS"] = "true"
                elif fsdp_option == FSDPOption.AUTO_WRAP:
                    if self.fsdp_config["fsdp_min_num_params"] > 0:
                        os.environ["FSDP_MIN_NUM_PARAMS"] = str(self.fsdp_config["fsdp_min_num_params"])
                        os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[1]
                    elif self.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None:
                        os.environ["FSDP_TRANSFORMER_CLS_TO_WRAP"] = ",".join(
                            self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"]
                        )
                        os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[0]
            prefetch_policy = self.fsdp_config.get("fsdp_backward_prefetch", "NO_PREFETCH")
            os.environ["FSDP_BACKWARD_PREFETCH"] = prefetch_policy.upper()

        if self.tpu_metrics_debug:
            warnings.warn(
                "using `--tpu_metrics_debug` is deprecated and will be removed in version 5 of 🤗 Transformers. Use"
                " `--debug tpu_metrics_debug` instead",
                FutureWarning,
            )
            if self.debug is None:
                self.debug = " tpu_metrics_debug"
            else:
                self.debug += " tpu_metrics_debug"
            self.tpu_metrics_debug = False

        if isinstance(self.debug, str):
            self.debug = [DebugOption(s) for s in self.debug.split()]
        elif self.debug is None:
            self.debug = []

        self.deepspeed_plugin = None
        if self.deepspeed:
            # - must be run very last in arg parsing, since it will use a lot of these settings.
            # - must be run before the model is created.
            if not is_accelerate_available():
                raise ValueError("--deepspeed requires Accelerate to be installed: `pip install accelerate`.")
            from transformers.deepspeed import HfTrainerDeepSpeedConfig

            # will be used later by the Trainer
            # note: leave self.deepspeed unmodified in case a user relies on it not being modified
            self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed)
            self.hf_deepspeed_config.trainer_config_process(self)

            # Accelerate DeepSpeed Plugin
            from accelerate.utils import DeepSpeedPlugin

            os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
            self.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.hf_deepspeed_config)

        if self.push_to_hub_token is not None:
            warnings.warn(
                "`--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use "
                "`--hub_token` instead.",
                FutureWarning,
            )
            self.hub_token = self.push_to_hub_token

        if self.push_to_hub_model_id is not None:
            self.hub_model_id = get_full_repo_name(
                self.push_to_hub_model_id, organization=self.push_to_hub_organization, token=self.hub_token
            )
            if self.push_to_hub_organization is not None:
                warnings.warn(
                    "`--push_to_hub_model_id` and `--push_to_hub_organization` are deprecated and will be removed in "
                    "version 5 of 🤗 Transformers. Use `--hub_model_id` instead and pass the full repo name to this "
                    f"argument (in this case {self.hub_model_id}).",
                    FutureWarning,
                )
            else:
                warnings.warn(
                    "`--push_to_hub_model_id` is deprecated and will be removed in version 5 of 🤗 Transformers. Use "
                    "`--hub_model_id` instead and pass the full repo name to this argument (in this case "
                    f"{self.hub_model_id}).",
                    FutureWarning,
                )
        elif self.push_to_hub_organization is not None:
            self.hub_model_id = f"{self.push_to_hub_organization}/{Path(self.output_dir).name}"
            warnings.warn(
                "`--push_to_hub_organization` is deprecated and will be removed in version 5 of 🤗 Transformers. Use "
                "`--hub_model_id` instead and pass the full repo name to this argument (in this case "
                f"{self.hub_model_id}).",
                FutureWarning,
            )

        # if training args are specified, they will override the values in the accelerate config
        if self.half_precision_backend != "apex" and len(self.sharded_ddp) == 0:
            mixed_precision_dtype = os.environ.get("ACCELERATE_MIXED_PRECISION", "no")
            if self.fp16:
                mixed_precision_dtype = "fp16"
            elif self.bf16:
                mixed_precision_dtype = "bf16"
            os.environ["ACCELERATE_MIXED_PRECISION"] = mixed_precision_dtype