
    I am using this config:

    # Your YAML string yaml_string = """ base_model: meta-llama/Meta-Llama-3-8B model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer tokenizer_use_fast: false load_in_8bit: false load_in_4bit: true adapter: lora strict: false model_config: datasets: - path: /content/jokes_dataset_cleaned.jsonl type: system_prompt: "" field_system: system_prompt field_instruction: setup field_output: punchline format: "[INST] {instruction} [/INST]" no_input_format: "[INST] {instruction} [/INST]" conversation: chatml chat_template: chatml dataset_prepared_path: val_set_size: 0.0002 output_dir: ./out sequence_len: 4096 sample_packing: true pad_to_sequence_len: true gradient_accumulation_steps: 4 micro_batch_size: 3 num_epochs: 3 logging_steps: 1 optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 2e-5 wandb_project: wandb_watch: wandb_run_id: wandb_log_model: train_on_inputs: false group_by_length: false bf16: auto fp16: tf32: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_modules: lora_target_linear: true lora_fan_in_fan_out: gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 xformers_attention: flash_attention: true saves_per_epoch: 4 save_total_limit: 2 save_steps: evals_per_epoch: 4 eval_sample_packing: false debug: weight_decay: 0.05 fsdp: fsdp_config: special_tokens: eos_token: "<|im_end|>" pad_token: "<|end_of_text|>" tokens: - "<|im_start|>" - "<|im_end|>" """ # Convert the YAML string to a Python dictionary yaml_dict = yaml.safe_load(yaml_string)

    I get this error: Traceback (most recent call last): File "/usr/lib/python3.10/", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/", line 86, in _run_code exec(code, run_globals) File "/content/src/axolotl/src/axolotl/cli/", line 59, in <module> fire.Fire(do_cli) File "/usr/local/lib/python3.10/dist-packages/fire/", line 143, in Fire component_trace = _Fire(component, args, parsed_flag_args, context, name) File "/usr/local/lib/python3.10/dist-packages/fire/", line 477, in _Fire component, remaining_args = _CallAndUpdateTrace( File "/usr/local/lib/python3.10/dist-packages/fire/", line 693, in _CallAndUpdateTrace component = fn(*varargs, **kwargs) File "/content/src/axolotl/src/axolotl/cli/", line 30, in do_cli parsed_cfg = load_cfg(config, **kwargs) File "/content/src/axolotl/src/axolotl/cli/", line 385, in load_cfg normalize_config(cfg) File "/content/src/axolotl/src/axolotl/utils/config/", line 119, in normalize_config model_config = load_model_config(cfg) File "/content/src/axolotl/src/axolotl/utils/", line 146, in load_model_config check_model_config(cfg, model_config) File "/content/src/axolotl/src/axolotl/utils/", line 113, in check_model_config raise ValueError( ValueError: lora_modules_to_save not properly set when adding new tokens. Please include [embed_tokens, lm_head] in lora_modules_to_save. Traceback (most recent call last):

    The error message indicates that when you add new tokens to the tokenizer while training with a LoRA (Low-Rank Adaptation) adapter, you must list the modules that should be trained in full and saved alongside the adapter weights. Because new tokens enlarge the vocabulary, axolotl expects `lora_modules_to_save` to include both `embed_tokens` (the input embedding layer) and `lm_head` (the output projection); otherwise the rows belonging to the new tokens would not be trained or stored with the adapter, and the tokens could not be used properly during training and inference.

    To resolve this error, you need to update your configuration to include the lora_modules_to_save field and specify the required modules. Here's how you can modify your YAML configuration:

    base_model: meta-llama/Meta-Llama-3-8B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
tokenizer_use_fast: false
load_in_8bit: false
load_in_4bit: true
adapter: lora
strict: false
model_config:
datasets:
  - path: /content/jokes_dataset_cleaned.jsonl
    type:
    system_prompt: ""
    field_system: system_prompt
    field_instruction: setup
    field_output: punchline
    format: "[INST] {instruction} [/INST]"
    no_input_format: "[INST] {instruction} [/INST]"
conversation: chatml
chat_template: chatml
dataset_prepared_path:
val_set_size: 0.0002
output_dir: ./out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 4
micro_batch_size: 3
num_epochs: 3
logging_steps: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 2e-5
wandb_project:
wandb_watch:
wandb_run_id:
wandb_log_model:
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
saves_per_epoch: 4
save_total_limit: 2
save_steps:
evals_per_epoch: 4
eval_sample_packing: false
debug:
weight_decay: 0.05
fsdp:
fsdp_config:
special_tokens:
  eos_token: "<|im_end|>"
  pad_token: "<|end_of_text|>"
  tokens:
    - "<|im_start|>"
    - "<|im_end|>"
lora_modules_to_save:
  - embed_tokens
  - lm_head
    base_model: huggyllama/llama-7b model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer load_in_8bit: false datasets: - path: openaccess-ai-collective/jeopardy type: jeopardy dataset_prepared_path: val_set_size: 0.02 adapter: lora_model_dir: sequence_len: 512 max_packed_sequence_len: lora_r: lora_alpha: lora_dropout: lora_target_modules: lora_fan_in_fan_out: false wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./jeopardy-bot-7b gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.00003 train_on_inputs: false group_by_length: false bf16: auto tf32: true early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 5 xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: warmup_steps: 20 evals_per_epoch: 4 saves_per_epoch: 1 debug: deepspeed: weight_decay: 0.1 fsdp: fsdp_config: tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>"
    base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer load_in_8bit: true load_in_4bit: false strict: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false pad_to_sequence_len: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_fan_in_fan_out: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false group_by_length: false bf16: auto fp16: tf32: false gradient_checkpointing: true early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 xformers_attention: flash_attention: true warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 debug: deepspeed: weight_decay: 0.0 fsdp: fsdp_config: special_tokens:
    def load_tokenizer(cfg):
        """Build and configure the tokenizer described by ``cfg``.

        The tokenizer class is ``AutoTokenizer`` unless ``cfg.tokenizer_type``
        names another class exported by ``transformers``.  After loading, the
        function applies model-family fix-ups (pad token, padding side, Qwen
        special tokens), registers the special/extra tokens requested in the
        config and finally installs the chat template, if one is selected.

        Args:
            cfg: Config object read via attribute access (``tokenizer_config``,
                ``special_tokens``, ``tokens``, ``chat_template``, ...).

        Returns:
            The configured tokenizer instance.

        Raises:
            ValueError: If an adapter is being trained, a special token would
                be changed to a value the tokenizer does not already know, and
                ``cfg.lora_modules_to_save`` does not list every embedding
                layer returned by ``get_linear_embedding_layers``.
        """
        model_config = load_model_config(cfg)
        tokenizer_kwargs = {}
        use_fast = True  # this is the default

        if cfg.tokenizer_use_fast is not None:
            use_fast = cfg.tokenizer_use_fast
        if cfg.tokenizer_legacy is not None:
            # True is the default; only forward the flag when the config sets it
            tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy

        tokenizer_cls = AutoTokenizer
        if cfg.tokenizer_type:
            tokenizer_cls = getattr(transformers, cfg.tokenizer_type)

        tokenizer = tokenizer_cls.from_pretrained(
            cfg.tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
            use_fast=use_fast,
            **tokenizer_kwargs,
        )

        if (
            tokenizer.__class__.__name__
            in [
                "LlamaTokenizer",
                "LlamaTokenizerFast",
                "CodeLlamaTokenizer",
                "CodeLlamaTokenizerFast",
            ]
            and hasattr(tokenizer, "pad_token")
            and not tokenizer.pad_token
        ):
            # set a pad_token, but use eos_token so we don't add a new token
            tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN

        if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
            tokenizer.add_special_tokens({"pad_token": "[PAD]"})
            os.environ["TOKENIZERS_PARALLELISM"] = "false"

        # Mistral's official FA implementation requires left padding
        if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
            tokenizer.padding_side = "left"

        # Qwen base only has single token, so we need to set the special tokens
        if cfg.is_qwen_derived_model:
            token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"]
            for attr_name in token_ids:
                if getattr(tokenizer, attr_name) is None:
                    setattr(tokenizer, attr_name, tokenizer.eod_id)

            token_names = ["bos_token", "eos_token", "pad_token", "unk_token"]
            for attr_name in token_names:
                if getattr(tokenizer, attr_name) is None:
                    setattr(tokenizer, attr_name, "<|endoftext|>")

        additional_special_tokens = None
        if cfg.special_tokens:
            special_tokens = cfg.special_tokens.to_dict()
            # handled separately below because it is a list, not a single token
            additional_special_tokens = special_tokens.pop(
                "additional_special_tokens", None
            )
            lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
            for k, val in special_tokens.items():
                # check if new special token is not already in tokenizer and
                # is adapter training to make sure lora_modules_to_save is set
                # pylint: disable=too-many-boolean-expressions
                if (
                    (getattr(tokenizer, k) is None or getattr(tokenizer, k) != val)
                    and (len(tokenizer.encode(val, add_special_tokens=False)) > 2)
                    and cfg.adapter
                    and (
                        not cfg.lora_modules_to_save
                        or not all(
                            x in cfg.lora_modules_to_save for x in lora_modules_to_save
                        )
                    )
                ):
                    lora_modules_to_save = ", ".join(
                        [f"`{x}`" for x in lora_modules_to_save]
                    )
                    raise ValueError(
                        f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens."
                    )

                tokenizer.add_special_tokens(
                    {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)}
                )

            # If we add bos_token and eos_token, we need to update the post processor to
            # handle them correctly.
            bos_or_eos_in_special_tokens = (
                "bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens
            )
            if (
                tokenizer.__class__.__name__
                in (
                    "LlamaTokenizerFast",
                    "CodeLlamaTokenizerFast",
                )
                and bos_or_eos_in_special_tokens
            ):
                tokenizer.update_post_processor()

        if cfg.tokens:
            tokenizer.add_tokens(
                [
                    AddedToken(token, rstrip=False, lstrip=False, normalized=False)
                    for token in cfg.tokens
                ]
            )

        # Additional special tokens are a List, and need to be treated differently than regular special
        # tokens. We add them after we have called `add_tokens` in case these additional special tokens
        # are new tokens.
        #
        # Usage:
        #
        # ```py
        # special_tokens:
        #   additional_special_tokens: ["<|im_start|>", "<|im_end|>"]
        # ```
        if additional_special_tokens is not None:
            tokenizer.add_special_tokens(
                {"additional_special_tokens": additional_special_tokens}
            )

        with zero_only():
            LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
            LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
            LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
            LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")

        if cfg.chat_template:
            chat_template_string = chat_templates(cfg.chat_template)
            if cfg.default_system_message and cfg.chat_template == "chatml":
                # swap the template's stock system prompt for the configured one
                chat_template_string = chat_template_string.replace(
                    "You are a helpful assistant.", cfg.default_system_message
                )

            tokenizer.chat_template = chat_template_string
        else:
            LOG.info(
                "No Chat template selected. Consider adding a chat template for easier inference."
            )
        return tokenizer
    base_model: codellama/CodeLlama-34b-hf model_type: LlamaForCausalLM tokenizer_type: CodeLlamaTokenizer load_in_8bit: true load_in_4bit: false strict: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./lora-out sequence_len: 4096 sample_packing: true pad_to_sequence_len: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_fan_in_fan_out: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false group_by_length: false bf16: auto fp16: tf32: false gradient_checkpointing: true early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 xformers_attention: flash_attention: true s2_attention: warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 debug: deepspeed: weight_decay: 0.0 fsdp: fsdp_config: special_tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>"
    base_model: openlm-research/open_llama_3b_v2 model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer load_in_8bit: false load_in_4bit: false strict: false push_dataset_to_hub: datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca dataset_prepared_path: val_set_size: 0.02 adapter: lora_model_dir: sequence_len: 1024 sample_packing: true lora_r: lora_alpha: lora_dropout: lora_target_modules: lora_target_linear: lora_fan_in_fan_out: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: output_dir: ./openllama-out gradient_accumulation_steps: 1 micro_batch_size: 1 num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.000003 train_on_inputs: false group_by_length: false float16: true bf16: false fp16: false tf32: false gradient_checkpointing: true early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 xformers_attention: flash_attention: true gptq_groupsize: gptq_model_v1: warmup_steps: 20 evals_per_epoch: 4 saves_per_epoch: 1 debug: deepspeed: weight_decay: 0.1 fsdp: fsdp_config: special_tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>"
    base_model: codellama/CodeLlama-7b-hf model_type: LlamaForCausalLM tokenizer_type: CodeLlamaTokenizer load_in_8bit: true load_in_4bit: false strict: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./lora-out sequence_len: 4096 sample_packing: true pad_to_sequence_len: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_fan_in_fan_out: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false group_by_length: false bf16: auto fp16: tf32: false gradient_checkpointing: true early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 xformers_attention: flash_attention: true s2_attention: warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 debug: deepspeed: weight_decay: 0.0 fsdp: fsdp_config: special_tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>"
    base_model: codellama/CodeLlama-13b-hf model_type: LlamaForCausalLM tokenizer_type: CodeLlamaTokenizer load_in_8bit: true load_in_4bit: false strict: false datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca dataset_prepared_path: val_set_size: 0.05 output_dir: ./lora-out sequence_len: 4096 sample_packing: true pad_to_sequence_len: true adapter: lora lora_model_dir: lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true lora_fan_in_fan_out: wandb_project: wandb_entity: wandb_watch: wandb_name: wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false group_by_length: false bf16: auto fp16: tf32: false gradient_checkpointing: true early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 xformers_attention: flash_attention: true s2_attention: warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 debug: deepspeed: weight_decay: 0.0 fsdp: fsdp_config: special_tokens: bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>"
    import os

    os.environ["CUDA_VISIBLE_DEVICES"] = "1" from peft import PeftConfig, PeftModel from peft import PeftModel, PeftConfig from transformers import AutoModelForCausalLM, AutoTokenizer from datasets import load_dataset import torch import random

    peft_model_id = "smangrul/tinyllama_lora_norobots" device = "cuda" config = PeftConfig.from_pretrained(peft_model_id) model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, load_in_4bit=True, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(peft_model_id) model.resize_token_embeddings(len(tokenizer)) model = PeftModel.from_pretrained(model, peft_model_id, adapter_name="norobots") _ = model.load_adapter("smangrul/tinyllama_lora_sql", adapter_name="sql") _ = model.load_adapter("smangrul/tinyllama_lora_adcopy", adapter_name="adcopy")


    [0.8, 0.1, 0.1] linear #[1.0, 0.2] 0.7 density dare_linear #[1.5, 0.3] 0.5 density ties #[0.8, 0.5] cat

    adapters = ["norobots", "adcopy", "sql"] weights = [2.0, 0.3, 0.7] adapter_name = "merge" density = 0.2 combination_type = "ties" if adapter_name in model.peft_config: model.delete_adapter(adapter_name) model.add_weighted_adapter(adapters, weights, adapter_name, combination_type=combination_type, density=density)

    model.eval() model.set_adapter("merge")

    # Generate a chat-style completion with the merged adapter.
    messages = [
        {"role": "user", "content": "Write an essay about Generative AI."},
    ]
    text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    inputs = tokenizer(text, return_tensors="pt")  # , add_special_tokens=False)
    # move every input tensor to the GPU before calling generate
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.95,
        temperature=0.2,
        repetition_penalty=1.2,
        eos_token_id=tokenizer.eos_token_id,
    )
    print(tokenizer.decode(outputs[0]))

    # Generate ad copy from a system instruction plus a product description.
    messages = [
        {"role": "system", "content": "Create a text ad given the following product and description."},
        {
            "role": "user",
            "content": "Product: Sony PS5 PlayStation Console\nDescription: The PS5™ console unleashes new gaming possibilities that you never anticipated.",
        },
    ]
    text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    inputs = tokenizer(text, return_tensors="pt")  # , add_special_tokens=False)
    # move every input tensor to the GPU before calling generate
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        top_p=0.95,
        temperature=0.2,
        repetition_penalty=1.2,
        eos_token_id=tokenizer.eos_token_id,
    )
    print(tokenizer.decode(outputs[0]))

    text = """Table: 2-11365528-2 Columns: ['Team', 'Head Coach', 'President', 'Home Ground', 'Location'] Natural Query: Who is the Head Coach of the team whose President is Mario Volarevic? SQL Query:"""

    # Generate the SQL completion for the prompt held in `text`.
    inputs = tokenizer(text, return_tensors="pt")  # , add_special_tokens=False)
    # move every input tensor to the GPU before calling generate
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        repetition_penalty=1.1,
        # stop on the id of the last token produced when tokenizing "</s>"
        eos_token_id=tokenizer("</s>").input_ids[-1],
    )
    print(tokenizer.decode(outputs[0]))

    def parse_args():
        """Parse and validate the command-line arguments of the finetuning script.

        Returns:
            argparse.Namespace: The parsed arguments.

        Raises:
            ValueError: If neither a dataset name nor a train/validation file
                is given.
            AssertionError: If a data file has an unsupported extension, or
                ``--push_to_hub`` is passed without ``--output_dir``.
        """
        parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task")
        parser.add_argument(
            "--dataset_name",
            type=str,
            default=None,
            help="The name of the dataset to use (via the datasets library).",
        )
        parser.add_argument(
            "--dataset_config_name",
            type=str,
            default=None,
            help="The configuration name of the dataset to use (via the datasets library).",
        )
        parser.add_argument(
            "--train_file", type=str, default=None, help="A csv, txt or a json file containing the training data."
        )
        parser.add_argument(
            "--validation_file", type=str, default=None, help="A csv, txt or a json file containing the validation data."
        )
        parser.add_argument(
            "--validation_split_percentage",
            default=5,
            help="The percentage of the train set used as validation set in case there's no validation split",
        )
        parser.add_argument(
            "--model_name_or_path",
            type=str,
            help="Path to pretrained model or model identifier from huggingface.co/models.",
            required=False,
        )
        parser.add_argument(
            "--config_name",
            type=str,
            default=None,
            help="Pretrained config name or path if not the same as model_name",
        )
        parser.add_argument(
            "--tokenizer_name",
            type=str,
            default=None,
            help="Pretrained tokenizer name or path if not the same as model_name",
        )
        parser.add_argument(
            "--use_slow_tokenizer",
            action="store_true",
            help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
        )
        parser.add_argument(
            "--per_device_train_batch_size",
            type=int,
            default=8,
            help="Batch size (per device) for the training dataloader.",
        )
        parser.add_argument(
            "--per_device_eval_batch_size",
            type=int,
            default=8,
            help="Batch size (per device) for the evaluation dataloader.",
        )
        parser.add_argument(
            "--learning_rate",
            type=float,
            default=5e-5,
            help="Initial learning rate (after the potential warmup period) to use.",
        )
        parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
        parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
        parser.add_argument(
            "--max_train_steps",
            type=int,
            default=None,
            help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
        )
        parser.add_argument(
            "--gradient_accumulation_steps",
            type=int,
            default=1,
            help="Number of updates steps to accumulate before performing a backward/update pass.",
        )
        parser.add_argument(
            "--lr_scheduler_type",
            type=SchedulerType,
            default="linear",
            help="The scheduler type to use.",
            choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
        )
        parser.add_argument(
            "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
        )
        parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
        parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
        parser.add_argument(
            "--model_type",
            type=str,
            default=None,
            help="Model type to use if training from scratch.",
            choices=MODEL_TYPES,
        )
        parser.add_argument(
            "--ignore_pad_token_for_loss",
            # type=bool would turn any non-empty string, including "False", into True
            type=_str2bool,
            default=True,
            help="Whether to ignore the tokens corresponding to padded labels in the loss computation or not.",
        )
        parser.add_argument(
            "--max_source_length",
            type=int,
            default=128,
            help=(
                "The maximum total input sequence length after "
                "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded."
            ),
        )
        parser.add_argument(
            "--max_target_length",
            type=int,
            default=128,
            help=(
                "The maximum total sequence length for target text after "
                "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded "
                "during ``evaluate`` and ``predict``."
            ),
        )
        parser.add_argument(
            "--pad_to_max_length",
            action="store_true",
            help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
        )
        parser.add_argument(
            "--preprocessing_num_workers",
            type=int,
            default=None,
            help="The number of processes to use for the preprocessing.",
        )
        parser.add_argument(
            "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
        )
        parser.add_argument(
            "--no_keep_linebreaks", action="store_true", help="Do not keep line breaks when using TXT files."
        )
        parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
        parser.add_argument(
            "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
        )
        parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
        parser.add_argument(
            "--trust_remote_code",
            # type=bool would turn any non-empty string, including "False", into True
            type=_str2bool,
            default=False,
            help=(
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
                "execute code present on the Hub on your local machine."
            ),
        )
        parser.add_argument(
            "--checkpointing_steps",
            type=str,
            default=None,
            help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
        )
        parser.add_argument(
            "--resume_from_checkpoint",
            type=str,
            default=None,
            help="If the training should continue from a checkpoint folder.",
        )
        parser.add_argument(
            "--with_tracking",
            action="store_true",
            help="Whether to enable experiment trackers for logging.",
        )
        parser.add_argument(
            "--report_to",
            type=str,
            default="tensorboard",
            help=(
                'The integration to report the results and logs to. Supported platforms are `"tensorboard"` (default),'
                ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` to report to all integrations. '
                "Only applicable when `--with_tracking` is passed."
            ),
        )
        parser.add_argument(
            "--low_cpu_mem_usage",
            action="store_true",
            help=(
                "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
                "If passed, LLM loading time and RAM consumption will be benefited."
            ),
        )
        ##########################
        #   Generation Config    #
        ##########################
        parser.add_argument(
            "--temperature",
            type=float,
            default=0.8,
            help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
        )
        parser.add_argument("--k", type=int, default=40, help="Choose k candidate words")
        parser.add_argument("--p", type=float, default=0.95, help="The sum of probability of candidate words is 0.9 ")

        ##########################
        #        Exp Args        #
        ##########################
        parser.add_argument(
            "--adapter_name_or_path",
            type=str,
            default=None,
            help=(
                "The LoRA adapter checkpoint. Set None if you want to fine-tune from LoftQ. "
                "Specify a path if you want to evaluate."
            ),
        )
        args = parser.parse_args()

        # Sanity checks
        if args.dataset_name is None and args.train_file is None and args.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        else:
            if args.train_file is not None:
                extension = args.train_file.split(".")[-1]
                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
            if args.validation_file is not None:
                extension = args.validation_file.split(".")[-1]
                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."

        if args.push_to_hub:
            assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."

        return args


    def _str2bool(value):
        """Convert a command-line value such as ``true``/``false``/``1``/``0`` to bool."""
        if isinstance(value, bool):
            return value
        text = value.strip().lower()
        if text in {"true", "t", "yes", "y", "1"}:
            return True
        # the empty string stays False, matching what bool("") returned before
        if text in {"false", "f", "no", "n", "0", ""}:
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")
    # Training PEFT models with new tokens being added to the embedding layers and tokenizer
    In this example, we will learn how to train a LoRA model when adding new tokens to the tokenizer and model. 
    This is a common use case when doing the following:
    1. Instruction finetuning with new tokens being added, such as `<|user|>`, `<|assistant|>`, `<|system|>`, `</s>`, `<s>`, to properly format the conversations
    2. Finetuning on a specific language wherein language-specific tokens are added, e.g., Korean tokens being added to the vocabulary for finetuning an LLM on Korean datasets.
    3. Instruction finetuning to return outputs in certain format to enable agent behaviour new tokens such as `<|FUNCTIONS|>`, `<|BROWSE|>`, `<|TEXT2IMAGE|>`, `<|ASR|>`, `<|TTS|>`, `<|GENERATECODE|>`, `<|RAG|>`.
    In such cases, you add the Embedding modules to the LORA `target_modules`. PEFT will take care of saving the embedding layers with the new added tokens along with the adapter weights that were trained on the specific initialization of the embeddings weights of the added tokens.
    Let's import the necessary libraries

    import os

    # Environment variables are set before the ML libraries are imported,
    # as in the original statement order.
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    os.environ["WANDB_PROJECT"] = "PeftExamples"
    import transformers
    from peft import (
        LoraConfig,
        PeftConfig,
        PeftModel,
        get_peft_model,
        prepare_model_for_kbit_training,
    )
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        HfArgumentParser,
        TrainingArguments,
        Trainer,
        default_data_collator,
    )
    import torch
    from dataclasses import dataclass, field
    from typing import Optional
    from dataclass_csv import DataclassReader
    from torch.utils.data import Dataset, DataLoader

    from enum import Enum

    ## Prepare Model and Tokenizer
    Now, we will add 27 new tokens and replace the existing pad, bos and eos tokens of the model.

    class SpecialTokens(str, Enum):
        """Special tokens added to the tokenizer for the dialogue-state format.

        Each member is also a ``str``, so it compares equal to its literal
        token text.
        """

        begin_target = "<|begintarget|>"
        end_target = "<|endtarget|>"
        begin_context = "<|begincontext|>"
        end_context = "<|endcontext|>"
        system = "<|system|>"
        user = "<|user|>"
        begin_last_user_utterance = "<|beginlastuserutterance|>"
        end_last_user_utterance = "<|endlastuserutterance|>"
        begin_dsts = "<|begindsts|>"
        end_dsts = "<|enddsts|>"
        begin_dst = "<|begindst|>"
        end_dst = "<|enddst|>"
        begin_belief = "<|beginbelief|>"
        end_belief = "<|endbelief|>"
        begin_response = "<|beginresponse|>"
        end_response = "<|endresponse|>"
        begin_action = "<|beginaction|>"
        end_action = "<|endaction|>"
        begin_user_action = "<|beginuseraction|>"
        end_user_action = "<|enduseraction|>"
        sys_actions = "<|sysactions|>"
        begin_intent = "<|beginintent|>"
        end_intent = "<|endintent|>"
        begin_requested_slots = "<|beginrequestedslots|>"
        end_requested_slots = "<|endrequestedslots|>"
        pad_token = "<|pad|>"
        bos_token = "<|startoftext|>"

        # Must be a classmethod: it is called on the class itself as
        # ``SpecialTokens.list()``.
        @classmethod
        def list(cls):
            """Return the token strings of all members, in definition order."""
            return [c.value for c in cls]
    We will be finetuning the Mistral-7B model. Let's load the tokenizer and add the special tokens, then load the base model and resize the embedding layers to accommodate the newly added tokens.

    model_name = "mistralai/Mistral-7B-v0.1" tokenizer = AutoTokenizer.from_pretrained( model_name, pad_token=SpecialTokens.pad_token.value, bos_token=SpecialTokens.bos_token.value, eos_token=SpecialTokens.end_target.value, additional_special_tokens=SpecialTokens.list(), ) model = AutoModelForCausalLM.from_pretrained( model_name, low_cpu_mem_usage=True # use_flash_attention_2=True, # leading to an error ) model.resize_token_embeddings(len(tokenizer))

    ## Apply LoRA

    # The embedding layer and LM head are LoRA targets alongside the attention
    # projections because new tokens were added to the vocabulary.
    config = LoraConfig(
        r=64,
        lora_alpha=128,
        lora_dropout=0.0,
        target_modules=["embed_tokens", "lm_head", "q_proj", "v_proj"],
    )
    model = get_peft_model(model, config)
    # print_trainable_parameters() prints its own report; wrapping it in
    # print() would add a stray "None" line
    model.print_trainable_parameters()
    print(model)

    ## Prepare Dataset

    from datasets import load_dataset

    dataset = load_dataset("smangrul/assistant_chatbot_dataset") dataset = dataset["train"].train_test_split(0.2)

    text_column = "context" label_column = "target" max_length = 512

    def preprocess_function(examples):
        """Tokenize a batch into fixed-length causal-LM inputs and labels.

        Each context (``text_column``) is concatenated with its target
        (``label_column``) plus the EOS token.  Labels are -100 over the context
        and the padding, so only target positions carry real token ids.
        Sequences are left-padded to ``max_length`` and then cut to their first
        ``max_length`` tokens.

        Args:
            examples: Batch mapping from column name to a list of values.

        Returns:
            The tokenizer output for the contexts, with ``input_ids`` and
            ``attention_mask`` rewritten and a ``labels`` entry added.
        """
        batch_size = len(examples[text_column])
        targets = [str(x) for x in examples[label_column]]
        model_inputs = tokenizer(examples[text_column])
        # don't add bos token because we concatenate with inputs
        labels = tokenizer(targets, add_special_tokens=False)

        # Pass 1: append target + EOS to each context and mask the context in the labels.
        for i in range(batch_size):
            sample_input_ids = model_inputs["input_ids"][i]
            label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
            model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
            labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
            model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])

        # Pass 2: left-pad to max_length, then truncate anything longer.
        for i in range(batch_size):
            sample_input_ids = model_inputs["input_ids"][i]
            label_input_ids = labels["input_ids"][i]
            # negative when the sample is already too long; `[x] * negative` is an empty list
            pad_len = max_length - len(sample_input_ids)
            model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * pad_len + sample_input_ids
            model_inputs["attention_mask"][i] = [0] * pad_len + model_inputs["attention_mask"][i]
            labels["input_ids"][i] = [-100] * pad_len + label_input_ids
            model_inputs["input_ids"][i] = model_inputs["input_ids"][i][:max_length]
            model_inputs["attention_mask"][i] = model_inputs["attention_mask"][i][:max_length]
            labels["input_ids"][i] = labels["input_ids"][i][:max_length]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Tokenize every split with preprocess_function. The `dataset.map(` call had
    # been dropped from this statement, leaving it syntactically invalid.
    # The original columns are removed so only the tokenizer outputs remain.
    processed_datasets = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=1,
        remove_columns=dataset["train"].column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
    )

    train_dataset = processed_datasets["train"]

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=8, pin_memory=True
    )



    # Train the model

    training_args = TrainingArguments(
        output_dir="mistral_lora_clm_with_added_tokens",
        num_train_epochs=2,
        save_total_limit=5,
        per_device_train_batch_size=8,
        warmup_steps=10,
        weight_decay=0.0001,
        dataloader_drop_last=True,
        bf16=True,
        logging_steps=10,
        learning_rate=1e-5,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        # Keep every column: the dataset already contains exactly the model inputs.
        remove_unused_columns=False,
        hub_model_id="smangrul/mistral_lora_clm_with_added_tokens",
        push_to_hub=True,
        hub_private_repo=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=default_data_collator,
    )

    # NOTE(review): presumably disabled because the generation cache is not
    # compatible with gradient checkpointing — confirm.
    model.config.use_cache = False

    # The Trainer was configured but never run; without this call the later
    # push_to_hub() would upload an untrained adapter.
    trainer.train()


    # Check the model output on a sample from evaluation dataset

    import random

    # random.randint() includes its upper bound, so len(dataset["test"]) itself
    # could be drawn and index past the end; randrange() excludes it.
    i = random.randrange(len(dataset["test"]))
    context = dataset["test"][i]["context"]

    batch = tokenizer(context, return_tensors="pt")
    # Move every tensor of the encoded prompt to the GPU (the `v.to(` call had
    # been dropped from this comprehension).
    batch = {k: v.to("cuda") for k, v in batch.items()}
    model.eval()
    output_tokens = model.generate(
        **batch,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.2,
        top_p=0.95,
        top_k=50,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    # Keep only the text generated after the end-of-context marker.
    target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split("<|endcontext|>")[1]
    target = dataset["test"][i]["target"]
    print(f"{context=} \n\n {target_predicted=} \n\n {target=}")

    # Save the Adapter model 
    When the lora layers are applied to embedding layers, the corresponding base model embedding layers are also saved. 

    # Upload through the Trainer (configured above with push_to_hub=True and a hub_model_id).
    trainer.push_to_hub()
    # Also push the wrapped model itself, passing the output directory name as the first argument.
    trainer.model.push_to_hub(training_args.output_dir)

    # Check the model loading is working as expected and generating plausible outputs.

    from peft import PeftModel

    # Rebuild the base model exactly as for training: same checkpoint, and the
    # embeddings resized to the extended tokenizer before the adapter is attached.
    inference_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        low_cpu_mem_usage=True,
        # use_flash_attention_2=True,
    )
    inference_model.resize_token_embeddings(len(tokenizer))

    inference_model = PeftModel.from_pretrained(inference_model, "smangrul/mistral_lora_clm_with_added_tokens")
    # The receiver of this `.to("cuda")` call had been dropped, leaving a bare
    # string fragment; the model must be on the same device as `batch`.
    inference_model.to("cuda")
    inference_model.eval()

    output_tokens = inference_model.generate(
        **batch,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.2,
        top_p=0.95,
        top_k=50,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Keep only the text generated after the end-of-context marker.
    target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split("<|endcontext|>")[1]
    print(f"{context=} \n\n {target_predicted=} \n\n {target=}")

    # Logger named after this module.
    logger = get_logger(__name__)

    # Check the installed `datasets` version up front; the second argument is the hint shown to the user.
    require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

    # Config classes listed in MODEL_MAPPING and their `model_type` strings;
    # MODEL_TYPES is used as the allowed choices for `--model_type`.
    MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
    MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
    def write_model(model_path, input_base_path, tokenizer_path=None, safe_serialization=True, fix_eos_token_id=True):
        """Convert an unsharded OLMo checkpoint into the Transformers format.

        The function reads the ``model`` section of ``config.yaml`` in
        ``input_base_path``, loads the checkpoint on CPU, renames/splits its
        weights into one temporary ``.bin`` file per layer (plus one for the
        embeddings and output head) with a weight index, writes an
        ``OlmoConfig``, reloads everything as ``OlmoForCausalLM`` and saves it to
        ``model_path``. The temporary directory is removed at the end.

        Args:
            model_path: Output directory (created if missing).
            input_base_path: Directory holding ``config.yaml`` and the checkpoint.
            tokenizer_path: When not None, ``_write_tokenizer`` is called with it.
            safe_serialization: Forwarded to ``save_pretrained``.
            fix_eos_token_id: Forwarded to ``_write_tokenizer``.
        """
        os.makedirs(model_path, exist_ok=True)
        tmp_model_path = os.path.join(model_path, "tmp")
        os.makedirs(tmp_model_path, exist_ok=True)

        config_path = Path(input_base_path) / "config.yaml"
        olmo_config = yaml.safe_load(config_path.read_text())["model"]

        n_layers = olmo_config["n_layers"]
        n_heads = olmo_config["n_heads"]
        dim = olmo_config["d_model"]
        dims_per_head = dim // n_heads
        base = 10000.0
        # Rotary inverse frequencies, stored with every layer below.
        inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
        max_position_embeddings = olmo_config["max_sequence_length"]

        vocab_size = olmo_config.get("embedding_size", olmo_config["vocab_size"])

        if olmo_config.get("n_kv_heads", None) is not None:
            num_key_value_heads = olmo_config["n_kv_heads"]  # for GQA / MQA
        elif olmo_config["multi_query_attention"]:  # compatibility with other checkpoints
            num_key_value_heads = 1
        else:
            num_key_value_heads = n_heads

        print(f"Fetching all parameters from the checkpoint at {input_base_path}.")

        # Not sharded
        # (The sharded implementation would also work, but this is simpler.)
        # NOTE(review): the checkpoint file name had been lost (empty string, which
        # resolves to the directory itself); "model.pt" is the reconstructed name —
        # confirm against the checkpoint layout.
        loaded = torch.load(os.path.join(input_base_path, "model.pt"), map_location="cpu")

        param_count = 0
        index_dict = {"weight_map": {}}
        for layer_i in range(n_layers):
            filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
            # Unsharded
            # TODO: Layernorm stuff
            # TODO: multi query attention
            # The fused attention projection is laid out as [q | k | v] along dim 0.
            fused_dims = [dim, dims_per_head * num_key_value_heads, dims_per_head * num_key_value_heads]
            q_proj_weight, k_proj_weight, v_proj_weight = torch.split(
                loaded[f"transformer.blocks.{layer_i}.att_proj.weight"], fused_dims, dim=0
            )
            # The fused feed-forward projection holds the up half first, then the gate half.
            up_proj_weight, gate_proj_weight = torch.chunk(
                loaded[f"transformer.blocks.{layer_i}.ff_proj.weight"], 2, dim=0
            )
            state_dict = {
                f"model.layers.{layer_i}.self_attn.q_proj.weight": q_proj_weight,
                f"model.layers.{layer_i}.self_attn.k_proj.weight": k_proj_weight,
                f"model.layers.{layer_i}.self_attn.v_proj.weight": v_proj_weight,
                f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"transformer.blocks.{layer_i}.attn_out.weight"],
                f"model.layers.{layer_i}.mlp.gate_proj.weight": gate_proj_weight,
                f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"transformer.blocks.{layer_i}.ff_out.weight"],
                f"model.layers.{layer_i}.mlp.up_proj.weight": up_proj_weight,
            }

            state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq

            for k, v in state_dict.items():
                index_dict["weight_map"][k] = filename
                param_count += v.numel()
            # Restored: the save call had been dropped, so no layer file was written.
            torch.save(state_dict, os.path.join(tmp_model_path, filename))

        filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"

        # Unsharded
        # TODO: Deal with weight-tying
        state_dict = {
            "model.embed_tokens.weight": loaded["transformer.wte.weight"],
            # Fall back to the input embeddings when the checkpoint has no separate output projection.
            "lm_head.weight": loaded["transformer.ff_out.weight"]
            if "transformer.ff_out.weight" in loaded
            else loaded["transformer.wte.weight"],
        }

        for k, v in state_dict.items():
            index_dict["weight_map"][k] = filename
            param_count += v.numel()
        # Restored: same dropped save call as in the per-layer loop.
        torch.save(state_dict, os.path.join(tmp_model_path, filename))

        # Write configs
        # NOTE(review): 2 bytes per parameter — presumably assumes 16-bit weights; confirm.
        index_dict["metadata"] = {"total_size": param_count * 2}
        write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))

        # The OLMo value covers the fused up+gate projection, hence the division by 2.
        if olmo_config.get("mlp_hidden_size", None) is not None:
            intermediate_size = olmo_config["mlp_hidden_size"] // 2
        else:
            intermediate_size = (dim * olmo_config["mlp_ratio"]) // 2

        config = OlmoConfig(
            vocab_size=vocab_size,
            hidden_size=dim,
            intermediate_size=intermediate_size,
            num_hidden_layers=n_layers,
            num_attention_heads=n_heads,
            num_key_value_heads=num_key_value_heads,
            max_position_embeddings=max_position_embeddings,
            pad_token_id=olmo_config["pad_token_id"],
            bos_token_id=None,
            eos_token_id=olmo_config["eos_token_id"],
            tie_word_embeddings=olmo_config["weight_tying"],
            rope_theta=base,
            clip_qkv=olmo_config.get("clip_qkv"),
        )
        config.save_pretrained(tmp_model_path)

        # Make space so we can load the model properly now.
        del state_dict
        del loaded
        gc.collect()

        if tokenizer_path is not None:
            _write_tokenizer(model_path, config, tokenizer_path, fix_eos_token_id)

        print("Loading the checkpoint in a OLMo model.")
        model = OlmoForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True)
        # Avoid saving this as part of the config.
        del model.config._name_or_path
        print("Saving in the Transformers format.")
        model.save_pretrained(model_path, safe_serialization=safe_serialization)
        shutil.rmtree(tmp_model_path)
    def main():
        """Train and/or evaluate an XLNet model with permutation language modeling.

        Steps, in order: parse model/data/training arguments (from the command
        line or a single JSON file), configure logging, detect a checkpoint to
        resume from, load the raw datasets, build config/tokenizer/model,
        tokenize (line by line, or concatenated and chunked to
        ``max_seq_length``), then run ``Trainer`` training and evaluation and
        finally push to the Hub or create a model card.

        Raises:
            ValueError: if ``token`` and ``use_auth_token`` are both given, if the
                output directory is non-empty without a checkpoint, if no
                tokenizer can be loaded, or if a requested train/validation
                split is missing.
        """
        # See all possible arguments in src/transformers/training_args.py
        # or by passing the --help flag to this script.
        # We now keep distinct sets of args, for a cleaner separation of concerns.

        parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
        if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
            # If we pass only one argument to the script and it's the path to a json file,
            # let's parse it to get our arguments.
            model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
        else:
            model_args, data_args, training_args = parser.parse_args_into_dataclasses()

        if model_args.use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
                FutureWarning,
            )
            if model_args.token is not None:
                raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
            model_args.token = model_args.use_auth_token

        # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
        # information sent is the one passed as arguments along with your Python/PyTorch versions.
        send_example_telemetry("run_plm", model_args, data_args)

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            handlers=[logging.StreamHandler(sys.stdout)],
        )

        if training_args.should_log:
            # The default of training_args.log_level is passive, so we set log level at info here to have that default.
            transformers.utils.logging.set_verbosity_info()

        log_level = training_args.get_process_log_level()
        logger.setLevel(log_level)
        datasets.utils.logging.set_verbosity(log_level)
        transformers.utils.logging.set_verbosity(log_level)
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()

        # Log on each process the small summary:
        logger.warning(
            f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
            + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
        )
        # Restored logger.info call (its prefix had been dropped, here and below).
        logger.info(f"Training/evaluation parameters {training_args}")

        # Detecting last checkpoint.
        last_checkpoint = None
        if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
            last_checkpoint = get_last_checkpoint(training_args.output_dir)
            if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
                raise ValueError(
                    f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                    "Use --overwrite_output_dir to overcome."
                )
            elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
                logger.info(
                    f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                    "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
                )

        # Set seed before initializing model.
        set_seed(training_args.seed)

        # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
        # or just provide the name of one of the public datasets available on the hub
        # (the dataset will be downloaded automatically from the datasets Hub).
        #
        # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
        # 'text' is found. You can easily tweak this behavior (see below).
        #
        # In distributed training, the load_dataset function guarantee that only one local process can concurrently
        # download the dataset.
        if data_args.dataset_name is not None:
            # Downloading and loading a dataset from the hub.
            raw_datasets = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                cache_dir=model_args.cache_dir,
                token=model_args.token,
            )
            if "validation" not in raw_datasets.keys():
                # No validation split: carve the first N% of train out as validation.
                raw_datasets["validation"] = load_dataset(
                    data_args.dataset_name,
                    data_args.dataset_config_name,
                    split=f"train[:{data_args.validation_split_percentage}%]",
                    cache_dir=model_args.cache_dir,
                    token=model_args.token,
                )
                raw_datasets["train"] = load_dataset(
                    data_args.dataset_name,
                    data_args.dataset_config_name,
                    split=f"train[{data_args.validation_split_percentage}%:]",
                    cache_dir=model_args.cache_dir,
                    token=model_args.token,
                )
        else:
            data_files = {}
            if data_args.train_file is not None:
                data_files["train"] = data_args.train_file
                extension = data_args.train_file.split(".")[-1]
            if data_args.validation_file is not None:
                data_files["validation"] = data_args.validation_file
                extension = data_args.validation_file.split(".")[-1]
            if extension == "txt":
                extension = "text"
            raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
            # If no validation data is there, validation_split_percentage will be used to divide the dataset.
            if "validation" not in raw_datasets.keys():
                raw_datasets["validation"] = load_dataset(
                    extension,
                    data_files=data_files,
                    split=f"train[:{data_args.validation_split_percentage}%]",
                    cache_dir=model_args.cache_dir,
                    token=model_args.token,
                )
                raw_datasets["train"] = load_dataset(
                    extension,
                    data_files=data_files,
                    split=f"train[{data_args.validation_split_percentage}%:]",
                    cache_dir=model_args.cache_dir,
                    token=model_args.token,
                )

        # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame,
        # etc) in the datasets library documentation.

        # Load pretrained model and tokenizer
        #
        # Distributed training:
        # The .from_pretrained methods guarantee that only one local process can concurrently
        # download model & vocab.
        config_kwargs = {
            "cache_dir": model_args.cache_dir,
            "revision": model_args.model_revision,
            "token": model_args.token,
        }
        if model_args.config_name:
            config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
        elif model_args.model_name_or_path:
            config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
        else:
            config = XLNetConfig()
            logger.warning("You are instantiating a new config instance from scratch.")
            if model_args.config_overrides is not None:
                logger.info(f"Overriding config: {model_args.config_overrides}")
                config.update_from_string(model_args.config_overrides)
                logger.info(f"New config: {config}")

        tokenizer_kwargs = {
            "cache_dir": model_args.cache_dir,
            "use_fast": model_args.use_fast_tokenizer,
            "revision": model_args.model_revision,
            "token": model_args.token,
        }
        if model_args.tokenizer_name:
            tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
        elif model_args.model_name_or_path:
            tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
        else:
            raise ValueError(
                "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
                "You can do it from another script, save it, and load it from here, using --tokenizer_name."
            )

        if model_args.model_name_or_path:
            model = XLNetLMHeadModel.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
                revision=model_args.model_revision,
                token=model_args.token,
                low_cpu_mem_usage=model_args.low_cpu_mem_usage,
            )
        else:
            logger.info("Training new model from scratch")
            model = XLNetLMHeadModel(config)

        # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
        # on a small vocab and want a smaller embedding size, remove this test.
        embedding_size = model.get_input_embeddings().weight.shape[0]
        if len(tokenizer) > embedding_size:
            model.resize_token_embeddings(len(tokenizer))

        # Preprocessing the datasets.
        # First we tokenize all the texts.
        if training_args.do_train:
            column_names = raw_datasets["train"].column_names
        else:
            column_names = raw_datasets["validation"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

        if data_args.line_by_line:
            # When using line_by_line, we just tokenize each nonempty line.
            padding = "max_length" if data_args.pad_to_max_length else False

            def tokenize_function(examples):
                # Remove empty lines. Uses the detected text column rather than a
                # hard-coded "text", which raised KeyError for datasets whose text
                # column has another name.
                examples[text_column_name] = [
                    line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()
                ]
                return tokenizer(examples[text_column_name], padding=padding, truncation=True, max_length=max_seq_length)

            with training_args.main_process_first(desc="dataset map tokenization"):
                # Restored `raw_datasets.map(` (the call prefix had been dropped).
                tokenized_datasets = raw_datasets.map(
                    tokenize_function,
                    batched=True,
                    num_proc=data_args.preprocessing_num_workers,
                    remove_columns=[text_column_name],
                    load_from_cache_file=not data_args.overwrite_cache,
                    desc="Running tokenizer on dataset line_by_line",
                )
        else:
            # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
            def tokenize_function(examples):
                return tokenizer(examples[text_column_name])

            with training_args.main_process_first(desc="dataset map tokenization"):
                tokenized_datasets = raw_datasets.map(
                    tokenize_function,
                    batched=True,
                    num_proc=data_args.preprocessing_num_workers,
                    remove_columns=column_names,
                    load_from_cache_file=not data_args.overwrite_cache,
                    desc="Running tokenizer on every text in dataset",
                )

            # Main data processing function that will concatenate all texts from our dataset and generate chunks of
            # max_seq_length.
            def group_texts(examples):
                # Concatenate all texts.
                concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
                total_length = len(concatenated_examples[list(examples.keys())[0]])
                # We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and
                # return an empty dict. We could add padding if the model supported it instead of this drop, you can
                # customize this part to your needs.
                total_length = (total_length // max_seq_length) * max_seq_length
                # Split by chunks of max_len.
                result = {
                    k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                    for k, t in concatenated_examples.items()
                }
                return result

            # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
            # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
            # might be slower to preprocess.
            #
            # To speed up this part, we use multiprocessing. See the documentation of the map method for more
            # information.
            with training_args.main_process_first(desc="grouping texts together"):
                tokenized_datasets = tokenized_datasets.map(
                    group_texts,
                    batched=True,
                    num_proc=data_args.preprocessing_num_workers,
                    load_from_cache_file=not data_args.overwrite_cache,
                    desc=f"Grouping texts in chunks of {max_seq_length}",
                )

        if training_args.do_train:
            if "train" not in tokenized_datasets:
                raise ValueError("--do_train requires a train dataset")
            train_dataset = tokenized_datasets["train"]
            if data_args.max_train_samples is not None:
                max_train_samples = min(len(train_dataset), data_args.max_train_samples)
                # Restored the dropped `.select(range(...))` truncation.
                train_dataset = train_dataset.select(range(max_train_samples))

        if training_args.do_eval:
            if "validation" not in tokenized_datasets:
                raise ValueError("--do_eval requires a validation dataset")
            eval_dataset = tokenized_datasets["validation"]
            if data_args.max_eval_samples is not None:
                max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
                eval_dataset = eval_dataset.select(range(max_eval_samples))

        # Data collator
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )

        # Initialize our Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset if training_args.do_train else None,
            eval_dataset=eval_dataset if training_args.do_eval else None,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        # Training
        if training_args.do_train:
            # An explicit --resume_from_checkpoint wins over an auto-detected checkpoint.
            checkpoint = None
            if training_args.resume_from_checkpoint is not None:
                checkpoint = training_args.resume_from_checkpoint
            elif last_checkpoint is not None:
                checkpoint = last_checkpoint
            train_result = trainer.train(resume_from_checkpoint=checkpoint)
            trainer.save_model()  # Saves the tokenizer too for easy upload
            metrics = train_result.metrics

            max_train_samples = (
                data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
            )
            metrics["train_samples"] = min(max_train_samples, len(train_dataset))

            trainer.log_metrics("train", metrics)
            trainer.save_metrics("train", metrics)
            trainer.save_state()

        # Evaluation
        if training_args.do_eval:
            logger.info("*** Evaluate ***")

            metrics = trainer.evaluate()

            max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
            metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
            try:
                perplexity = math.exp(metrics["eval_loss"])
            except OverflowError:
                perplexity = float("inf")
            metrics["perplexity"] = perplexity

            trainer.log_metrics("eval", metrics)
            trainer.save_metrics("eval", metrics)

        # Metadata passed to push_to_hub / create_model_card below.
        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "language-modeling"}
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                kwargs["dataset"] = data_args.dataset_name

        if training_args.push_to_hub:
            trainer.push_to_hub(**kwargs)
        else:
            trainer.create_model_card(**kwargs)
    def parse_args():
        """Parse the command-line arguments of the Whisper (Ada)LoRA fine-tuning script.

        Returns:
            argparse.Namespace: the parsed arguments.

        Raises:
            AssertionError: if ``--push_to_hub`` is given without ``--output_dir``.
        """

        def _str_to_bool(value):
            """Convert a command-line string to a bool.

            ``type=bool`` would call ``bool("False")``, which is True for every
            non-empty string, so ``--overwrite_cache False`` silently enabled
            the option.
            """
            lowered = value.strip().lower()
            if lowered in ("true", "t", "yes", "y", "1", "on"):
                return True
            if lowered in ("false", "f", "no", "n", "0", "off", ""):
                return False
            raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")

        parser = argparse.ArgumentParser(description="Whisper Fine-Tuning with AdaLora")
        parser.add_argument(
            "--model_name_or_path",
            type=str,
            # The end of this help text had been lost; restored.
            help="Path to pretrained model or model identifier from huggingface.co/models.",
            required=True,
        )
        parser.add_argument("--language", type=str, help="Language to use for training; e.g., 'Hindi' ", required=True)
        parser.add_argument("--language_abbr", type=str, help="Language to use for training; e.g., 'hi' ", required=True)
        parser.add_argument(
            "--task", type=str, default="transcribe", help="Task to use for training; e.g., 'transcribe' ", required=False
        )
        parser.add_argument(
            "--dataset_name",
            type=str,
            default="mozilla-foundation/common_voice_11_0",
            help="Dataset to use for training; e.g., 'whisper' ",
            required=False,
        )
        parser.add_argument(
            "--dataset_in_streaming_mode",
            action="store_true",
            help="Whether to use streaming mode for the dataset.",
        )
        parser.add_argument(
            "--do_lower_case", action="store_true", help="lowercase the transcribed text before tokenizing"
        )
        parser.add_argument(
            "--do_remove_punctuation", action="store_true", help="remove punctuation from the transcribed text"
        )
        parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
        parser.add_argument(
            "--overwrite_cache",
            type=_str_to_bool,
            default=False,
            help="Overwrite the cached training and evaluation sets",
        )
        parser.add_argument("--max_audio_input_length", type=float, default=30.0, help="Maximum audio length in seconds.")
        parser.add_argument(
            "--preprocessing_num_workers",
            type=int,
            default=None,
            help="The number of processes to use for the preprocessing.",
        )
        parser.add_argument(
            "--per_device_train_batch_size",
            type=int,
            default=8,
            help="Batch size (per device) for the training dataloader.",
        )
        parser.add_argument(
            "--per_device_eval_batch_size",
            type=int,
            default=8,
            help="Batch size (per device) for the evaluation dataloader.",
        )
        parser.add_argument(
            "--buffer_size",
            type=int,
            default=5000,
            help="Number of samples to prefetch in the streaming mode.",
        )
        parser.add_argument(
            "--dataloader_pin_memory",
            action="store_true",
            help="Whether or not to pin memory for the DataLoader.",
        )
        parser.add_argument(
            "--dataloader_num_workers",
            type=int,
            default=0,
            help="Number of subprocesses to use for data loading.",
        )
        parser.add_argument(
            "--learning_rate",
            type=float,
            default=5e-5,
            help="Initial learning rate (after the potential warmup period) to use.",
        )
        parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
        parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
        parser.add_argument(
            "--max_train_steps",
            type=int,
            default=None,
            help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
        )
        parser.add_argument(
            "--gradient_accumulation_steps",
            type=int,
            default=1,
            help="Number of updates steps to accumulate before performing a backward/update pass.",
        )
        parser.add_argument(
            "--lr_scheduler_type",
            type=SchedulerType,
            default="linear",
            help="The scheduler type to use.",
            choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
        )
        parser.add_argument(
            "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
        )
        parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
        parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
        parser.add_argument(
            "--load_best_model",
            action="store_true",
            help="Whether to load the best model at the end of training",
        )
        parser.add_argument(
            "--with_tracking",
            action="store_true",
            help="Whether to enable experiment trackers for logging.",
        )
        parser.add_argument(
            "--report_to",
            type=str,
            default="all",
            help=(
                'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
                ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations. '
                "Only applicable when `--with_tracking` is passed."
            ),
        )
        parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
        parser.add_argument(
            "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
        )
        parser.add_argument(
            "--checkpointing_steps",
            type=int,
            default=500,
            help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
        )
        parser.add_argument(
            "--logging_steps",
            type=int,
            default=100,
            help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
        )
        parser.add_argument(
            "--evaluation_steps",
            type=int,
            default=500,
            help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
        )
        parser.add_argument(
            "--resume_from_checkpoint",
            type=str,
            default=None,
            help="If the training should continue from a checkpoint folder.",
        )

        # lora/adalora specific args
        parser.add_argument(
            "--use_peft",
            action="store_true",
            help="Whether to use PEFT",
        )
        parser.add_argument(
            "--use_adalora",
            action="store_true",
            help="Whether to use AdaLoRA or LoRA. If set, uses AdaLoRA instead of the default LoRA.",
        )
        parser.add_argument(
            "--init_r",
            type=int,
            default=12,
            help="Initial AdaLoRA rank",
        )
        parser.add_argument(
            "--target_r",
            type=int,
            default=4,
            help="Target AdaLoRA rank",
        )
        parser.add_argument(
            "--tinit",
            type=int,
            default=200,
            help="number of warmup steps for AdaLoRA wherein no pruning is performed",
        )
        parser.add_argument(
            "--tfinal",
            type=int,
            default=1000,
            help=" fix the resulting budget distribution and fine-tune the model for tfinal steps when using AdaLoRA ",
        )
        parser.add_argument(
            "--delta_t",
            type=int,
            default=10,
            help="interval of steps for AdaLoRA to update rank",
        )
        parser.add_argument(
            "--lora_alpha",
            type=int,
            default=32,
            help="LORA alpha",
        )
        parser.add_argument(
            "--r",
            type=int,
            default=8,
            help="LORA rank",
        )
        parser.add_argument(
            "--lora_dropout",
            type=float,
            default=0.1,
            help="LORA dropout",
        )
        parser.add_argument(
            "--orth_reg_weight",
            type=float,
            default=0.5,
            help="Orthogonal regularization weight",
        )
        parser.add_argument(
            "--debug_mode",
            action="store_true",
            help="Whether to use debug mode",
        )

        args = parser.parse_args()

        # Kept as an assert so callers that expect AssertionError keep working.
        if args.push_to_hub:
            assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."

        return args
    def parse_args():
        """Parse and sanity-check the arguments of the causal-LM fine-tuning script.

        Returns:
            argparse.Namespace: the parsed arguments.

        Raises:
            ValueError: if neither a dataset name nor a train/validation file is given.
            AssertionError: if a data file is not csv/json/txt, or if
                ``--push_to_hub`` is given without ``--output_dir``.
        """

        def _str_to_bool(value):
            """Convert a command-line string to a bool.

            ``type=bool`` would call ``bool("False")``, which is True for every
            non-empty string, so ``--overwrite_cache False`` silently enabled
            the option.
            """
            lowered = value.strip().lower()
            if lowered in ("true", "t", "yes", "y", "1", "on"):
                return True
            if lowered in ("false", "f", "no", "n", "0", "off", ""):
                return False
            raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")

        parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task")
        parser.add_argument(
            "--dataset_name",
            type=str,
            default=None,
            help="The name of the dataset to use (via the datasets library).",
        )
        parser.add_argument(
            "--dataset_config_name",
            type=str,
            default=None,
            help="The configuration name of the dataset to use (via the datasets library).",
        )
        parser.add_argument(
            "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
        )
        parser.add_argument(
            "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
        )
        parser.add_argument(
            "--validation_split_percentage",
            default=5,
            help="The percentage of the train set used as validation set in case there's no validation split",
        )
        parser.add_argument(
            "--model_name_or_path",
            type=str,
            # The end of this help text had been lost; restored.
            help="Path to pretrained model or model identifier from huggingface.co/models.",
            required=False,
        )
        parser.add_argument(
            "--config_name",
            type=str,
            default=None,
            help="Pretrained config name or path if not the same as model_name",
        )
        parser.add_argument(
            "--tokenizer_name",
            type=str,
            default=None,
            help="Pretrained tokenizer name or path if not the same as model_name",
        )
        parser.add_argument(
            "--use_slow_tokenizer",
            action="store_true",
            help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
        )
        parser.add_argument(
            "--per_device_train_batch_size",
            type=int,
            default=8,
            help="Batch size (per device) for the training dataloader.",
        )
        parser.add_argument(
            "--per_device_eval_batch_size",
            type=int,
            default=8,
            help="Batch size (per device) for the evaluation dataloader.",
        )
        parser.add_argument(
            "--learning_rate",
            type=float,
            default=5e-5,
            help="Initial learning rate (after the potential warmup period) to use.",
        )
        parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
        parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
        parser.add_argument(
            "--max_train_steps",
            type=int,
            default=None,
            help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
        )
        parser.add_argument(
            "--gradient_accumulation_steps",
            type=int,
            default=1,
            help="Number of updates steps to accumulate before performing a backward/update pass.",
        )
        parser.add_argument(
            "--lr_scheduler_type",
            type=SchedulerType,
            default="linear",
            help="The scheduler type to use.",
            choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
        )
        parser.add_argument(
            "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
        )
        parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
        parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
        parser.add_argument(
            "--model_type",
            type=str,
            default=None,
            help="Model type to use if training from scratch.",
            choices=MODEL_TYPES,
        )
        parser.add_argument(
            "--block_size",
            type=int,
            default=None,
            help=(
                "Optional input sequence length after tokenization. The training dataset will be truncated in block of"
                " this size for training. Default to the model max input length for single sentence inputs (take into"
                " account special tokens)."
            ),
        )
        parser.add_argument(
            "--preprocessing_num_workers",
            type=int,
            default=None,
            help="The number of processes to use for the preprocessing.",
        )
        parser.add_argument(
            "--overwrite_cache",
            type=_str_to_bool,
            default=False,
            help="Overwrite the cached training and evaluation sets",
        )
        parser.add_argument(
            "--no_keep_linebreaks", action="store_true", help="Do not keep line breaks when using TXT files."
        )
        parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
        parser.add_argument(
            "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
        )
        parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
        parser.add_argument(
            "--checkpointing_steps",
            type=str,
            default=None,
            help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
        )
        parser.add_argument(
            "--resume_from_checkpoint",
            type=str,
            default=None,
            help="If the training should continue from a checkpoint folder.",
        )
        # New Code #
        # Whether to load the best model at the end of training
        parser.add_argument(
            "--load_best_model",
            action="store_true",
            help="Whether to load the best model at the end of training",
        )
        parser.add_argument(
            "--with_tracking",
            action="store_true",
            help="Whether to enable experiment trackers for logging.",
        )
        parser.add_argument(
            "--report_to",
            type=str,
            default="all",
            help=(
                'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
                ' `"wandb"`, `"comet_ml"`, and `"dvclive"`. Use `"all"` (default) to report to all integrations. '
                "Only applicable when `--with_tracking` is passed."
            ),
        )
        args = parser.parse_args()

        # Sanity checks
        if args.dataset_name is None and args.train_file is None and args.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        else:
            # Asserts are kept so callers that expect AssertionError keep working.
            if args.train_file is not None:
                extension = args.train_file.split(".")[-1]
                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
            if args.validation_file is not None:
                extension = args.validation_file.split(".")[-1]
                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."

        if args.push_to_hub:
            assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."

        return args