Hi!
I tried to fine-tune the llama2-7b 4-bit model and got this error:

```
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
```

Code:
```python
from unsloth import FastLanguageModel
import os
import random
import numpy as np
from sklearn.metrics import f1_score
from datetime import datetime
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import EarlyStoppingCallback, TrainingArguments
from peft import PeftModel
from datasets import load_dataset, load_metric
from torcheval.metrics import Perplexity
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from model.LLM import load_model
from Dataset.MedDialogueDataset import generate_prompt_in_batch

os.environ["WANDB_PROJECT"] = "llm_training"
os.environ["WANDB_LOG_MODEL"] = "checkpoints"
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def load_model(model_path, max_len: int = 4096, r: int = 16, alpha: int = 32, dropout: float = 0.1):
    model, tokenizer = FastLanguageModel.from_pretrained(model_name=model_path,
                                                         max_seq_length=max_len,
                                                         dtype=None,
                                                         load_in_4bit=True,
                                                         device_map='auto')
    model = FastLanguageModel.get_peft_model(model=model,
                                             r=r,
                                             target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                                                             "gate_proj", "up_proj", "down_proj"],
                                             lora_alpha=alpha,
                                             lora_dropout=dropout,
                                             use_gradient_checkpointing='unsloth',
                                             random_state=42,
                                             use_rslora=False,
                                             loftq_config=None)
    return model, tokenizer


def compute_metrics(eval_pred, tokenizer):
    bleu = load_metric('bleu')
    meteor = load_metric('meteor')
    rouge = load_metric('rouge')
    perplexity = Perplexity(ignore_index=-100)

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    valid_ids = labels != -100
    valid_predictions = predictions[valid_ids]
    valid_labels = labels[valid_ids]
    predictions = [tokenizer.tokenize(tokenizer.decode(ids, skip_special_tokens=True)) for ids in valid_predictions]
    references = [[tokenizer.tokenize(tokenizer.decode([label], skip_special_tokens=True))] for label in valid_labels]

    perplexity.update(torch.tensor(logits), torch.tensor(labels))
    bleu2_score = bleu.compute(predictions=predictions, references=references, max_order=2)['bleu']
    bleu4_score = bleu.compute(predictions=predictions, references=references, max_order=4)['bleu']
    meteor_score = meteor.compute(predictions=predictions, references=references)['meteor']
    rouge = rouge.compute(predictions=predictions, references=references)
    f1 = f1_score(valid_labels, valid_predictions, average='macro')
    ppl = perplexity.compute().item()

    return {'bleu2': bleu2_score,
            'bleu4': bleu4_score,
            'meteor': meteor_score,
            'rouge1': rouge['rouge1'].mid.fmeasure,
            'rouge2': rouge['rouge2'].mid.fmeasure,
            'rougeL': rouge['rougeL'].mid.fmeasure,
            'rougeLsum': rouge['rougeLsum'].mid.fmeasure,
            'f1': f1,
            'perplexity': ppl}


def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def main(args):
    seed_everything(args.seed)

    # Set the arguments
    model_name = args.model_name
    lr = args.learning_rate
    batch_size = args.batch_size
    accumulation_step = args.accumulation_step
    using_scheduler = args.using_scheduler
    sch_type = args.scheduler_type
    num_epochs = args.num_epochs
    train_path = args.train_path
    val_path = args.val_path
    test_path = args.test_path
    save_name = args.save_path + '/' + model_name.split("/")[-1]
    merged_model_name = 'full_model' + '/' + model_name.split("/")[-1]
    r = args.r
    lora_dropout = args.lora_dropout
    lora_alpha = args.lora_alpha

    # Load the model
    model, tokenizer = load_model(model_name, r=r, alpha=lora_alpha, dropout=lora_dropout)
    special_tokens_dict = {'additional_special_tokens': ['<INST>', '</INST>']}
    tokenizer.pad_token = tokenizer.unk_token
    _ = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    torch.cuda.empty_cache()

    # Load the dataset
    files = {'train': train_path, 'validation': val_path, 'test': test_path}
    dataset = load_dataset('json', data_files=files)
    dataset = dataset.map(lambda x: {'prompt': generate_prompt_in_batch(x)},
                          batched=True,
                          load_from_cache_file=False,
                          remove_columns=['description', 'utterances'])
    print(f'Train Dataset: {len(dataset["train"])} | Valid Dataset: {len(dataset["validation"])} | Test Dataset: {len(dataset["test"])}')
    data_collator = DataCollatorForCompletionOnlyLM(tokenizer=tokenizer,
                                                    instruction_template='<INST>',
                                                    response_template='</INST>',
                                                    mlm=False,
                                                    return_tensors='pt')

    # Early stopping callback and scheduler
    early_stopping = EarlyStoppingCallback(early_stopping_patience=num_epochs * 0.1)
    if using_scheduler and sch_type == 'cosine':
        warmup_step = args.warmup_step
        scheduler_type = 'cosine'
    elif using_scheduler and sch_type == 'reduce_on_plateau':
        warmup_step = 0
        scheduler_type = 'reduce_on_plateau'
    else:
        warmup_step = 0
        scheduler_type = 'constant'

    # Training the model
    training_args = TrainingArguments(per_device_train_batch_size=batch_size,
                                      gradient_accumulation_steps=accumulation_step,
                                      warmup_steps=warmup_step,
                                      num_train_epochs=num_epochs,
                                      learning_rate=lr,
                                      fp16=not torch.cuda.is_bf16_supported(),
                                      bf16=torch.cuda.is_bf16_supported(),
                                      logging_steps=5,
                                      optim="paged_adamw_8bit",
                                      weight_decay=0.1,
                                      lr_scheduler_type=scheduler_type,
                                      do_eval=True,
                                      evaluation_strategy='epoch',
                                      save_strategy='epoch',
                                      load_best_model_at_end=True,
                                      greater_is_better=False,
                                      metric_for_best_model='perplexity',
                                      seed=3407,
                                      output_dir=args.save_path,
                                      logging_dir='./logs',
                                      run_name=f'{model_name.split("/")[-1]}_{datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}',
                                      report_to='wandb')
    trainer = SFTTrainer(model=model,
                         tokenizer=tokenizer,
                         args=training_args,
                         max_seq_length=args.max_len,
                         dataset_num_proc=2,
                         train_dataset=dataset['train'],
                         eval_dataset=dataset['validation'],
                         dataset_text_field='prompt',
                         packing=False,
                         data_collator=data_collator,
                         compute_metrics=lambda x: compute_metrics(x, tokenizer))
    tester = SFTTrainer(model=model,
                        tokenizer=tokenizer,
                        args=training_args,
                        dataset_text_field='prompt',
                        max_seq_length=args.max_len,
                        dataset_num_proc=2,
                        eval_dataset=dataset['test'],
                        packing=False,
                        data_collator=data_collator,
                        compute_metrics=lambda x: compute_metrics(x, tokenizer))
    model.config.use_cache = False
    trainer.train()
    result = tester.evaluate()
    print(result)

    # Save the adapter weight
    model.save_pretrained_merged(save_name, tokenizer, save_method='lora')
    # Save the merged model (base model weights + QLoRA weights)
    model.save_pretrained_merged(merged_model_name, tokenizer, save_method="merged_4bit")
```
Do you know exactly where / which line the error pops out?
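The full traceback would help a lot. If it's getting swallowed somewhere, a minimal sketch like this (wrapping your existing `trainer.train()` call) should surface the exact line:

```python
import traceback

try:
    trainer.train()
except RuntimeError:
    # Print the complete stack trace so we can see which op is missing grad_fn
    traceback.print_exc()
    raise
```

Just a guess until we see the trace: this error often appears when gradient checkpointing runs on a model whose inputs don't require grads; in plain transformers/PEFT setups, calling `model.enable_input_require_grads()` before training sometimes resolves it.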