
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn #454

Open
Coding-Child opened this issue May 12, 2024 · 1 comment


Coding-Child commented May 12, 2024

Hi!

I tried to fine-tune the llama2-7b model in 4-bit and got this error:

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
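
For context, this RuntimeError comes from torch.autograd: .backward() was called on a loss tensor that has no graph attached, meaning nothing upstream of it required grad. A minimal standalone reproduction of the same error:

import torch

x = torch.ones(3)  # requires_grad is False by default
loss = x.sum()     # loss has no grad_fn because no input required grad
loss.backward()    # raises: element 0 of tensors does not require grad and does not have a grad_fn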

Code:

from unsloth import FastLanguageModel
import os
import random
import numpy as np
from sklearn.metrics import f1_score
from datetime import datetime

import torch

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import EarlyStoppingCallback, TrainingArguments

from peft import PeftModel
from datasets import load_dataset, load_metric
from torcheval.metrics import Perplexity
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# from model.LLM import load_model  # unused: shadowed by the load_model defined below
from Dataset.MedDialogueDataset import generate_prompt_in_batch
os.environ["WANDB_PROJECT"] = "llm_training"
os.environ["WANDB_LOG_MODEL"] = "checkpoints"
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def load_model(model_path, max_len: int = 4096, r: int = 16, alpha: int = 32, dropout: float = 0.1):
    model, tokenizer = FastLanguageModel.from_pretrained(model_name=model_path,
                                                         max_seq_length=max_len,
                                                         dtype=None,
                                                         load_in_4bit=True,
                                                         device_map='auto'
                                                         )
    
    model = FastLanguageModel.get_peft_model(model=model,
                                             r=r,
                                             target_modules=["q_proj", 
                                                             "k_proj", 
                                                             "v_proj", 
                                                             "o_proj", 
                                                             "gate_proj", 
                                                             "up_proj", 
                                                             "down_proj"
                                                             ],
                                             lora_alpha=alpha,
                                             lora_dropout=dropout,
                                             use_gradient_checkpointing='unsloth',
                                             random_state=42,
                                             use_rslora=False,
                                             loftq_config=None,
                                             )

    return model, tokenizer


def compute_metrics(eval_pred, tokenizer):
    bleu = load_metric('bleu')
    meteor = load_metric('meteor')
    rouge = load_metric('rouge')
    perplexity = Perplexity(ignore_index=-100)

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    valid_ids = labels != -100
    valid_predictions = predictions[valid_ids]
    valid_labels = labels[valid_ids]

    predictions = [tokenizer.tokenize(tokenizer.decode(ids, skip_special_tokens=True)) for ids in valid_predictions]
    references = [[tokenizer.tokenize(tokenizer.decode([label], skip_special_tokens=True))] for label in valid_labels]

    perplexity.update(torch.tensor(logits), torch.tensor(labels))

    bleu2_score = bleu.compute(predictions=predictions, references=references, max_order=2)['bleu']
    bleu4_score = bleu.compute(predictions=predictions, references=references, max_order=4)['bleu']
    meteor_score = meteor.compute(predictions=predictions, references=references)['meteor']
    rouge = rouge.compute(predictions=predictions, references=references)
    f1 = f1_score(valid_labels, valid_predictions, average='macro')
    ppl = perplexity.compute().item()

    return {'bleu2': bleu2_score,
            'bleu4': bleu4_score,
            'meteor': meteor_score,
            'rouge1': rouge['rouge1'].mid.fmeasure,
            'rouge2': rouge['rouge2'].mid.fmeasure,
            'rougeL': rouge['rougeL'].mid.fmeasure,
            'rougeLsum': rouge['rougeLsum'].mid.fmeasure,
            'f1': f1,
            'perplexity': ppl
            }
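

As a side note, datasets.load_metric (used above) is deprecated in favor of the evaluate library; the same metrics load as, e.g.:

import evaluate

bleu = evaluate.load('bleu')
meteor = evaluate.load('meteor')
rouge = evaluate.load('rouge')

Note that evaluate's rouge returns plain floats by default rather than objects with .mid, so the rouge indexing above would need adjusting.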


def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def main(args):
    seed_everything(args.seed)

    # Set the arguments
    model_name = args.model_name
    lr = args.learning_rate
    batch_size = args.batch_size
    accumulation_step = args.accumulation_step
    using_scheduler = args.using_scheduler
    sch_type = args.scheduler_type
    num_epochs = args.num_epochs
    train_path = args.train_path
    val_path = args.val_path
    test_path = args.test_path
    save_name = args.save_path + '/' + model_name.split("/")[-1]
    merged_model_name = 'full_model' + '/' + model_name.split("/")[-1]
    r = args.r
    lora_dropout = args.lora_dropout
    lora_alpha = args.lora_alpha

    # Load the model
    model, tokenizer = load_model(model_name, r=r, alpha=lora_alpha, dropout=lora_dropout)

    special_tokens_dict = {'additional_special_tokens': ['<INST>', '</INST>']}
    tokenizer.pad_token = tokenizer.unk_token
    _ = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
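    # A common trigger for "element 0 of tensors does not require grad" when a
    # 4-bit PEFT model is trained with gradient checkpointing is that the
    # embedding output does not require grad. transformers provides a hook for
    # this case; whether it applies here is an assumption, not confirmed in
    # this thread:
    # model.enable_input_require_grads()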

    torch.cuda.empty_cache()

    # Load the dataset
    files = {'train': train_path, 'validation': val_path, 'test': test_path}
    dataset = load_dataset('json', data_files=files)
    dataset = dataset.map(lambda x: {'prompt': generate_prompt_in_batch(x)}, batched=True, load_from_cache_file=False,
                          remove_columns=['description', 'utterances'])

    print(f'Train Dataset: {len(dataset["train"])} | Valid Dataset: {len(dataset["validation"])} | Test Dataset: {len(dataset["test"])}')

    data_collator = DataCollatorForCompletionOnlyLM(tokenizer=tokenizer,
                                                    instruction_template='<INST>',
                                                    response_template='</INST>',
                                                    mlm=False,
                                                    return_tensors='pt'
                                                    )
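    # DataCollatorForCompletionOnlyLM sets labels to -100 outside the response
    # segment, so loss is computed only on the completion. Both templates must
    # tokenize to the same ids in context as they do standalone, which is why
    # '<INST>' and '</INST>' are registered as special tokens above.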

    # Early stopping callback and scheduler
    early_stopping = EarlyStoppingCallback(early_stopping_patience=max(1, int(num_epochs * 0.1)))  # patience must be a positive int
    if using_scheduler and sch_type == 'cosine':
        warmup_step = args.warmup_step
        scheduler_type = 'cosine'
    elif using_scheduler and sch_type == 'reduce_on_plateau':
        warmup_step = 0
        scheduler_type = 'reduce_on_plateau'
    else:
        warmup_step = 0
        scheduler_type = 'constant'

    # Training the model
    training_args = TrainingArguments(per_device_train_batch_size=batch_size,
                                      gradient_accumulation_steps=accumulation_step,
                                      warmup_steps=warmup_step,
                                      num_train_epochs=num_epochs,
                                      learning_rate=lr,
                                      fp16=not torch.cuda.is_bf16_supported(),
                                      bf16=torch.cuda.is_bf16_supported(),
                                      logging_steps=5,
                                      optim="paged_adamw_8bit",
                                      weight_decay=0.1,
                                      lr_scheduler_type=scheduler_type,
                                      do_eval=True,
                                      evaluation_strategy='epoch',
                                      save_strategy='epoch',
                                      load_best_model_at_end=True,
                                      greater_is_better=False,
                                      metric_for_best_model='perplexity',
                                      seed=3407,
                                      output_dir=args.save_path,
                                      logging_dir='./logs',
                                      run_name=f'{model_name.split("/")[-1]}_{datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}',
                                      report_to='wandb', 
                                      )

    trainer = SFTTrainer(model=model,
                         tokenizer=tokenizer,
                         args=training_args,
                         max_seq_length=args.max_len,
                         dataset_num_proc=2,
                         train_dataset=dataset['train'],
                         eval_dataset=dataset['validation'],
                         dataset_text_field='prompt',
                         packing=False,
                         data_collator=data_collator,
                         compute_metrics=lambda x: compute_metrics(x, tokenizer),
                         callbacks=[early_stopping]  # attach the early-stopping callback defined above
                         )

    tester = SFTTrainer(model=model,
                        tokenizer=tokenizer,
                        args=training_args,
                        dataset_text_field='prompt',
                        max_seq_length=args.max_len,
                        dataset_num_proc=2,
                        eval_dataset=dataset['test'],
                        packing=False,
                        data_collator=data_collator,
                        compute_metrics=lambda x: compute_metrics(x, tokenizer)
                        )

    model.config.use_cache = False
    trainer.train()
    result = tester.evaluate()
    print(result)

    # save the adapter weight
    model.save_pretrained_merged(save_name, tokenizer, save_method='lora')

    # Save the merged model (base model weights + QLoRA weights)
    model.save_pretrained_merged(merged_model_name, tokenizer, save_method="merged_4bit")
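

As posted, the script never invokes main(). A minimal argparse entry point, with argument names inferred from the args.* attributes used above (all defaults here are assumptions):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--learning_rate', type=float, default=2e-4)
    parser.add_argument('--batch_size', type=int, default=4)
    parser.add_argument('--accumulation_step', type=int, default=4)
    parser.add_argument('--using_scheduler', action='store_true')
    parser.add_argument('--scheduler_type', type=str, default='cosine')
    parser.add_argument('--warmup_step', type=int, default=0)
    parser.add_argument('--num_epochs', type=int, default=3)
    parser.add_argument('--train_path', type=str, required=True)
    parser.add_argument('--val_path', type=str, required=True)
    parser.add_argument('--test_path', type=str, required=True)
    parser.add_argument('--save_path', type=str, default='./checkpoints')
    parser.add_argument('--max_len', type=int, default=4096)
    parser.add_argument('--r', type=int, default=16)
    parser.add_argument('--lora_alpha', type=int, default=32)
    parser.add_argument('--lora_dropout', type=float, default=0.1)
    main(parser.parse_args())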

@danielhanchen (Contributor) commented

Do you know exactly where / which line the error pops out?
