
LoRA fine-tuning of Llama 3 fails with NotImplementedError: Cannot copy out of meta tensor; no data! #115

Open
chenmonster opened this issue May 14, 2024 · 3 comments

Comments

@chenmonster

In a Colab environment, running the Llama 3 LoRA fine-tuning raises NotImplementedError: Cannot copy out of meta tensor; no data!. The fine-tuning code is as follows:

import pandas as pd
from modelscope import snapshot_download
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, TaskType, get_peft_model


def process_func(example):
    MAX_LENGTH = 384    # the Llama tokenizer splits a single Chinese character into multiple tokens, so allow a larger max length to keep the data intact
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|start_header_id|>user<|end_header_id|>\n\n{example['instruction'] + example['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", add_special_tokens=False)  # add_special_tokens=False: do not prepend special tokens
    response = tokenizer(f"{example['output']}<|eot_id|>", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # the eos token also needs attention, so append 1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # truncate
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


# Download the model
model_dir = snapshot_download('LLM-Research/Meta-Llama-3-8B-Instruct', cache_dir='.', revision='master')
print('Model path:', model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map='auto', torch_dtype=torch.bfloat16)
model.enable_input_require_grads()  # enable input gradients (needed when training with gradient checkpointing)
# Download the dataset
df = pd.read_json('https://github.com/datawhalechina/self-llm/raw/master/dataset/huanhuan.json')
ds = Dataset.from_pandas(df)
print('Dataset sample:', ds[:3])
# Tokenize the dataset
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
# Define the LoraConfig
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,  # training mode
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha; see the LoRA paper for its effect
    lora_dropout=0.1  # dropout rate
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
# Define the TrainingArguments
args = TrainingArguments(
    output_dir='./output/llama3',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True
)
# Train
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)
trainer.train()
# Save the LoRA adapter
lora_path='./llama3_lora'
trainer.model.save_pretrained(lora_path)
tokenizer.save_pretrained(lora_path)

(screenshot of the error traceback)
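For context: Meta-Llama-3-8B-Instruct in bf16 needs roughly 8B × 2 bytes ≈ 16 GB for the weights alone, so on a standard Colab T4 (about 15 GB of VRAM) device_map='auto' will offload part of the model off the GPU, which is a common trigger for this error once Trainer later tries to move the model. A minimal check of the available GPU memory (standard PyTorch calls, not part of the tutorial):

import torch

# Report the GPU name plus total and currently free memory on device 0.
props = torch.cuda.get_device_properties(0)
free, total = torch.cuda.mem_get_info(0)
print(f"GPU: {props.name}, total {total / 1e9:.1f} GB, free {free / 1e9:.1f} GB")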

@KMnO4-zx
Contributor

Hmm, the tutorial has not been tested in a Colab environment. You could step through it to find where the bug is, or use the same AutoDL environment as the tutorial.
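A quick way to confirm whether offloading is happening (a debugging sketch, not from the tutorial) is to print the device map that accelerate records when the model is loaded with device_map='auto'; any module mapped to 'cpu' or 'disk' has been offloaded, which is what typically leads to meta tensors with no data:

model = AutoModelForCausalLM.from_pretrained(model_dir, device_map='auto', torch_dtype=torch.bfloat16)
# hf_device_map maps each top-level module to a GPU index, 'cpu', or 'disk'.
print(model.hf_device_map)
offloaded = {name: dev for name, dev in model.hf_device_map.items() if dev in ('cpu', 'disk')}
print('Offloaded modules:', offloaded)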

@chenmonster
Author

model = AutoModelForCausalLM.from_pretrained(model_dir, device_map='auto', torch_dtype=torch.bfloat16)

Changing the line above to

model = AutoModelForCausalLM.from_pretrained(model_dir, device_map='cuda', torch_dtype=torch.half, trust_remote_code=True)

and re-running it, I get the error:

OutOfMemoryError: CUDA out of memory.

@KMnO4-zx
Contributor

torch.half loads the weights in half precision (fp16). Some GPUs don't support bf16, but every GPU should support fp16, so that shouldn't be what causes the OOM.
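If the OOM persists, a common workaround on a ~15 GB Colab GPU is to load the base model in 4-bit with bitsandbytes before attaching the LoRA adapter. A QLoRA-style sketch, not part of the original tutorial; it assumes bitsandbytes is installed:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

# Use bf16 compute where the GPU supports it, otherwise fall back to fp16.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                 # quantize the weights to 4-bit NF4
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_dir,                         # same path returned by snapshot_download
    quantization_config=bnb_config,
    device_map='auto',
)
model = prepare_model_for_kbit_training(model)  # re-enables input grads, casts norms to fp32
# ...then apply get_peft_model(model, config) and train as before.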
