😍Fine-tuning with Hugging Face's PEFT Package
Installing the PEFT package and importing dependencies
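PEFT installs from PyPI, alongside the transformers and datasets packages used throughout (version pins are left to the reader):

pip install peft transformers datasets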
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
Preparing the training dataset
Training set format:
JSON, with three fields per record: instruction, input, and output.
[{"instruction": "你是农作物领域专门进行关系抽取的专家。请从给定的文本中抽取出关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。","input": "煤是一种常见的化石燃料,家庭用煤经过了从\"煤球\"到\"蜂窝煤\"的演变。","output": "[{\"head\": \"煤\", \"relation\": \"use\", \"tail\": \"燃料\"}]"},{"instruction": "你是农作物领域专门进行关系抽取的专家。请从给定的文本中抽取出关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。","input": "内分泌疾病是指内分泌腺或内分泌组织本身的分泌功能和(或)结构异常时发生的症候群。","output": "[{\"head\": \"腺\", \"relation\": \"use\", \"tail\": \"分泌\"}]"},
]
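Before training, each record can be checked for the three expected fields (a minimal sanity-check sketch; crop_train.json is the file loaded in the next step):

import json

with open("./crop_train.json", encoding="utf-8") as f:
    records = json.load(f)

# every record must carry the three expected keys
for i, rec in enumerate(records):
    missing = {"instruction", "input", "output"} - rec.keys()
    assert not missing, f"record {i} is missing fields: {missing}"
print(f"{len(records)} records OK")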
Data loading and preprocessing
Convert each training sample into the format GLM expects.
The prompt template GLM expects is as follows:
[gMASK]<sop>
<|system|> 你是关系抽取专家。
<|user|> 煤是一种常见的化石燃料,家庭用煤经过了从"煤球"到"蜂窝煤"的演变。
<|assistant|> [{"head": "煤", "relation": "use", "tail": "燃料"}] <|endoftext|>
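To inspect the exact string a given tokenizer renders, apply_chat_template can be called with tokenize=False (a quick inspection sketch using the local model path from the code below):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./glm-4-9b-chat", trust_remote_code=True)
text = tokenizer.apply_chat_template(
    [{"role": "system", "content": "你是关系抽取专家。"},
     {"role": "user", "content": "煤是一种常见的化石燃料,家庭用煤经过了从\"煤球\"到\"蜂窝煤\"的演变。"}],
    add_generation_prompt=True,  # append the <|assistant|> tag so the model continues from there
    tokenize=False,              # return the rendered string instead of token ids
)
print(text)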
Next: render each JSON object in crop_train.json into the template above, then tokenize it to produce the input_ids, attention_mask, and labels fields.
from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("json", data_files="./crop_train.json", split="train")
print(f"dataset: {dataset}")

tokenizer = AutoTokenizer.from_pretrained("./glm-4-9b-chat", trust_remote_code=True)
print(f"tokenizer: {tokenizer}")

def process_func(example):
    MAX_LENGTH = 256
    # Merge the instruction and input fields into a single query string
    instruction = f"{example['instruction']} {example['input']}".strip()
    # Render the query with GLM's chat template:
    # '[gMASK] <sop> <|user|> \nquery <|assistant|>'
    instruction = tokenizer.apply_chat_template(
        [{"role": "user", "content": instruction}],
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    )
    # The output field may be a list or a plain string; normalize it
    if isinstance(example["output"], list):
        response_text = "\n".join(example["output"])
    else:
        response_text = "\n" + example["output"]
    # Tokenized without special tokens, so the eos token is appended by hand below
    response = tokenizer(response_text, add_special_tokens=False)
    prompt_ids = instruction["input_ids"][0].tolist()
    prompt_mask = instruction["attention_mask"][0].tolist()
    input_ids = prompt_ids + response["input_ids"] + [tokenizer.eos_token_id]
    attention_mask = prompt_mask + response["attention_mask"] + [1]
    # Mask the prompt with -100 so the loss is computed only on the response
    labels = [-100] * len(prompt_ids) + response["input_ids"] + [tokenizer.eos_token_id]
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

tokenized_ds = dataset.map(process_func, remove_columns=['instruction', 'input', 'output'])
print(f"tokenized_ds: {tokenized_ds}")  # features: ['input_ids', 'attention_mask', 'labels']
input_ids_1 = tokenized_ds[0]["input_ids"]
attention_mask_1 = tokenized_ds[0]["attention_mask"]
labels_1 = tokenized_ds[0]["labels"]
print(f"input_ids_1: {input_ids_1}")
print(f"attention_mask_1: {attention_mask_1}")
print(f"labels_1: {labels_1}")input_text_1 = tokenizer.decode(input_ids_1)
print(f"input_ids_1_decode: {input_text_1}")
Downloading and creating the model
import torch
model = AutoModelForCausalLM.from_pretrained("./glm-4-9b-chat", trust_remote_code=True, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, device_map="auto")
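One caveat: gradient_checkpointing=True is used in the TrainingArguments below, and combining it with a PEFT adapter usually requires input gradients to be switched on, otherwise the LoRA weights receive no gradient:

# needed when gradient checkpointing is combined with PEFT adapters
model.enable_input_require_grads()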
PEFT LoRA code
Configuration (LoraConfig)
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,                   # causal-LM fine-tuning
    target_modules=["query_key_value"],             # GLM fuses the Q/K/V projections into one module
    modules_to_save=["post_attention_layernorm"],   # train and save these modules in full
)
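This relies on PEFT's defaults for the LoRA hyperparameters (r=8, lora_alpha=8, lora_dropout=0.0). Spelled out explicitly, with values that mirror those defaults and are illustrative rather than tuned:

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["query_key_value"],
    modules_to_save=["post_attention_layernorm"],
    r=8,              # LoRA rank
    lora_alpha=8,     # scaling factor (effective scale = lora_alpha / r)
    lora_dropout=0.0,
)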
Create the PEFT model
model = get_peft_model(model, config)
model.print_trainable_parameters()  # prints trainable params vs. all params itself, no outer print() needed
Configure the training arguments
args = TrainingArguments(
    output_dir="./chatbot",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # effective batch size per device: 2 * 8 = 16
    gradient_checkpointing=True,
    logging_steps=100,
    num_train_epochs=10,
    learning_rate=1e-4,
    remove_unused_columns=False,
    save_strategy="epoch",
)
Create the trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds.select(range(10000)),  # train on the first 10,000 examples
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),  # pads labels with -100 by default
)
Start training
trainer.train()
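If the run is interrupted, the standard Trainer mechanism can resume from the most recent checkpoint in output_dir:

trainer.train(resume_from_checkpoint=True)  # picks up the latest checkpoint under ./chatbot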
Save the weights
lora_path = './GLM4'
trainer.model.save_pretrained(lora_path)
tokenizer.save_pretrained(lora_path)
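save_pretrained on a PeftModel stores only the adapter, not the 9B base weights, so the folder stays small. Typical contents (exact filenames depend on the peft version):

import os
print(os.listdir(lora_path))  # expect adapter_config.json and adapter_model.safetensors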
Inference: loading the LoRA weights on top of the base model
The PeftModel class can load the base model and the LoRA-trained weights together.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

mode_path = '/root/autodl-tmp/glm-4-9b-chat/ZhipuAI/glm-4-9b-chat'
lora_path = './GLM4'  # the directory the LoRA weights were saved to above

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)

# Load the base model
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# Load the LoRA weights
model = PeftModel.from_pretrained(model, model_id=lora_path)

prompt = "你是谁?"
inputs = tokenizer.apply_chat_template(
    [{"role": "system", "content": "假设你是皇帝身边的女人--甄嬛。"},
     {"role": "user", "content": prompt}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
).to('cuda')

gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]  # strip the prompt tokens from the output
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
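The system prompt above is a role-play carried over from a different demo; to sanity-check the relation-extraction adapter trained here, it is more natural to reuse the training-time layout, where instruction and input share a single user turn (same generate pattern as above):

instruction = "你是农作物领域专门进行关系抽取的专家。请从给定的文本中抽取出关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。"
text = "煤是一种常见的化石燃料,家庭用煤经过了从\"煤球\"到\"蜂窝煤\"的演变。"
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": f"{instruction} {text}"}],  # matches the instruction+input layout of process_func
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
).to('cuda')
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=128, do_sample=False)  # greedy decoding for a deterministic check
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))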
Merging the LoRA model
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Assumed here: base_model points at the base checkpoint and config_kwargs
# carries the usual remote-code flag; adjust both to your environment.
base_model = "./glm-4-9b-chat"
config_kwargs = {"trust_remote_code": True}

# Load the pretrained tokenizer and config
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True, padding_side="left", **config_kwargs)
print("Tokenizer Load Success!")
config = AutoConfig.from_pretrained(base_model, **config_kwargs)

# Load the pretrained model (without a value head)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    config=config,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    revision='main'
)
print('origin config =', model.config)

# Merge the LoRA weights into the base model
lora_path = "./save_lora"
model = PeftModel.from_pretrained(model, lora_path)
model = model.merge_and_unload()
print('merge config =', model.config)
# 保存合并模型
save_path = "./save_merge_model
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
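After the merge, save_path holds a self-contained checkpoint that loads like any ordinary model, with no peft dependency (a usage sketch under the paths assumed above):

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("./save_merge_model", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "./save_merge_model",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
).eval()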