from unsloth import FastLanguageModel
import torch

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
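Since get_peft_model returns a PEFT-wrapped model, you can sanity-check how small the trainable LoRA footprint is before training (a minimal check, assuming the standard peft PeftModel API is exposed on the returned object):

# Prints trainable vs. total parameter counts; with r = 16 on the modules above,
# only a small fraction of the 8B parameters is trainable.
model.print_trainable_parameters()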
Prepare the data:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)
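Before training, it is worth printing one mapped row to confirm the template was filled in correctly and that the text ends with the EOS token:

# Inspect a single formatted example from the mapped dataset.
print(dataset[0]["text"])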
Define the trainer:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)
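For reference, these arguments give an effective batch size of per_device_train_batch_size * gradient_accumulation_steps = 2 * 4 = 8, so the 60-step run only sees about 480 formatted examples on a single GPU. Treat it as a quick smoke test rather than a full pass over the dataset.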
Check memory usage:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
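The snippet above only records the baseline before training. To actually fine-tune and produce the lora_model directory that the next step loads, run the trainer and save the LoRA adapters (a minimal sketch of the usual Unsloth/TRL flow; the directory name must match what you reload below):

trainer_stats = trainer.train() # runs the 60-step fine-tune configured above

model.save_pretrained("lora_model")     # saves only the LoRA adapter weights
tokenizer.save_pretrained("lora_model")

With the adapters saved, reload them for inference: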
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "Answer the question based on truth", # instruction
        "Introduce the history of Beijing",   # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 1000, use_cache = True)
tokenizer.batch_decode(outputs)
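If you would rather watch the response stream out token by token instead of decoding the whole batch at the end, transformers' TextStreamer can be passed to generate (a small variant of the call above):

from transformers import TextStreamer

# Streams decoded tokens to stdout as they are generated.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 1000, use_cache = True)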