The no_robots dataset contains 10,000 samples, split into 9,500 training samples and 500 test samples, some of which lack a system message. The author loads the dataset with the datasets library, adds the missing system message to each conversation, and saves the splits to separate JSON files. The example code is shown below:
```python
from datasets import load_dataset

# Convert dataset to OAI messages
system_message = """You are Llama, an AI assistant created by Philipp to be helpful and honest. Your knowledge spans a wide range of topics, allowing you to engage in substantive conversations and provide analysis on complex subjects."""

def create_conversation(sample):
    if sample["messages"][0]["role"] == "system":
        return sample
    else:
        sample["messages"] = [{"role": "system", "content": system_message}] + sample["messages"]
        return sample

# Load dataset from the hub
dataset = load_dataset("HuggingFaceH4/no_robots")

# Add system message to each conversation
columns_to_remove = list(dataset["train"].features)
columns_to_remove.remove("messages")
dataset = dataset.map(create_conversation, remove_columns=columns_to_remove, batched=False)

# Filter out conversations with corrupted turns; keep those with an even number of turns after adding the system message
dataset["train"] = dataset["train"].filter(lambda x: len(x["messages"][1:]) % 2 == 0)
dataset["test"] = dataset["test"].filter(lambda x: len(x["messages"][1:]) % 2 == 0)

# Save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records", force_ascii=False)
dataset["test"].to_json("test_dataset.json", orient="records", force_ascii=False)
```
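As a quick sanity check (not part of the original snippet), you can reload the saved JSON and confirm that every conversation now opens with a system turn:

```python
from datasets import load_dataset

# Reload the saved split and verify the system message was prepended
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")
assert all(sample["messages"][0]["role"] == "system" for sample in train_dataset)
print(train_dataset[0]["messages"][0]["content"][:50])  # "You are Llama, an AI assistant created by Philipp ..."
```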
With the dataset prepared, define the training configuration as a YAML file, written from the notebook via the `%%writefile` magic:

```yaml
%%writefile llama_3_70b_fsdp_qlora.yaml
# script parameters
model_id: "meta-llama/Meta-Llama-3-70b" # Hugging Face model id
dataset_path: "."                       # path to dataset
max_seq_len: 3072                       # max sequence length for model and packing of the dataset
# training parameters
output_dir: "./llama-3-70b-hf-no-robot" # Temporary output directory for model checkpoints
report_to: "tensorboard"                # report metrics to tensorboard
learning_rate: 0.0002                   # learning rate 2e-4
lr_scheduler_type: "constant"           # learning rate scheduler
num_train_epochs: 3                     # number of training epochs
per_device_train_batch_size: 1          # batch size per device during training
per_device_eval_batch_size: 1           # batch size for evaluation
gradient_accumulation_steps: 2          # number of steps before performing a backward/update pass
optim: adamw_torch                      # use torch adamw optimizer
logging_steps: 10                       # log every 10 steps
save_strategy: epoch                    # save checkpoint every epoch
evaluation_strategy: epoch              # evaluate every epoch
max_grad_norm: 0.3                      # max gradient norm
warmup_ratio: 0.03                      # warmup ratio
bf16: true                              # use bfloat16 precision
tf32: true                              # use tf32 precision
gradient_checkpointing: true            # use gradient checkpointing to save memory
# FSDP parameters: https://huggingface.co/docs/transformers/main/en/fsdp
fsdp: "full_shard auto_wrap offload"    # remove offload if enough GPU memory
fsdp_config:
  backward_prefetch: "backward_pre"
  forward_prefetch: "false"
  use_orig_params: "false"
```
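For context, here is a minimal sketch of how a training script could consume this YAML. The script name `run_fsdp_qlora.py` and the `ScriptArguments` dataclass are assumptions for illustration, not part of the original recipe; transformers' `HfArgumentParser` distributes the mixed keys between the custom dataclass and `TrainingArguments`:

```python
# Hypothetical excerpt of a training script (e.g. run_fsdp_qlora.py);
# ScriptArguments is an assumed dataclass for the non-TrainingArguments keys.
from dataclasses import dataclass, field
from transformers import HfArgumentParser, TrainingArguments

@dataclass
class ScriptArguments:
    model_id: str = field(default=None, metadata={"help": "Hugging Face model id"})
    dataset_path: str = field(default=".", metadata={"help": "path to the dataset"})
    max_seq_len: int = field(default=3072, metadata={"help": "max sequence length"})

# parse_yaml_file assigns each key to the dataclass that declares it;
# the remaining keys (fsdp, bf16, optim, ...) map onto TrainingArguments.
parser = HfArgumentParser((ScriptArguments, TrainingArguments))
script_args, training_args = parser.parse_yaml_file("llama_3_70b_fsdp_qlora.yaml")
```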
The training run saves only the PEFT adapter. The commented-out block below, from the end of the training script, merges the LoRA weights back into the base model and saves the result:

```python
#### COMMENT IN TO MERGE PEFT AND BASE MODEL ####
# from peft import AutoPeftModelForCausalLM
#
# # Load PEFT model on CPU
# model = AutoPeftModelForCausalLM.from_pretrained(
#     args.output_dir,
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True,
# )
# # Merge LoRA and base model and save
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained(args.output_dir, safe_serialization=True, max_shard_size="2GB")
```
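If you do merge, the adapter is baked into the weights and the output directory becomes a plain Transformers checkpoint; a sketch of loading it afterwards (assuming the merge block above was run):

```python
import torch
from transformers import AutoModelForCausalLM

# After merging, no PEFT is needed at load time
model = AutoModelForCausalLM.from_pretrained(
    "./llama-3-70b-hf-no-robot",  # args.output_dir from the merge step
    torch_dtype=torch.float16,
    device_map="auto",
)
```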
To evaluate the fine-tuned model, load it together with its PEFT adapter in 4-bit precision:

```python
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

peft_model_id = "./llama-3-70b-hf-no-robot"

# Load Model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_id,
    torch_dtype=torch.float16,
    quantization_config={"load_in_4bit": True},
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
```
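Passing a raw dict as `quantization_config` works, but an explicit `BitsAndBytesConfig` is the more idiomatic equivalent and lets you pin the compute dtype; a sketch under the same setup:

```python
import torch
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig

peft_model_id = "./llama-3-70b-hf-no-robot"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # quantize the base weights to 4-bit
    bnb_4bit_compute_dtype=torch.float16,  # match the fp16 dtype used above
)
model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_id,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map="auto",
)
```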
Next, load the test dataset and try generating a response from an instruction.
```python
from datasets import load_dataset
from random import randint

# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset) - 1)  # randint is inclusive on both ends
messages = eval_dataset[rand_idx]["messages"][:2]  # system + user turn

# Test on sample
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(
    input_ids,
    max_new_tokens=512,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
response = outputs[0][input_ids.shape[-1]:]

print(f"**Query:**\n{eval_dataset[rand_idx]['messages'][1]['content']}\n")
print(f"**Original Answer:**\n{eval_dataset[rand_idx]['messages'][2]['content']}\n")
print(f"**Generated Answer:**\n{tokenizer.decode(response, skip_special_tokens=True)}")

# **Query:**
# How long was the Revolutionary War?
# **Original Answer:**
# The American Revolutionary War lasted just over seven years. The war started on April 19, 1775, and ended on September 3, 1783.
# **Generated Answer:**
# The Revolutionary War, also known as the American Revolution, was an 18th-century war fought between the Kingdom of Great Britain and the Thirteen Colonies. The war lasted from 1775 to 1783.
```
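A single sample is only anecdotal. As a small extension (not in the original), the same snippet can spot-check several random test prompts in one pass, reusing the `model`, `tokenizer`, and `eval_dataset` defined above:

```python
import random

# Spot-check a handful of random test prompts (illustrative only)
for idx in random.sample(range(len(eval_dataset)), k=3):
    messages = eval_dataset[idx]["messages"][:2]  # system + user turn
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    answer = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    print(f"### {eval_dataset[idx]['messages'][1]['content']}\n{answer}\n")
```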