Successfully fine-tuned with my own dataset
This commit is contained in:
parent 9dc75f007f
commit 8b3d089b9e
003加载自己的数据集微调.py (92 lines, normal file)
@@ -0,0 +1,92 @@
from datasets import load_dataset
from unsloth import FastLanguageModel
import torch
# Load the local jsonl file (one {"messages": [...]} record per line)
dataset = load_dataset("json", data_files="dataset/test_dataset.jsonl", split="train")
# Convert each record's messages into a single ChatML-formatted string field
def to_chatml(example):
    messages = example["messages"]
    chat = ""
    for m in messages:
        chat += f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n"
    return {"text": chat.strip()}
# Add the `text` field that the trainer will consume
dataset = dataset.map(to_chatml)

# print("\n", dataset[0])
# Load the pretrained model (4-bit quantized Qwen3-1.7B)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory; this enables QLoRA
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # Set True for full finetuning instead of LoRA
    # token = "hf_...",      # use one if using gated models
)
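# Optional sanity check, not in the original script: report how much GPU
# memory the quantized model occupies right after loading.
# print(f"reserved: {torch.cuda.max_memory_reserved() / 1024**3:.2f} GB")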
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
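# With r = 16 on a 1.7B-parameter model, only the small LoRA adapter matrices
# are trainable. A quick way to confirm (assuming the returned object is a
# standard PEFT model, which exposes this helper):
# model.print_trainable_parameters()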
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)
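# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
#                      = 2 * 4 = 8 examples per optimizer step,
# so max_steps = 30 consumes roughly 30 * 8 = 240 training examples.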
trainer.train()
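# Not part of this commit, but a minimal sketch of persisting the result
# ("lora_model" is a hypothetical output directory): save_pretrained on a
# LoRA model writes only the small adapter weights, not the 4-bit base model.
# model.save_pretrained("lora_model")
# tokenizer.save_pretrained("lora_model")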
# Quick smoke test after training
messages = [
    {"role" : "user", "content" : "介绍一下牛顿"}  # "Give an introduction to Newton"
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False,      # Disable Qwen3's thinking mode
)
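# Unsloth's examples usually switch the model into inference mode here for
# faster generation; optional, and not in the original script:
# FastLanguageModel.for_inference(model)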
from transformers import TextStreamer
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 256, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non-thinking mode
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)