External Publication
Visit Post

Fine-Tuning an SLM for a Low-Resource Language

Hugging Face Forums [Unofficial] June 3, 2026
Source

Thank you very much for your help! My target language is Persian (Iranian), and for the base model I used Qwen 3 0.6B Instruct. I chose this model because I am targeting low-end devices, such as phones and school computers.

My main limitation is hardware. I have an RTX 3050 Mobile (4 GB), 32 GB of RAM, and an Intel i5-11400H.

If you have time, I have some more questions:

I tried to continue pretraining through a LoRA adapter using the Unsloth library in Python with these LoRA adapter settings:

# Wrap the already-loaded `model` with LoRA adapters via Unsloth.
model = FastLanguageModel.get_peft_model(

    model,

    # LoRA rank.
    r=8,

    # The seven projection modules that receive adapters.
    target_modules=[

        "q_proj", "k_proj", "v_proj", "o_proj",

        "gate_proj", "up_proj", "down_proj",

    ],

    # alpha / r = 16 / 8 = 2.
    lora_alpha=16,

    # No dropout on the adapter path.
    lora_dropout=0,

    bias="none",

    use_gradient_checkpointing="unsloth",

    # Fixed seed so adapter initialisation is repeatable.
    random_state=3407,

    use_rslora=False,

    loftq_config=None,

)

I used a .jsonl file containing semi-cleaned raw Wikipedia text in the target language, but the results were not very good: the trained model was weaker than the original. Unfortunately, I do not have the training logs, but as far as I remember, the loss started at around 2.3 and had dropped to about 1.5 by the end of training.

I used an 800 MB cleaned Persian Wikipedia .jsonl file to test whether the model would improve or not, but the results were negative, at least in my setup.

For fine-tuning, I used two datasets:

Persian-Wiki-QA (a Persian Wikipedia QA dataset)

Alpaca-Persian-Cleaned (a cleaned Persian translation of Alpaca, translated using Google Translate)

This is the training script I used to train the model:

import os

from datasets import load_dataset

from unsloth import FastLanguageModel

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling





# Local directory of the base model passed to FastLanguageModel.from_pretrained.
MODEL_PATH = r"F:\Deep Learning\Model\Qwen 3 0.6B SafeTensor"

# JSON Lines training corpus; main() reads the "Text" field of each record.
DATA_PATH  = r"E:\Ava1\Dataset-pars\Wikipedia-Persian\Wikipedia-Persian-textonly.jsonl"

# Where Trainer checkpoints and the final adapter/tokenizer are written.
OUTPUT_DIR = r"F:\Deep Learning\Outputs\qwen3-0.6b-persian-pretrain-lora"



# Used both as the model's max_seq_length and as the tokenizer truncation length.
MAX_SEQ_LENGTH = 256

# Forwarded to from_pretrained(load_in_4bit=...).
LOAD_IN_4BIT = True



def get_latest_checkpoint(output_dir):
    """Return the path of the highest-numbered checkpoint directory.

    Scans the immediate children of *output_dir* for directories named
    ``checkpoint-<step>`` and picks the one with the largest integer step.

    Args:
        output_dir: Directory that may contain ``checkpoint-<step>`` folders.

    Returns:
        The path (``output_dir`` joined with the folder name) of the newest
        checkpoint, or ``None`` when *output_dir* is not an existing
        directory or holds no validly named checkpoint directory.
    """
    # isdir (not exists) so that a path pointing at a regular file is treated
    # as "no checkpoints" instead of making os.listdir raise.
    if not os.path.isdir(output_dir):
        return None

    checkpoints = []
    for name in os.listdir(output_dir):
        path = os.path.join(output_dir, name)
        if not (os.path.isdir(path) and name.startswith("checkpoint-")):
            continue
        try:
            step = int(name.split("-")[-1])
        except ValueError:
            # Suffix is not a number (e.g. "checkpoint-tmp"); skip only this
            # entry. Anything else, such as KeyboardInterrupt, must propagate.
            continue
        checkpoints.append((step, path))

    if not checkpoints:
        return None

    checkpoints.sort(key=lambda item: item[0])
    return checkpoints[-1][1]



def main():
    """Train a LoRA adapter on the text in DATA_PATH and save it to OUTPUT_DIR.

    Steps: load the base model and tokenizer with Unsloth, attach LoRA
    adapters, load and clean the JSON Lines corpus, tokenize it, then run a
    Hugging Face ``Trainer`` (resuming from the newest checkpoint in
    OUTPUT_DIR if one exists) and save the model and tokenizer.
    """

    os.makedirs(OUTPUT_DIR, exist_ok=True)



    model, tokenizer = FastLanguageModel.from_pretrained(

        model_name=MODEL_PATH,

        max_seq_length=MAX_SEQ_LENGTH,

        # NOTE(review): dtype=None presumably lets Unsloth pick the dtype - confirm.
        dtype=None,

        load_in_4bit=LOAD_IN_4BIT,

    )



    # Fall back to the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:

        tokenizer.pad_token = tokenizer.eos_token



    # Attach LoRA adapters (rank 8, alpha 16) to the listed projection modules.
    model = FastLanguageModel.get_peft_model(

        model,

        r=8,

        target_modules=[

            "q_proj", "k_proj", "v_proj", "o_proj",

            "gate_proj", "up_proj", "down_proj",

        ],

        lora_alpha=16,

        lora_dropout=0,

        bias="none",

        use_gradient_checkpointing="unsloth",

        random_state=3407,

        use_rslora=False,

        loftq_config=None,

    )



    dataset = load_dataset("json", data_files=DATA_PATH, split="train")



    def clean_example(example):
        # Reads the "Text" field (capital T); a missing or None value becomes "".
        # The stripped string is returned under the lowercase key "text".

        text = example.get("Text", "")

        if text is None:

            text = ""

        return {"text": str(text).strip()}



    # Replace every original column with the single cleaned "text" column.
    dataset = dataset.map(

        clean_example,

        remove_columns=dataset.column_names,

        load_from_cache_file=False,

    )



    # Keep only records longer than 50 characters (this also drops the
    # empty strings produced above for missing text).
    dataset = dataset.filter(

        lambda x: len(x["text"]) > 50,

        load_from_cache_file=False,

    )



    # Empty string when the tokenizer has no EOS token, so the append is a no-op.
    eos_token = tokenizer.eos_token or ""



    def add_eos(example):
        # Append the EOS string to the end of each document.

        return {"text": example["text"] + eos_token}



    dataset = dataset.map(add_eos, load_from_cache_file=False)



    # Only the first 2000 surviving records are used for training.
    # NOTE(review): the rest of the corpus file is never seen - confirm this
    # cap is intended beyond a smoke test.
    dataset = dataset.select(range(min(2000, len(dataset))))



    def tokenize_function(examples):
        # Tokenize a batch; each text is truncated to MAX_SEQ_LENGTH tokens,
        # so anything past that point in a document (including the appended
        # EOS) is discarded. No padding is applied here.

        outputs = tokenizer(

            examples["text"],

            truncation=True,

            max_length=MAX_SEQ_LENGTH,

            padding=False,

        )

        # Labels start as a copy of the input ids.
        # NOTE(review): the collator below presumably rebuilds labels when
        # mlm=False, which would make this redundant - confirm.
        outputs["labels"] = outputs["input_ids"].copy()

        return outputs



    tokenized_dataset = dataset.map(

        tokenize_function,

        batched=True,

        batch_size=1000,

        remove_columns=["text"],

        load_from_cache_file=False,

    )



    # Path of the newest checkpoint-<step> directory in OUTPUT_DIR, or None.
    latest_checkpoint = get_latest_checkpoint(OUTPUT_DIR)

    if latest_checkpoint:

        print(f"Resuming from checkpoint: {latest_checkpoint}")

    else:

        print("No checkpoint found. Starting fresh.")



    # mlm=False: no masked-language-model token masking is applied.
    data_collator = DataCollatorForLanguageModeling(

        tokenizer=tokenizer,

        mlm=False,

    )



    trainer = Trainer(

        model=model,

        train_dataset=tokenized_dataset,

        data_collator=data_collator,

        args=TrainingArguments(

            output_dir=OUTPUT_DIR,

            # 1 sequence x 32 accumulation steps = 32 sequences per optimizer
            # step (per device).
            per_device_train_batch_size=1,

            gradient_accumulation_steps=32,

            warmup_steps=20,

            # 100 optimizer steps x 32 sequences = 3200 sequences in total
            # (on one device).
            max_steps=100,

            learning_rate=1e-4,

            # NOTE(review): fp16 is hard-coded on and bf16 off - confirm this
            # matches the dtype the model was loaded with (dtype=None above).
            fp16=True,

            bf16=False,

            logging_steps=5,

            optim="adamw_8bit",

            weight_decay=0.01,

            lr_scheduler_type="cosine",

            # Checkpoint every 50 steps, keeping at most 2 on disk.
            save_steps=50,

            save_total_limit=2,

            report_to="none",

            seed=3407,

            remove_unused_columns=False,

            dataloader_num_workers=0,

        ),

    )



    # latest_checkpoint is None when no checkpoint directory was found.
    trainer.train(resume_from_checkpoint=latest_checkpoint)



    # Save the model and tokenizer into the same directory as the checkpoints.
    model.save_pretrained(OUTPUT_DIR)

    tokenizer.save_pretrained(OUTPUT_DIR)



    print(f"Done. Saved to: {OUTPUT_DIR}")



# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":

    main()

Is my Persian dataset too small? I have another ~15 GB of semi-clean Persian Wikipedia text (.txt) that could help.

I thought about translating good English datasets using a local 1.8B translator (it works well, but not "wow") and then fine-tuning a small SLM to make the dataset more natural, but you said “English-then-translate loses quality”.

Discussion in the ATmosphere

Loading comments...