Fine-Tuning an SLM for a Low-Resource Language
Thank you very much for your help! My target language is Persian (Iranian), and for the base model I used Qwen 3 0.6B Instruct. I chose this model because I am targeting low-end devices, such as phones and school computers.
My main limitation is hardware. I have an RTX 3050 Mobile (4 GB), 32 GB of RAM, and an Intel i5-11400H.
If you have time, I have some more questions:
I tried to continue pretraining through a LoRA adapter using the Unsloth library in Python with these LoRA adapter settings:
# LoRA adapter settings used for the continued-pretraining attempt
# (the same call appears inside main() in the training script below).
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA rank of the adapter matrices.
    r=8,
    # Modules that receive adapters; by their names these look like the
    # attention projections (q/k/v/o) and the MLP projections
    # (gate/up/down) — confirm against the model architecture.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # LoRA scaling numerator; with r=8 this gives alpha / r = 2.
    lora_alpha=16,
    # No dropout on the adapter path.
    lora_dropout=0,
    # Bias terms are left untouched.
    bias="none",
    # Unsloth's own gradient-checkpointing mode (string value, not a bool).
    use_gradient_checkpointing="unsloth",
    # Seed for the adapter setup; matches the seed passed to TrainingArguments.
    random_state=3407,
    # Rank-stabilized LoRA disabled.
    use_rslora=False,
    # No LoftQ initialization.
    loftq_config=None,
)
I used a .jsonl file containing semi-cleaned raw Wikipedia text in the target language, but the results were not very good and were weaker than the original model. Unfortunately, I do not have the training logs, but as far as I remember, the loss started at around 2.3 and had dropped to about 1.5 by the end of training.
I used an 800 MB cleaned Persian Wikipedia .jsonl file to test whether the model would improve, but the results were negative, at least in my setup.
For fine-tuning, I used two datasets:
Persian-Wiki-QA (a Persian Wikipedia QA dataset)
Alpaca-Persian-Cleaned (a cleaned Persian translation of Alpaca, translated using Google Translate)
This is the training script I used to train the model:
import os
from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
# Local directory of the base model; passed as model_name to
# FastLanguageModel.from_pretrained in main().
MODEL_PATH = r"F:\Deep Learning\Model\Qwen 3 0.6B SafeTensor"
# JSON-lines training corpus; main() reads the "Text" field of each record.
DATA_PATH = r"E:\Ava1\Dataset-pars\Wikipedia-Persian\Wikipedia-Persian-textonly.jsonl"
# Where checkpoints and the final model/tokenizer are written; also scanned
# for "checkpoint-<step>" directories to resume from.
OUTPUT_DIR = r"F:\Deep Learning\Outputs\qwen3-0.6b-persian-pretrain-lora"
# Used both when loading the model and as the tokenizer truncation length.
MAX_SEQ_LENGTH = 256
# Forwarded to from_pretrained(load_in_4bit=...).
LOAD_IN_4BIT = True
def get_latest_checkpoint(output_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Args:
        output_dir: Directory that is scanned for sub-directories whose name
            starts with ``checkpoint-`` and ends in an integer step.

    Returns:
        The path of the checkpoint directory with the largest step number,
        or ``None`` when ``output_dir`` does not exist, is not a directory,
        or contains no valid checkpoint directories.
    """
    # isdir (not exists): a plain file at this path would make listdir raise.
    if not os.path.isdir(output_dir):
        return None

    checkpoints = []
    for name in os.listdir(output_dir):
        path = os.path.join(output_dir, name)
        if not (name.startswith("checkpoint-") and os.path.isdir(path)):
            continue
        try:
            step = int(name.split("-")[-1])
        except ValueError:
            # Non-numeric suffix (e.g. "checkpoint-tmp"): not a real checkpoint.
            continue
        checkpoints.append((step, path))

    if not checkpoints:
        return None

    # Compare on the numeric step so "checkpoint-100" beats "checkpoint-50".
    return max(checkpoints, key=lambda item: item[0])[1]
def main():
    """Continue pretraining the base model on raw text through a LoRA adapter.

    Steps: load the model and tokenizer from ``MODEL_PATH`` with Unsloth,
    attach LoRA adapters, build a tokenized dataset from the ``"Text"`` field
    of ``DATA_PATH`` (at most 2000 examples longer than 50 characters),
    train with the Hugging Face ``Trainer`` (resuming from the newest
    checkpoint in ``OUTPUT_DIR`` when one exists), and save the model and
    tokenizer to ``OUTPUT_DIR``.
    """
    # Imported locally so this function carries its own dependency; the file
    # already imports from unsloth at the top.
    from unsloth import is_bfloat16_supported

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # dtype=None leaves the choice of compute dtype to Unsloth.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_PATH,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=None,
        load_in_4bit=LOAD_IN_4BIT,
    )

    # NOTE(review): if this fallback triggers, pad and EOS share one id, and
    # DataCollatorForLanguageModeling masks pad-token positions in the labels,
    # so the appended EOS would be excluded from the loss. Confirm the
    # tokenizer ships its own pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    dataset = load_dataset("json", data_files=DATA_PATH, split="train")

    def clean_example(example):
        """Normalize the record's "Text" field to a stripped string."""
        text = example.get("Text", "")
        if text is None:
            text = ""
        return {"text": str(text).strip()}

    dataset = dataset.map(
        clean_example,
        remove_columns=dataset.column_names,
        load_from_cache_file=False,
    )
    # Drop empty and very short records (50 characters or fewer).
    dataset = dataset.filter(
        lambda x: len(x["text"]) > 50,
        load_from_cache_file=False,
    )

    # Take the subset BEFORE appending EOS: the map below is per-example and
    # order-preserving, so the result is identical but only the selected rows
    # are processed instead of the whole corpus.
    dataset = dataset.select(range(min(2000, len(dataset))))

    eos_token = tokenizer.eos_token or ""

    def add_eos(example):
        """Append the EOS token so each document has an explicit end."""
        return {"text": example["text"] + eos_token}

    dataset = dataset.map(add_eos, load_from_cache_file=False)

    def tokenize_function(examples):
        """Tokenize a batch of texts, truncated to MAX_SEQ_LENGTH tokens."""
        # No "labels" column here: DataCollatorForLanguageModeling(mlm=False)
        # derives labels from the padded input_ids and overwrites any existing
        # ones, and unpadded variable-length labels would break collation for
        # batch sizes above 1.
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=MAX_SEQ_LENGTH,
            padding=False,
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        batch_size=1000,
        remove_columns=["text"],
        load_from_cache_file=False,
    )

    latest_checkpoint = get_latest_checkpoint(OUTPUT_DIR)
    if latest_checkpoint:
        print(f"Resuming from checkpoint: {latest_checkpoint}")
    else:
        print("No checkpoint found. Starting fresh.")

    # Causal-LM collator: pads each batch and builds the labels.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Match mixed-precision mode to the hardware instead of hard-coding fp16,
    # so it agrees with the dtype Unsloth auto-selects when dtype=None.
    use_bf16 = is_bfloat16_supported()

    trainer = Trainer(
        model=model,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        args=TrainingArguments(
            output_dir=OUTPUT_DIR,
            # Effective batch size is 1 * 32 = 32 sequences per optimizer step.
            per_device_train_batch_size=1,
            gradient_accumulation_steps=32,
            warmup_steps=20,
            max_steps=100,
            learning_rate=1e-4,
            fp16=not use_bf16,
            bf16=use_bf16,
            logging_steps=5,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="cosine",
            save_steps=50,
            save_total_limit=2,
            report_to="none",
            seed=3407,
            remove_unused_columns=False,
            dataloader_num_workers=0,
        ),
    )

    # resume_from_checkpoint=None starts a fresh run.
    trainer.train(resume_from_checkpoint=latest_checkpoint)

    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"Done. Saved to: {OUTPUT_DIR}")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
Is my Persian dataset too small? I have another ~15 GB of semi-clean Persian Wikipedia text (.txt) that could help.
I thought about translating good English datasets using a local 1.8B translation model (it works well, but not “wow”) and then fine-tuning a small SLM to make the dataset more natural. But you said “English-then-translate loses quality.”
Discussion in the ATmosphere