Fine-Tuning an SLM for a Low-Resource Language
Hugging Face Forums [Unofficial]
June 3, 2026
I wrote a new training script with a LoRA adapter of rank 64 and an alpha of 64. It's slower now, but I'm letting it train to see what comes of it. These are the logs:
[transformers] ==((====))== Unsloth - 2x faster free finetuning | Num GPUs used = 1
\\ /| Num examples = 1,109,504 | Num Epochs = 1 | Total steps = 34,672
O^O/ \_/ \ Batch size per device = 4 | Gradient accumulation steps = 8
\ / Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
"-____-" Trainable parameters = 40,370,176 of 636,420,096 (6.34% trained)
0%| | 0/34672 [00:00<?, ?it/s][transformers] `use_return_dict` is deprecated! Use `return_dict` instead!
{'loss': '2.918', 'grad_norm': '1.558', 'learning_rate': '9e-06', 'epoch': '0.0002884'}
{'loss': '2.773', 'grad_norm': '1.016', 'learning_rate': '1.9e-05', 'epoch': '0.0005768'}
{'loss': '2.621', 'grad_norm': '0.7811', 'learning_rate': '2.9e-05', 'epoch': '0.0008653'}
{'loss': '2.546', 'grad_norm': '0.7259', 'learning_rate': '3.9e-05', 'epoch': '0.001154'}
{'loss': '2.322', 'grad_norm': '0.6498', 'learning_rate': '4.9e-05', 'epoch': '0.001442'}
{'loss': '2.378', 'grad_norm': '0.6687', 'learning_rate': '5.9e-05', 'epoch': '0.001731'}
{'loss': '2.293', 'grad_norm': '0.7748', 'learning_rate': '6.9e-05', 'epoch': '0.002019'}
{'loss': '2.309', 'grad_norm': '0.7728', 'learning_rate': '7.9e-05', 'epoch': '0.002307'}
{'loss': '2.214', 'grad_norm': '0.7984', 'learning_rate': '8.9e-05', 'epoch': '0.002596'}
{'loss': '2.203', 'grad_norm': '0.7599', 'learning_rate': '9.9e-05', 'epoch': '0.002884'}
{'loss': '2.227', 'grad_norm': '0.8078', 'learning_rate': '0.0001', 'epoch': '0.003173'}
{'loss': '2.179', 'grad_norm': '0.7982', 'learning_rate': '0.0001', 'epoch': '0.003461'}
{'loss': '2.155', 'grad_norm': '0.7713', 'learning_rate': '0.0001', 'epoch': '0.003749'}
{'loss': '2.102', 'grad_norm': '0.7508', 'learning_rate': '0.0001', 'epoch': '0.004038'}
{'loss': '2.094', 'grad_norm': '0.7127', 'learning_rate': '0.0001', 'epoch': '0.004326'}
{'loss': '2.146', 'grad_norm': '0.7476', 'learning_rate': '0.0001', 'epoch': '0.004615'}
{'loss': '2.163', 'grad_norm': '0.7814', 'learning_rate': '0.0001', 'epoch': '0.004903'}
{'loss': '2.079', 'grad_norm': '0.7486', 'learning_rate': '0.0001', 'epoch': '0.005192'}
{'loss': '2.142', 'grad_norm': '0.7015', 'learning_rate': '0.0001', 'epoch': '0.00548'}
{'loss': '2.083', 'grad_norm': '0.7789', 'learning_rate': '0.0001', 'epoch': '0.005768'}
{'loss': '2.085', 'grad_norm': '0.7249', 'learning_rate': '0.0001', 'epoch': '0.006057'}
{'loss': '2.03', 'grad_norm': '0.7872', 'learning_rate': '0.0001', 'epoch': '0.006345'}
{'loss': '2.085', 'grad_norm': '0.7494', 'learning_rate': '0.0001', 'epoch': '0.006634'}
{'loss': '2.087', 'grad_norm': '0.7177', 'learning_rate': '0.0001', 'epoch': '0.006922'}
{'loss': '2.073', 'grad_norm': '0.6864', 'learning_rate': '0.0001', 'epoch': '0.00721'}
{'loss': '2.007', 'grad_norm': '0.7131', 'learning_rate': '9.999e-05', 'epoch': '0.007499'}
{'loss': '2.074', 'grad_norm': '0.7246', 'learning_rate': '9.999e-05', 'epoch': '0.007787'}
{'loss': '2.011', 'grad_norm': '0.6955', 'learning_rate': '9.999e-05', 'epoch': '0.008076'}
{'loss': '1.99', 'grad_norm': '0.666', 'learning_rate': '9.999e-05', 'epoch': '0.008364'}
{'loss': '2.01', 'grad_norm': '0.655', 'learning_rate': '9.999e-05', 'epoch': '0.008653'}
{'loss': '2.032', 'grad_norm': '0.6227', 'learning_rate': '9.999e-05', 'epoch': '0.008941'}
{'loss': '2.003', 'grad_norm': '0.7089', 'learning_rate': '9.999e-05', 'epoch': '0.009229'}
{'loss': '2.074', 'grad_norm': '0.6487', 'learning_rate': '9.999e-05', 'epoch': '0.009518'}
{'loss': '2.073', 'grad_norm': '0.6404', 'learning_rate': '9.999e-05', 'epoch': '0.009806'}
{'loss': '2.052', 'grad_norm': '0.6166', 'learning_rate': '9.999e-05', 'epoch': '0.01009'}
{'loss': '2.011', 'grad_norm': '0.6447', 'learning_rate': '9.999e-05', 'epoch': '0.01038'}
{'loss': '1.962', 'grad_norm': '0.6773', 'learning_rate': '9.999e-05', 'epoch': '0.01067'}
{'loss': '1.97', 'grad_norm': '0.6069', 'learning_rate': '9.998e-05', 'epoch': '0.01096'}
{'loss': '1.982', 'grad_norm': '0.6222', 'learning_rate': '9.998e-05', 'epoch': '0.01125'}
{'loss': '2.021', 'grad_norm': '0.6198', 'learning_rate': '9.998e-05', 'epoch': '0.01154'}
{'loss': '1.985', 'grad_norm': '0.629', 'learning_rate': '9.998e-05', 'epoch': '0.01183'}
{'loss': '1.959', 'grad_norm': '0.6498', 'learning_rate': '9.998e-05', 'epoch': '0.01211'}
{'loss': '2.03', 'grad_norm': '0.6532', 'learning_rate': '9.998e-05', 'epoch': '0.0124'}
{'loss': '1.933', 'grad_norm': '0.6459', 'learning_rate': '9.998e-05', 'epoch': '0.01269'}
{'loss': '1.985', 'grad_norm': '0.6385', 'learning_rate': '9.997e-05', 'epoch': '0.01298'}
{'loss': '1.995', 'grad_norm': '0.6946', 'learning_rate': '9.997e-05', 'epoch': '0.01327'}
{'loss': '1.957', 'grad_norm': '0.6225', 'learning_rate': '9.997e-05', 'epoch': '0.01356'}
{'loss': '1.969', 'grad_norm': '0.5927', 'learning_rate': '9.997e-05', 'epoch': '0.01384'}
{'loss': '1.993', 'grad_norm': '0.6287', 'learning_rate': '9.997e-05', 'epoch': '0.01413'}
{'loss': '1.964', 'grad_norm': '0.6393', 'learning_rate': '9.997e-05', 'epoch': '0.01442'}
{'loss': '1.954', 'grad_norm': '0.6973', 'learning_rate': '9.997e-05', 'epoch': '0.01471'}
{'loss': '1.981', 'grad_norm': '0.5865', 'learning_rate': '9.996e-05', 'epoch': '0.015'}
{'loss': '1.903', 'grad_norm': '0.6059', 'learning_rate': '9.996e-05', 'epoch': '0.01529'}
{'loss': '1.972', 'grad_norm': '0.6095', 'learning_rate': '9.996e-05', 'epoch': '0.01557'}
{'loss': '1.939', 'grad_norm': '0.6175', 'learning_rate': '9.996e-05', 'epoch': '0.01586'}
{'loss': '1.919', 'grad_norm': '0.5895', 'learning_rate': '9.996e-05', 'epoch': '0.01615'}
{'loss': '1.915', 'grad_norm': '0.5853', 'learning_rate': '9.995e-05', 'epoch': '0.01644'}
{'loss': '1.883', 'grad_norm': '0.6089', 'learning_rate': '9.995e-05', 'epoch': '0.01673'}
{'loss': '1.935', 'grad_norm': '0.6074', 'learning_rate': '9.995e-05', 'epoch': '0.01702'}
{'loss': '1.931', 'grad_norm': '0.6369', 'learning_rate': '9.995e-05', 'epoch': '0.01731'}
{'loss': '1.896', 'grad_norm': '0.6057', 'learning_rate': '9.995e-05', 'epoch': '0.01759'}
{'loss': '1.944', 'grad_norm': '0.5966', 'learning_rate': '9.994e-05', 'epoch': '0.01788'}
{'loss': '1.888', 'grad_norm': '0.6135', 'learning_rate': '9.994e-05', 'epoch': '0.01817'}
{'loss': '2.014', 'grad_norm': '0.6026', 'learning_rate': '9.994e-05', 'epoch': '0.01846'}
{'loss': '1.884', 'grad_norm': '0.6325', 'learning_rate': '9.994e-05', 'epoch': '0.01875'}
{'loss': '1.881', 'grad_norm': '0.6144', 'learning_rate': '9.994e-05', 'epoch': '0.01904'}
{'loss': '1.927', 'grad_norm': '0.616', 'learning_rate': '9.993e-05', 'epoch': '0.01932'}
{'loss': '1.924', 'grad_norm': '0.6267', 'learning_rate': '9.993e-05', 'epoch': '0.01961'}
2%|█▉ | 683/34672 [53:56<42:15:31, 4.48s/it]
And the script:
import os
from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer
# Local directory of the base model; passed as `model_name` to
# FastLanguageModel.from_pretrained.
MODEL_PATH = r"F:\Deep Learning\Model\Qwen 3 0.6B SafeTensor"
# JSON Lines training corpus; passed as `data_files` to load_dataset("json", ...).
DATA_PATH = r"E:\Ava1\Dataset-pars\Wikipedia-Persian\Wikipedia-Persian-textonly.jsonl"
# Directory created at startup; used as the Trainer `output_dir` and as the
# target of the final model/tokenizer save.
OUTPUT_DIR = r"F:\Deep Learning\Outputs\qwen3-0.6b-persian-pretrain-lora3"
# Length (in tokens) of each packed training block; also passed as
# `max_seq_length` when the model is loaded.
MAX_SEQ_LENGTH = 256
# Passed as `load_in_4bit` when the model is loaded.
LOAD_IN_4BIT = True
# NOTE(review): presumably the worker-process count for dataset preprocessing
# (`Dataset.map(num_proc=...)`) -- confirm where it is consumed.
NUM_PROC = 1
def prepare_dataset(tokenizer):
    """Load the JSONL corpus, tokenize it, and pack it into fixed-size blocks.

    The corpus at ``DATA_PATH`` is tokenized without truncation. Each document
    is terminated with the tokenizer's EOS token so that, after packing, the
    model can see where one document ends and the next begins. The token
    stream is then cut into blocks of exactly ``MAX_SEQ_LENGTH`` tokens; the
    incomplete tail of every packing batch is dropped.

    Args:
        tokenizer: Callable tokenizer used to encode the text column. Its
            ``eos_token`` (when set) is appended to every document.

    Returns:
        A torch-formatted dataset holding the tokenizer's output columns plus
        ``labels`` (a copy of ``input_ids``), one row per packed block.
    """
    # Function-scope import: keeps the block self-contained.
    from itertools import chain

    print("๐ Loading dataset...")
    dataset = load_dataset(
        "json",
        data_files=DATA_PATH,
        split="train",
    )

    # Prefer a column named "Text"; otherwise fall back to the first column.
    text_column = "Text" if "Text" in dataset.column_names else dataset.column_names[0]

    # Empty string when the tokenizer defines no EOS token, so the
    # concatenation below becomes a no-op.
    eos_token = tokenizer.eos_token or ""

    print("Fast tokenization (no truncation)...")

    def tokenize_function(examples):
        # Terminate every document with EOS; without it, packed blocks would
        # splice unrelated articles together with no boundary marker.
        texts = [text + eos_token for text in examples[text_column]]
        return tokenizer(
            texts,
            add_special_tokens=True,
            truncation=False,
        )

    tokenized = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
        num_proc=NUM_PROC,
        desc="Tokenizing",
    )

    print("Packing into continuous GPT blocks...")

    def group_texts(examples):
        # chain.from_iterable flattens in linear time; sum(lists, []) copies
        # the accumulator on every addition and is quadratic.
        concatenated = {
            k: list(chain.from_iterable(examples[k])) for k in examples.keys()
        }
        # Round down to a whole number of blocks; the remainder is discarded.
        total_length = len(concatenated["input_ids"])
        total_length = (total_length // MAX_SEQ_LENGTH) * MAX_SEQ_LENGTH
        result = {
            k: [t[i:i + MAX_SEQ_LENGTH] for i in range(0, total_length, MAX_SEQ_LENGTH)]
            for k, t in concatenated.items()
        }
        # Labels mirror the inputs (causal-LM objective).
        result["labels"] = result["input_ids"].copy()
        return result

    lm_dataset = tokenized.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=NUM_PROC,
        desc="Packing blocks",
    )

    lm_dataset = lm_dataset.with_format("torch")
    return lm_dataset
def main():
    """Train LoRA adapters on the packed corpus and save the result.

    Steps: create ``OUTPUT_DIR``; load the model and tokenizer from
    ``MODEL_PATH`` through Unsloth; attach LoRA adapters (rank 64, alpha 64)
    to the q/k/v/o and gate/up/down projection modules; build the packed
    dataset with ``prepare_dataset``; train for one epoch with ``Trainer``;
    then save the model and tokenizer to ``OUTPUT_DIR``.
    """
    # Function-scope import so this block carries its own dependency.
    from unsloth import is_bfloat16_supported

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("Loading model with Unsloth...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_PATH,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=None,
        load_in_4bit=LOAD_IN_4BIT,
    )

    # Reuse EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Applying LoRA adapters...")
    model = FastLanguageModel.get_peft_model(
        model,
        r=64,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=64,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
    )

    train_dataset = prepare_dataset(tokenizer)
    print(f"Packed samples: {len(train_dataset):,}")

    # The model is loaded with dtype=None (auto-detected), so the training
    # precision must follow the hardware too: a hard-coded bf16=True is
    # rejected on GPUs without bfloat16 support. Fall back to fp16 there.
    use_bf16 = is_bfloat16_supported()

    print("Starting continued pretraining...")
    trainer = Trainer(
        model=model,
        train_dataset=train_dataset,
        args=TrainingArguments(
            output_dir=OUTPUT_DIR,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=8,
            warmup_steps=100,
            num_train_epochs=1,
            learning_rate=1e-4,
            bf16=use_bf16,
            fp16=not use_bf16,
            logging_steps=10,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="cosine",
            save_steps=500,
            save_total_limit=2,
            report_to="none",
            seed=3407,
            # Keep every dataset column; the packed rows are passed to the
            # model as-is.
            remove_unused_columns=False,
        ),
    )
    trainer.train()

    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"Done! Model saved to: {OUTPUT_DIR}")
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Discussion in the ATmosphere