cuolm · cuolm · May 6, 2026 · May 1, 2026 · May 1, 2026 · May 4, 2026
diff --git a/.gitignore b/.gitignore
@@ -17,4 +17,5 @@ wheels/
 /data/ 
 backup/
 base_models/
+unsloth_compiled_cache/
 TODO.md
diff --git a/config/codefinetuner_config.yaml b/config/codefinetuner_config.yaml
@@ -2,60 +2,61 @@
 # Shared across all stages 
 globals: &globals
   workspace_path: null  # null: defaults to current working directory (CWD)
-  model_name: "Qwen/Qwen2.5-0.5B" 
+  model_name: "Qwen/Qwen2.5-Coder-0.5B" 
   fim_prefix_token: "<|fim_prefix|>"
   fim_middle_token: "<|fim_middle|>"
   fim_suffix_token: "<|fim_suffix|>"
   fim_pad_token: "<|fim_pad|>"
   eos_token: "<|endoftext|>"
   label_pad_token_id: -100
+  max_token_sequence_length: 512
   data_language: "c"
   data_extensions: [".c", ".h"]
 
 # Preprocess stage
 preprocess:
   <<: *globals
-  split_mode: "manual"
+  split_mode: "auto"
   train_ratio: 0.8
   eval_ratio: 0.1
   test_ratio: 0.1
-  max_token_sequence_length: 1024
   max_code_blocks_ast_depth: 2
   min_middle_tokens_length: 20
   max_middle_tokens_length: 200
   fim_examples_per_subblock_ratio: 1.0
   tokenizer_batch_size: 32
-  raw_data_path: "data/data_embedded"  # null: defaults to <workspace_path>/data
+  raw_data_path: null  # null: defaults to <workspace_path>/data
   tree_sitter_parser_path: null  # null: uses tree_sitter_language_pack parser
   tree_sitter_definitions_path: null  # null: uses package internal tree sitter definitions 
   rng_seed: 0
 
 # Finetune stage 
 finetune:
   <<: *globals
+  use_unsloth: True
   model_attn_implementation: "sdpa"
   lora_r: 32
   lora_alpha: 64
-  lora_dropout: 0.1
+  lora_dropout: 0.0
   lora_bias: "none"
   lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]
   trainer_resume_from_checkpoint: null  # null->fresh start, "last"->take last checkpoint
   trainer_clear_checkpoint_dir: false
   trainer_num_train_epochs: 1
-  trainer_per_device_train_batch_size: 2
-  trainer_per_device_eval_batch_size: 2
-  trainer_gradient_accumulation_steps: 32
-  trainer_learning_rate: 2e-5
-  trainer_weight_decay: 0.1
+  trainer_per_device_train_batch_size: 4 
+  trainer_per_device_eval_batch_size: 4 
+  trainer_gradient_accumulation_steps: 16 
+  trainer_learning_rate: 2e-4
+  trainer_weight_decay: 0.01
   trainer_max_grad_norm: 1.0
   trainer_lr_scheduler_type: "cosine"
   trainer_warmup_steps: 50
   trainer_gradient_checkpointing: true
   trainer_logging_steps: 10
   trainer_eval_strategy: "steps"
-  trainer_eval_steps: 100
+  trainer_eval_steps: 50
   trainer_save_strategy: "steps"
-  trainer_save_steps: 100
+  trainer_save_steps: 50
   trainer_logging_strategy: "steps"
   dataset_shuffle_buffer_size: 50000
   dataset_shuffle_seed: 0

diff --git a/pyproject.toml b/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
     "omegaconf>=2.3.0",
     "gguf>=0.18.0",
     "sentencepiece>=0.2.1",
+    "unsloth>=2026.4.8 ; sys_platform == 'linux' or sys_platform == 'win32'",
 ]
 
 [dependency-groups]

diff --git a/scripts/analyze_dataset.py b/scripts/analyze_dataset.py
@@ -0,0 +1,37 @@
+import json
+
+def analyze_token_lengths(file_path, threshold):
+    """
+    Print token-length statistics for a JSONL dataset.
+
+    Every non-blank line is parsed as JSON and the length of its "input_ids"
+    list is measured (0 when the key is absent). Reports the example count,
+    the longest sequence, and how many sequences are <= threshold. If
+    file_path does not exist, an error is printed and the function returns.
+    """
+    longest = 0
+    n_examples = 0
+    n_within_threshold = 0
+
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            # Stream the file so the whole dataset never has to sit in memory.
+            for record in (json.loads(raw) for raw in f if raw.strip()):
+                n_tokens = len(record.get("input_ids", []))
+                n_examples += 1
+                longest = max(longest, n_tokens)
+                n_within_threshold += n_tokens <= threshold
+    except FileNotFoundError:
+        print(f"Error: File '{file_path}' not found.")
+        return
+
+    print(f"Total examples: {n_examples}")
+    print(f"Maximum token length: {longest}")
+    print(f"Examples under/at threshold ({threshold}): {n_within_threshold} out of {n_examples}")
+
+if __name__ == "__main__":
+    # Hard-coded inputs: the JSONL dataset to inspect and the token-length cutoff to report against.
+    file_path = "outputs/preprocess/results/datasets/train_dataset.jsonl"
+    threshold = 512
+
+    analyze_token_lengths(file_path, threshold)
diff --git a/src/codefinetuner/finetune/config.py b/src/codefinetuner/finetune/config.py
@@ -13,10 +13,14 @@
 
 @dataclass
 class Config:
-# --- Mandatory Parameters ---
+    # --- Mandatory Parameters ---
     model_name: str = MISSING
     fim_pad_token: str = MISSING
     label_pad_token_id: int = MISSING
+    max_token_sequence_length: int = MISSING 
+
+    # --- Unsloth Settings ---
+    use_unsloth: bool = False
 
     # --- Model Settings ---
     model_attn_implementation: str = "sdpa"  # sdpa = built-in PyTorch implementation of scaled dot product attention, imporves performance and memory efficiency

diff --git a/src/codefinetuner/finetune/model.py b/src/codefinetuner/finetune/model.py
@@ -1,4 +1,3 @@
-
 import logging
 
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
@@ -10,7 +9,14 @@
 logger = logging.getLogger(__name__)
 
 
-def load_and_configure_lora_model(config: Config) -> AutoModelForCausalLM:
+def load_and_configure_lora_model(config: Config):
+    """Return a LoRA-wrapped model, loaded via Unsloth when config.use_unsloth is set, else via the HF loader."""
+    # Dispatch only; each backend loader builds its own model and adapters.
+    loader = _load_unsloth_model if config.use_unsloth else _load_hf_model
+    return loader(config)
+
+
+def _load_hf_model(config: Config) -> AutoModelForCausalLM:
     if config.device == "cuda":
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -57,3 +63,40 @@ def load_and_configure_lora_model(config: Config) -> AutoModelForCausalLM:
     )
     lora_model = get_peft_model(model, lora_config)
     return lora_model
+
+
+def _load_unsloth_model(config: Config):
+    """Load config.model_name in 4-bit via Unsloth and attach LoRA adapters; raises RuntimeError off-CUDA, ImportError if Unsloth is missing."""
+    if config.device != "cuda":
+        raise RuntimeError("Unsloth requires CUDA. Either set use_unsloth=false or run on a CUDA device.")
+
+    # Import Unsloth lazily, and only after the CUDA check, so its import-time patches only affect
+    # the CUDA/Unsloth finetuning path. Importing it at module level can monkey-patch transformers/peft
+    # process-wide, change behavior for non-Unsloth code paths, and add unnecessary import overhead.
+    try:
+        from unsloth import FastLanguageModel
+    except ImportError as e:
+        raise ImportError("Unsloth is not installed. Run: uv add unsloth or pip install unsloth") from e
+
+    model, _ = FastLanguageModel.from_pretrained(
+        model_name=config.model_name,
+        max_seq_length=config.max_token_sequence_length,
+        dtype=config.model_dtype,
+        load_in_4bit=True,
+    )
+
+    lora_model = FastLanguageModel.get_peft_model(
+        model,
+        lora_alpha=config.lora_alpha,
+        lora_dropout=config.lora_dropout,
+        r=config.lora_r,
+        bias=config.lora_bias,
+        target_modules=config.lora_target_modules,
+        use_gradient_checkpointing="unsloth" if config.trainer_gradient_checkpointing else False,
+    )
+
+    logger.info(
+        f"[Unsloth] Model: {config.model_name} | Device: {config.device} | "
+        f"Dtype: {config.model_dtype} | max_seq_length: {config.max_token_sequence_length}"
+    )
+    return lora_model
diff --git a/src/codefinetuner/finetune/train.py b/src/codefinetuner/finetune/train.py
@@ -2,7 +2,6 @@
 import json
 import logging
 import shutil
-from pathlib import Path
 from typing import Dict, List
 
 import matplotlib.pyplot as plt
@@ -88,7 +87,7 @@ def train_lora_model(
         save_steps=config.trainer_save_steps,
         bf16=trainer_bf16,  
         fp16=trainer_fp16,  
-        gradient_checkpointing=config.trainer_gradient_checkpointing
+        gradient_checkpointing=(config.trainer_gradient_checkpointing and not config.use_unsloth),
     )
 
     data_collator = FIMDataCollator(

diff --git a/src/codefinetuner/preprocess/config.py b/src/codefinetuner/preprocess/config.py
@@ -24,13 +24,13 @@ class Config:
     eos_token: str = MISSING
     data_language: str = MISSING
     data_extensions: list[str] = MISSING
+    max_token_sequence_length: int = MISSING  # used with bytes_per_token_ratio to convert bytes to tokens (an estimate); examples whose actual token count exceeds it are dropped after tokenization
 
     # --- Preprocess Local Parameters ---
     split_mode: str = "auto"
     train_ratio: float = 0.8
     eval_ratio: float = 0.1
     test_ratio: float = 0.1
-    max_token_sequence_length: int = 1024  # used with bytes_per_token_ratio to convert bytes to tokens, final token count is thus not exact
     max_code_blocks_ast_depth: int = 2  # depth 1 is root, 2 includes child nodes (e.g. functions)
     min_middle_tokens_length: int = 20  # used with estimated bytes_per_token_ratio to convert bytes to tokens, final token count is thus not exact 
     max_middle_tokens_length: int = 200  # used with estimated bytes_per_token_ratio to convert bytes to tokens, final token count is thus not exact

diff --git a/src/codefinetuner/preprocess/process.py b/src/codefinetuner/preprocess/process.py
@@ -63,9 +63,9 @@ def _extract_subblock_ranges(config: Config, node: ts.Node, base_offset: int) ->
 
 def _filter_subblocks(subblock_ranges: list[Tuple[int, int]], max_bytes_per_subblock: int) -> list[Tuple[int, int]]:
     """
-     Discard subblocks that have a larger end index than max_bytes_per_subblock
+     Discard subblocks that have a larger end index than max_bytes_per_subblock.
     """
-    subblock_ranges = sorted(subblock_ranges, key=lambda x: x[1]) # sort ranges by end index
+    subblock_ranges = sorted(subblock_ranges, key=lambda x: x[1])  # sort ranges by end index
     i = 0
     while i < len(subblock_ranges) and subblock_ranges[i][1] <= max_bytes_per_subblock:
         i += 1
@@ -79,6 +79,8 @@ def _generate_fim_examples_from_code_block(config: Config, code_utf8: bytes, sub
     eos_token_utf8 = config.eos_token.encode('utf8')
 
     num_of_subblocks = len(subblock_ranges)
+    if num_of_subblocks == 0:
+        return []
     num_of_fim_examples = max(1, int(num_of_subblocks * config.fim_examples_per_subblock_ratio))  
     unique_random_subblock_indices = config.rng.choice(len(subblock_ranges), size=num_of_fim_examples, replace=False)
 
@@ -126,13 +128,33 @@ def create_fim_examples(config: Config, code_blocks_iter: Iterator[Tuple[bytes,
 
         subblock_ranges = _filter_subblocks(subblock_ranges, max_bytes_per_subblock) 
 
-        code_block_utf8 = code_block_utf8[:max_bytes_per_subblock]  # Trunctate code block code if it is larger than bytes_per_code_block, we only consider subblocks inside this range.
-
         fim_examples = _generate_fim_examples_from_code_block(config, code_block_utf8, subblock_ranges, bytes_per_token_ratio)
         for fim_example in fim_examples:
             yield fim_example
 
 
+def _filter_overlong_tokenized_examples(tokenized_batch: Mapping[str, List[List[int]]], max_length: int) -> Mapping[str, List[List[int]]]:
+    """
+    Drop every example whose tokenized length is greater than max_length.
+
+    An example is kept when len(input_ids) <= max_length. The surviving row
+    positions are applied to every key of the batch, so all columns stay
+    aligned with each other.
+
+    Returns a new dict; the input batch is left unmodified.
+    """
+    kept_positions = [
+        position
+        for position, token_ids in enumerate(tokenized_batch["input_ids"])
+        if len(token_ids) <= max_length
+    ]
+
+    return {
+        column: [rows[position] for position in kept_positions]
+        for column, rows in tokenized_batch.items()
+    }
+
+
 def _save_tokenized_batch_as_jsonl(file_path: Path, batch: Mapping[str, List[List[int]]]) -> None:
     with open(file_path, 'a', encoding='utf-8') as f:
         batch_size = len(batch['input_ids'])
@@ -154,7 +176,6 @@ def tokenize_and_save_fim_examples(config: Config, file_path: Path, fim_examples
     batch = []
     for fim_example in fim_examples_iter:
         batch.append(fim_example.decode('utf-8'))
-        examples_counter += 1
         if (len(batch) == config.tokenizer_batch_size):
             tokenized_batch = tokenizer(
                 batch,
@@ -163,7 +184,10 @@ def tokenize_and_save_fim_examples(config: Config, file_path: Path, fim_examples
                 return_attention_mask=True
             )
             tokenized_batch["labels"] = tokenized_batch["input_ids"]
-            _save_tokenized_batch_as_jsonl(file_path, tokenized_batch)
+            filtered_tokenized_batch = _filter_overlong_tokenized_examples(tokenized_batch, config.max_token_sequence_length)
+            if filtered_tokenized_batch["input_ids"]:  # don't save if no examples are left after filtering
+                _save_tokenized_batch_as_jsonl(file_path, filtered_tokenized_batch)
+            examples_counter += len(filtered_tokenized_batch["input_ids"])
             batch = []
 
     # last batch is smaller than config.tokenizer_batch_size, the "rest"
@@ -175,6 +199,9 @@ def tokenize_and_save_fim_examples(config: Config, file_path: Path, fim_examples
             return_attention_mask=True
         )
         tokenized_batch["labels"] = tokenized_batch["input_ids"]
-        _save_tokenized_batch_as_jsonl(file_path, tokenized_batch)
+        filtered_tokenized_batch = _filter_overlong_tokenized_examples(tokenized_batch, config.max_token_sequence_length)
+        if filtered_tokenized_batch["input_ids"]:  # don't save if no examples are left after filtering
+            _save_tokenized_batch_as_jsonl(file_path, filtered_tokenized_batch)
+        examples_counter += len(filtered_tokenized_batch["input_ids"])
 
-    logger.info(f"Processed and saved {examples_counter} FIM examples to {file_path}")
+    logger.info(f"Processed and saved {examples_counter} FIM examples to {file_path}")
diff --git a/tests/config/codefinetuner_config.yaml b/tests/config/codefinetuner_config.yaml
@@ -7,13 +7,13 @@ globals: &globals
   fim_pad_token: "<|fim_pad|>"
   eos_token: "<|endoftext|>"
   label_pad_token_id: -100
+  max_token_sequence_length: 1024
   data_language: "c"
   data_extensions: [".c", ".h"]
 
 preprocess:
   <<: *globals
   split_mode: "manual"
-  max_token_sequence_length: 1024
   min_middle_tokens_length: 1
   max_middle_tokens_length: 500
   fim_examples_per_subblock_ratio: 1.0

diff --git a/tests/finetune/test_config.py b/tests/finetune/test_config.py
@@ -50,6 +50,7 @@ def test_load_from_yaml_ignores_unknown_keys(tmp_path):
           fim_pad_token: "<|fim_pad|>"
           eos_token: "<|endoftext|>"
           label_pad_token_id: -100
+          max_token_sequence_length: 1024
           data_language: "c"
           data_extensions: [".c", ".h"]
           unknown_key_that_does_not_exist: 999
-Original file line number
+Diff line change
@@ Expand Up / @@ -17,4 +17,5 @@ wheels/ @@
     /data/
     backup/
     base_models/
+    unsloth_compiled_cache/
     TODO.md