Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ wheels/
/data/
backup/
base_models/
unsloth_compiled_cache/
TODO.md
25 changes: 13 additions & 12 deletions config/codefinetuner_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,60 +2,61 @@
# Shared across all stages
globals: &globals
workspace_path: null # null: defaults to current working directory (CWD)
model_name: "Qwen/Qwen2.5-0.5B"
model_name: "Qwen/Qwen2.5-Coder-0.5B"
fim_prefix_token: "<|fim_prefix|>"
fim_middle_token: "<|fim_middle|>"
fim_suffix_token: "<|fim_suffix|>"
fim_pad_token: "<|fim_pad|>"
eos_token: "<|endoftext|>"
label_pad_token_id: -100
max_token_sequence_length: 512
data_language: "c"
data_extensions: [".c", ".h"]

# Preprocess stage
preprocess:
<<: *globals
split_mode: "manual"
split_mode: "auto"
train_ratio: 0.8
eval_ratio: 0.1
test_ratio: 0.1
max_token_sequence_length: 1024
max_code_blocks_ast_depth: 2
min_middle_tokens_length: 20
max_middle_tokens_length: 200
fim_examples_per_subblock_ratio: 1.0
tokenizer_batch_size: 32
raw_data_path: "data/data_embedded" # null: defaults to <workspace_path>/data
raw_data_path: null # null: defaults to <workspace_path>/data
tree_sitter_parser_path: null # null: uses tree_sitter_language_pack parser
tree_sitter_definitions_path: null # null: uses package internal tree sitter definitions
rng_seed: 0

# Finetune stage
finetune:
<<: *globals
use_unsloth: True
model_attn_implementation: "sdpa"
lora_r: 32
lora_alpha: 64
lora_dropout: 0.1
lora_dropout: 0.0
lora_bias: "none"
lora_target_modules: ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]
trainer_resume_from_checkpoint: null # null->fresh start, "last"->take last checkpoint
trainer_clear_checkpoint_dir: false
trainer_num_train_epochs: 1
trainer_per_device_train_batch_size: 2
trainer_per_device_eval_batch_size: 2
trainer_gradient_accumulation_steps: 32
trainer_learning_rate: 2e-5
trainer_weight_decay: 0.1
trainer_per_device_train_batch_size: 4
trainer_per_device_eval_batch_size: 4
trainer_gradient_accumulation_steps: 16
trainer_learning_rate: 2e-4
trainer_weight_decay: 0.01
trainer_max_grad_norm: 1.0
trainer_lr_scheduler_type: "cosine"
trainer_warmup_steps: 50
trainer_gradient_checkpointing: true
trainer_logging_steps: 10
trainer_eval_strategy: "steps"
trainer_eval_steps: 100
trainer_eval_steps: 50
trainer_save_strategy: "steps"
trainer_save_steps: 100
trainer_save_steps: 50
trainer_logging_strategy: "steps"
dataset_shuffle_buffer_size: 50000
dataset_shuffle_seed: 0
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"omegaconf>=2.3.0",
"gguf>=0.18.0",
"sentencepiece>=0.2.1",
"unsloth>=2026.4.8 ; sys_platform == 'linux' or sys_platform == 'win32'",
]

[dependency-groups]
Expand Down
37 changes: 37 additions & 0 deletions scripts/analyze_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import json

def analyze_token_lengths(file_path, threshold):
    """Print and return token-length statistics for a JSONL dataset.

    Every non-blank line of ``file_path`` is parsed as a JSON object. The
    length of an example is the number of entries in its ``"input_ids"``
    list; an object without that key counts as length 0.

    Args:
        file_path: Path of the JSONL file to scan.
        threshold: Token count; examples whose length is less than or equal
            to this value are counted as "under/at threshold".

    Returns:
        A dict with the keys ``"total_count"``, ``"max_len"`` and
        ``"under_threshold_count"``, or ``None`` when the file does not
        exist (an error message is printed in that case).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    max_len = 0
    total_count = 0
    under_threshold_count = 0

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines are not examples; skip them without counting.
                if not line.strip():
                    continue

                total_count += 1
                data = json.loads(line)
                current_len = len(data.get("input_ids", []))

                max_len = max(max_len, current_len)
                if current_len <= threshold:
                    under_threshold_count += 1
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None

    print(f"Total examples: {total_count}")
    print(f"Maximum token length: {max_len}")
    print(f"Examples under/at threshold ({threshold}): {under_threshold_count} out of {total_count}")

    # Hand the numbers back as well so callers can use them programmatically.
    return {
        "total_count": total_count,
        "max_len": max_len,
        "under_threshold_count": under_threshold_count,
    }

def parse_cli_args(argv=None):
    """Parse the command-line options of the dataset analysis script.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``file_path`` (str) and
        ``threshold`` (int). The defaults equal the values that were
        previously hard-coded in the script.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Report token-length statistics for a tokenized JSONL dataset."
    )
    parser.add_argument(
        "file_path",
        nargs="?",
        default="outputs/preprocess/results/datasets/train_dataset.jsonl",
        help="JSONL dataset to analyze.",
    )
    parser.add_argument(
        "--threshold",
        type=int,
        default=512,
        help="Token count used for the under/at-threshold tally.",
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    # Running without arguments behaves exactly like the former fixed configuration.
    args = parse_cli_args()
    analyze_token_lengths(args.file_path, args.threshold)
6 changes: 5 additions & 1 deletion src/codefinetuner/finetune/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@

@dataclass
class Config:
# --- Mandatory Parameters ---
# --- Mandatory Parameters ---
model_name: str = MISSING
fim_pad_token: str = MISSING
label_pad_token_id: int = MISSING
max_token_sequence_length: int = MISSING

# --- Unsloth Settings ---
use_unsloth: bool = False

# --- Model Settings ---
model_attn_implementation: str = "sdpa" # sdpa = built-in PyTorch implementation of scaled dot product attention, improves performance and memory efficiency
Expand Down
47 changes: 45 additions & 2 deletions src/codefinetuner/finetune/model.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

import logging

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
Expand All @@ -10,7 +9,14 @@
logger = logging.getLogger(__name__)


def load_and_configure_lora_model(config: Config) -> AutoModelForCausalLM:
def load_and_configure_lora_model(config: Config):
    """Load the LoRA model through the backend selected by ``config.use_unsloth``.

    Delegates to ``_load_unsloth_model`` when ``config.use_unsloth`` is truthy
    and to ``_load_hf_model`` otherwise, returning whatever the chosen loader
    returns.
    """
    loader = _load_unsloth_model if config.use_unsloth else _load_hf_model
    return loader(config)


def _load_hf_model(config: Config) -> AutoModelForCausalLM:
if config.device == "cuda":
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
Expand Down Expand Up @@ -57,3 +63,40 @@ def load_and_configure_lora_model(config: Config) -> AutoModelForCausalLM:
)
lora_model = get_peft_model(model, lora_config)
return lora_model


def _load_unsloth_model(config: Config):
    """Load the base model through Unsloth in 4-bit and attach LoRA adapters.

    Args:
        config: Finetune configuration. Reads ``device``, ``model_name``,
            ``max_token_sequence_length``, ``model_dtype``, the ``lora_*``
            settings and ``trainer_gradient_checkpointing``.

    Returns:
        The model returned by ``FastLanguageModel.get_peft_model``.

    Raises:
        RuntimeError: If ``config.device`` is not ``"cuda"``.
        ImportError: If the ``unsloth`` package cannot be imported.
    """
    # Check the device before touching Unsloth: on a non-CUDA host the import
    # below is pointless, and skipping it gives the caller the clearer error.
    if config.device != "cuda":
        raise RuntimeError("Unsloth requires CUDA. Either set use_unsloth=false or run on a CUDA device.")

    # Import Unsloth lazily here so its import-time patches only affect the CUDA/Unsloth
    # finetuning path. Importing it at module level can monkey-patch transformers/peft
    # process-wide, change behavior for non-Unsloth code paths, and add unnecessary import overhead.
    try:
        from unsloth import FastLanguageModel
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError("Unsloth is not installed. Run: uv add unsloth or pip install unsloth") from err

    # The second returned value (the tokenizer, per the Unsloth docs) is not needed here.
    model, _ = FastLanguageModel.from_pretrained(
        model_name=config.model_name,
        max_seq_length=config.max_token_sequence_length,
        dtype=config.model_dtype,
        load_in_4bit=True,
    )

    lora_model = FastLanguageModel.get_peft_model(
        model,
        lora_alpha=config.lora_alpha,
        lora_dropout=config.lora_dropout,
        r=config.lora_r,
        bias=config.lora_bias,
        target_modules=config.lora_target_modules,
        # Request Unsloth's own checkpointing mode when checkpointing is enabled.
        use_gradient_checkpointing="unsloth" if config.trainer_gradient_checkpointing else False,
    )

    logger.info(
        f"[Unsloth] Model: {config.model_name} | Device: {config.device} | "
        f"Dtype: {config.model_dtype} | max_seq_length: {config.max_token_sequence_length}"
    )
    return lora_model
3 changes: 1 addition & 2 deletions src/codefinetuner/finetune/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import json
import logging
import shutil
from pathlib import Path
from typing import Dict, List

import matplotlib.pyplot as plt
Expand Down Expand Up @@ -88,7 +87,7 @@ def train_lora_model(
save_steps=config.trainer_save_steps,
bf16=trainer_bf16,
fp16=trainer_fp16,
gradient_checkpointing=config.trainer_gradient_checkpointing
gradient_checkpointing=(config.trainer_gradient_checkpointing and not config.use_unsloth),
)

data_collator = FIMDataCollator(
Expand Down
2 changes: 1 addition & 1 deletion src/codefinetuner/preprocess/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ class Config:
eos_token: str = MISSING
data_language: str = MISSING
data_extensions: list[str] = MISSING
max_token_sequence_length: int = MISSING # used with bytes_per_token_ratio to convert bytes to tokens, final token count is thus not exact

# --- Preprocess Local Parameters ---
split_mode: str = "auto"
train_ratio: float = 0.8
eval_ratio: float = 0.1
test_ratio: float = 0.1
max_token_sequence_length: int = 1024 # used with bytes_per_token_ratio to convert bytes to tokens, final token count is thus not exact
max_code_blocks_ast_depth: int = 2 # depth 1 is root, 2 includes child nodes (e.g. functions)
min_middle_tokens_length: int = 20 # used with estimated bytes_per_token_ratio to convert bytes to tokens, final token count is thus not exact
max_middle_tokens_length: int = 200 # used with estimated bytes_per_token_ratio to convert bytes to tokens, final token count is thus not exact
Expand Down
43 changes: 35 additions & 8 deletions src/codefinetuner/preprocess/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,9 @@ def _extract_subblock_ranges(config: Config, node: ts.Node, base_offset: int) ->

def _filter_subblocks(subblock_ranges: list[Tuple[int, int]], max_bytes_per_subblock: int) -> list[Tuple[int, int]]:
"""
Discard subblocks that have a larger end index than max_bytes_per_subblock
Discard subblocks that have a larger end index than max_bytes_per_subblock.
"""
subblock_ranges = sorted(subblock_ranges, key=lambda x: x[1]) # sort ranges by end index
subblock_ranges = sorted(subblock_ranges, key=lambda x: x[1]) # sort ranges by end index
i = 0
while i < len(subblock_ranges) and subblock_ranges[i][1] <= max_bytes_per_subblock:
i += 1
Expand All @@ -79,6 +79,8 @@ def _generate_fim_examples_from_code_block(config: Config, code_utf8: bytes, sub
eos_token_utf8 = config.eos_token.encode('utf8')

num_of_subblocks = len(subblock_ranges)
if num_of_subblocks == 0:
return []
num_of_fim_examples = max(1, int(num_of_subblocks * config.fim_examples_per_subblock_ratio))
unique_random_subblock_indices = config.rng.choice(len(subblock_ranges), size=num_of_fim_examples, replace=False)

Expand Down Expand Up @@ -126,13 +128,33 @@ def create_fim_examples(config: Config, code_blocks_iter: Iterator[Tuple[bytes,

subblock_ranges = _filter_subblocks(subblock_ranges, max_bytes_per_subblock)

code_block_utf8 = code_block_utf8[:max_bytes_per_subblock] # Truncate the code block if it is larger than max_bytes_per_subblock; we only consider subblocks inside this range.

fim_examples = _generate_fim_examples_from_code_block(config, code_block_utf8, subblock_ranges, bytes_per_token_ratio)
for fim_example in fim_examples:
yield fim_example


def _filter_overlong_tokenized_examples(tokenized_batch: Mapping[str, List[List[int]]], max_length: int) -> Mapping[str, List[List[int]]]:
    """
    Remove examples whose actual token count exceeds max_length.

    The length of an example is taken from its "input_ids" sequence; the
    positions that pass the check are then kept in every column of the batch.
    Returns a new dict with the filtered columns; the input batch is not modified.
    """
    keep_indices = [
        idx
        for idx, token_sequence in enumerate(tokenized_batch["input_ids"])
        if len(token_sequence) <= max_length
    ]

    # Apply the same selection to every column so the rows stay aligned.
    return {
        key: [values[idx] for idx in keep_indices]
        for key, values in tokenized_batch.items()
    }


def _save_tokenized_batch_as_jsonl(file_path: Path, batch: Mapping[str, List[List[int]]]) -> None:
with open(file_path, 'a', encoding='utf-8') as f:
batch_size = len(batch['input_ids'])
Expand All @@ -154,7 +176,6 @@ def tokenize_and_save_fim_examples(config: Config, file_path: Path, fim_examples
batch = []
for fim_example in fim_examples_iter:
batch.append(fim_example.decode('utf-8'))
examples_counter += 1
if (len(batch) == config.tokenizer_batch_size):
tokenized_batch = tokenizer(
batch,
Expand All @@ -163,7 +184,10 @@ def tokenize_and_save_fim_examples(config: Config, file_path: Path, fim_examples
return_attention_mask=True
)
tokenized_batch["labels"] = tokenized_batch["input_ids"]
_save_tokenized_batch_as_jsonl(file_path, tokenized_batch)
filtered_tokenized_batch = _filter_overlong_tokenized_examples(tokenized_batch, config.max_token_sequence_length)
if filtered_tokenized_batch["input_ids"]: # dont save if no examples left after filtering
_save_tokenized_batch_as_jsonl(file_path, filtered_tokenized_batch)
examples_counter += len(filtered_tokenized_batch["input_ids"])
batch = []

# last batch is smaller than config.tokenizer_batch_size, the "rest"
Expand All @@ -175,6 +199,9 @@ def tokenize_and_save_fim_examples(config: Config, file_path: Path, fim_examples
return_attention_mask=True
)
tokenized_batch["labels"] = tokenized_batch["input_ids"]
_save_tokenized_batch_as_jsonl(file_path, tokenized_batch)
filtered_tokenized_batch = _filter_overlong_tokenized_examples(tokenized_batch, config.max_token_sequence_length)
if filtered_tokenized_batch["input_ids"]: # dont save if no examples left after filtering
_save_tokenized_batch_as_jsonl(file_path, filtered_tokenized_batch)
examples_counter += len(filtered_tokenized_batch["input_ids"])

logger.info(f"Processed and saved {examples_counter} FIM examples to {file_path}")
logger.info(f"Processed and saved {examples_counter} FIM examples to {file_path}")
2 changes: 1 addition & 1 deletion tests/config/codefinetuner_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ globals: &globals
fim_pad_token: "<|fim_pad|>"
eos_token: "<|endoftext|>"
label_pad_token_id: -100
max_token_sequence_length: 1024
data_language: "c"
data_extensions: [".c", ".h"]

preprocess:
<<: *globals
split_mode: "manual"
max_token_sequence_length: 1024
min_middle_tokens_length: 1
max_middle_tokens_length: 500
fim_examples_per_subblock_ratio: 1.0
Expand Down
1 change: 1 addition & 0 deletions tests/finetune/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def test_load_from_yaml_ignores_unknown_keys(tmp_path):
fim_pad_token: "<|fim_pad|>"
eos_token: "<|endoftext|>"
label_pad_token_id: -100
max_token_sequence_length: 1024
data_language: "c"
data_extensions: [".c", ".h"]
unknown_key_that_does_not_exist: 999
Expand Down
Loading
Loading