Conversation
# Conflicts: # src/core/training/base_trainer.py # src/experiments/sft_by_complexity_splits/mmlu/llama_3b.py
…istill-branch-b # Conflicts: # src/core/datasets/distillation/distillation_branch_b_cot_dataset.py # src/core/datasets/mmlu/mmlu_cot_response_dataset.py # src/core/training/base_trainer.py # src/experiments/distill/train_branches/train_cleaned_b_full20_prompt1.py # src/experiments/distill/train_branches/train_cleaned_b_new.py
| use_rslora=self.config.lora_training_args.use_rslora, | ||
| ) | ||
| self._model = get_peft_model(model, peft_config) | ||
| if self.config.training_args.gradient_checkpointing: |
There was a problem hiding this comment.
Why do we need it? Isn't it handled automatically by transformers?
There was a problem hiding this comment.
Agree, will remove.
There was a problem hiding this comment.
removed use_cache=False (handled by Trainer), but had to keep enable_input_require_grads(), because training crashes with RE.
| eval_split_dir="data/out/splits/single_token_entropy/mmlu/qwen_3b", | ||
| eval_groups=6, | ||
| per_device_train_batch_size=1, | ||
| effective_train_batch_size=120, |
There was a problem hiding this comment.
Why do you want to change effective batch size?
| per_device_train_batch_size=1, | ||
| effective_train_batch_size=120, | ||
| num_train_epochs=20, | ||
| learning_rate=1e-4, |
There was a problem hiding this comment.
lr doesn't change (it is just set explicitly)
class LoRATrainingArgs(BaseTrainingArgs):
# Sane overrides for LoRA SFT fine-tuning
effective_train_batch_size: int = 64
learning_rate: float = 1e-4
warmup_ratio: float = 0.06
weight_decay: float = 0.0
There was a problem hiding this comment.
In your code it is effective_train_batch_size=120, typo?
There was a problem hiding this comment.
set effective_train_batch_size=64, per_device_train_batch_size=2.
| raise FileNotFoundError(f"Train parquet not found: {train_data_path}") | ||
|
|
||
| eval_question_ids = _collect_eval_question_ids(eval_split_dir, eval_groups) | ||
| train_row_filter = _build_train_row_filter(eval_question_ids) |
There was a problem hiding this comment.
Instead of applying the filter dynamically, shall we preprocess the data and save it to disk? Just like with other MMLU data splits
There was a problem hiding this comment.
Then we could use just CausalDatasetAdapter
| @@ -0,0 +1,193 @@ | |||
| """ | |||
There was a problem hiding this comment.
What is the difference between src/experiments/distill/train_branches/train_cleaned_b_full20_prompt1.py and this script? Why do we need both?
There was a problem hiding this comment.
train_cleaned_b_full20_prompt1.py is the entry point
There was a problem hiding this comment.
Oh, I see. Could you move the main script to core/... then? And keep the entry point in experiments
| lambda row: self.process_row(row).model_dump(), | ||
| num_proc=4, | ||
| remove_columns=ds.column_names, | ||
| load_from_cache_file=False, |
There was a problem hiding this comment.
Reverted. Used in debugging process.
| return ( | ||
| f"Question: {question.strip()}\n\n" | ||
| f"Options:\n{opts}\n\n" | ||
| f"Answer with the option letter first, then provide reasoning inside {THINKING_START}...{THINKING_END} tags." |
There was a problem hiding this comment.
We do not need to prompt the model to answer with reasoning tags, right? Reasoning models should use reasoning by default. Meaning that the prompt should come without the request to use them
There was a problem hiding this comment.
Fair point. The model will learn the answer-first + reasoning format from the training data itself.
Will simplify the prompt to a plain question format without thinking tag instructions.
| from core.prompts.thinking_markers import THINKING_START, THINKING_END | ||
|
|
||
|
|
||
| class DistillationBranchBCoTDataset(CausalDataset[CausalDatasetConfig]): |
There was a problem hiding this comment.
Why do we need it? Could we use MMLUReasoningResponseDataset instead? Just pre-process the data to match the current format of MMLU datasets
There was a problem hiding this comment.
Agreed, we'll use MMLUReasoningResponseDataset directly
- Remove redundant gradient checkpointing code from LoRATrainer - Revert load_from_cache_file=False from abstract base class - Delete DistillationBranchBCoTDataset, use MMLUReasoningResponseDataset - Remove single_token_sys_prompt_with_answer_first_thinking - Add data preprocessing script (prepare_cleaned_b_data.py) - Rewrite training orchestration (branch_b_training.py) - Use default effective_batch_size=64, remove explicit lr=1e-4 - Delete FilteredCausalDatasetAdapter and train_cleaned_b_new.py
No description provided.