aws_output_bucket: s3://panda-us-west-2/experiments/Llama-2-13b-chat-hf-code-github-c4_v1.pp8.dp2.0822.aws
data_dir: null
dist_load_data_barrier: false
train_file: /tmp/data-train-code-c4/code/
dev_file: null
test_file: null
model:
  _target_: models.llama.LlamaForConditionalGeneration.from_pretrained
  vocab_size: 79458
  use_peft: false
  gradient_checkpointing: true
  enable_flash_attention: true
  flash_attention_vanilla_torch: true
  pad_token_id: 2
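# Note: train_file points at local shards under /tmp; finished checkpoints are
# presumably synced to aws_output_bucket above. dev/test files are unset, which
# matches do_eval: false further down.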
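# Note: vocab_size 79458 is well above stock Llama-2's 32000, suggesting an
# extended tokenizer (added code/Chinese tokens). Gradient checkpointing plus
# FlashAttention (flash_attention_vanilla_torch appears to select a plain
# PyTorch attention path) keep the 13B model within GPU memory.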
read_tensor:
  _target_: data.collators.zh_instruct.TextDatasetCombineV2_CodeGithub
  extra_data:
    _target_: data.collators.zh_instruct.C4CombinedDataset
    tokenizer: null
    file_path: /tmp/data-train-code-c4/c4/
    file_num: 50
extended_vocab: null
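# Note: the reader mixes GitHub code data with C4 web text (50 shard files)
# pulled in via extra_data; tokenizer: null here suggests it is injected at
# runtime.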
collator:
  _target_: data.collators.flan.CombineCollator
  max_seq_length: 2048
  tokenizer: ${model_name_or_path}
  decoder_only: true
  padding: longest
  padding_side: right
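# Note: decoder-only causal-LM collation, right-padded to the longest sequence
# in each batch and capped at 2048 tokens (the "seq2k" in exp_name).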
num_workers: 4
prefetch_factor: 2
do_preprocess: false
model_name_or_path: /tmp/Llama-2-13b-chat-hf-code-github-c4/
pretrain: null
exp_name: llama2.13b.Code.Github.C4.combine.v1.0.seq2k.w16.adamw.NA100.0822.aws.ds
exp_notes: null
output_dir: /tmp/${exp_name}
resume: null
do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: checkpoint-*
per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 1
learning_rate: 1.0e-05
gradient_accumulation_steps: 64
weight_decay: 0.01
adam_epsilon: 1.0e-06
adam_betas: (0.9, 0.99)
max_grad_norm: 1.0
num_train_epochs: 1
total_dataset_len: 10000000
max_steps: 0
warmup_proportion: 0
warmup_steps: 0
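# Note: effective global batch size, assuming all 16 ranks act as data-parallel
# workers (consistent with ZeRO stage 1 below):
#   per_gpu_train_batch_size * gradient_accumulation_steps * world_size
#   = 2 * 64 * 16 = 2048 sequences per optimizer step.
# With total_dataset_len = 10,000,000 that gives roughly
# 10,000,000 / 2048 ~= 4883 steps, matching the scheduler's
# total_num_steps of 4882 in ds_cfg below.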
optimizer: null
use_nvlamb: null
bit_training: null
logging_steps: 1
save_best: false
save_steps: 250
eval_steps: 250
ddp_eval: true
no_cuda: false
seed: 42
local_rank: 0
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true
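# Note: despite the fp16 naming, fp16_bfloat16: true (and ds_cfg.bf16 below,
# which is keyed off ${fp16}) means the run actually trains in bfloat16;
# fp16_opt_level: O1 is likely unused on that path.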
prediction_cfg:
  metric: acc
  measure: 1
  best_checkpoint: null
  best_result: null
eval_forward_fn:
  _target_: general_util.evaluator.DiscriminatorForwardFn
post_process: null
fairscale_config:
  _target_: general_util.fsdp_utils.default_initialize
  fp16: ${fp16}
  move_grads_to_cpu: false
  move_params_to_cpu: false
  flatten_parameters: false
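# Note: with ds_cfg defined below, this FairScale FSDP block is presumably
# inert and kept only as a default.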
with_lightseq: false
load_lr_scheduler_states: false
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
      - 0.9
      - 0.96
      eps: ${adam_epsilon}
      weight_decay: ${weight_decay}
  scheduler:
    type: WarmupDecayLR
    params:
      total_num_steps: 4882
      warmup_max_lr: ${learning_rate}
      warmup_num_steps: 0
      warmup_type: linear
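  # Note: WarmupDecayLR decays the LR linearly from warmup_max_lr to 0 over
  # total_num_steps; warmup_num_steps: 0 means no warmup phase. total_num_steps
  # 4882 ~= total_dataset_len / effective global batch (10,000,000 / 2048; see
  # the note above). The betas here (0.9, 0.96) appear to take precedence over
  # the top-level adam_betas when DeepSpeed builds the optimizer.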
  gradient_clipping: ${max_grad_norm}
  bf16:
    enabled: ${fp16}
  zero_optimization:
    stage: 1
    contiguous_gradients: true
    overlap_comm: true
    reduce_scatter: true
    reduce_bucket_size: 500000000.0
    allgather_bucket_size: 500000000.0
    offload_optimizer:
      device: cpu
      pin_memory: true
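  # Note: ZeRO stage 1 shards only the optimizer states across ranks, and
  # offload_optimizer moves them to pinned CPU memory, trading PCIe traffic
  # for GPU memory headroom.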
  steps_per_print: 1
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys: null
n_gpu: 1
device: cuda:0
train_batch_size: 2
eval_batch_size: null
world_size: 16
world_rank: null
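# Note: n_gpu, device, train_batch_size, eval_batch_size, world_size, and
# world_rank above appear to be runtime-resolved, per-process values written
# back by the launcher rather than hand-set options.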