---
name: nvfp4a16
# FP4 weights, bf16 activations — skips activation quant for much lower KLD
scheme: NVFP4A16
engine: llmcompressor

# A16 doesn't need activation calibration; 128 short samples is enough for
# weight stats.
calibration:
  dataset: neuralmagic/calibration
  config: LLM
  split: train
  num_samples: 128
  max_seq_length: 2048

# Modules excluded from quantization (kept in bf16).
ignore:
  - lm_head
  - "re:.*visual.*"
  # entire SSM block (in_proj_qkvz, in_proj_ba, conv1d, out_proj) kept in bf16
  - "re:.*linear_attn.*"
  - "re:.*mlp.gate$"
  - "re:.*mlp.shared_expert_gate$"
  - "re:.*mtp.*"

export:
  save_compressed: true