name: fp8_dynamic
scheme: FP8_DYNAMIC  # FP8 weights, FP8 dynamic per-token activations; data-free, vLLM-native
engine: llmcompressor

# FP8_DYNAMIC needs no calibration data: activation scales are recomputed per
# input at inference time. quantise.py still expects a dataset entry, so this is
# a tiny placeholder that the DataFreePipeline ignores anyway (a sketch of the
# resulting one-shot call follows the calibration block).
calibration:
  dataset: neuralmagic/calibration
  config: LLM
  split: train
  num_samples: 4
  max_seq_length: 512
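
# A rough sketch of the data-free path this config is assumed to drive. The
# llmcompressor calls are the library's documented one-shot API; everything
# else (variable names, how quantise.py wires the config in) is illustrative:
#
#   from transformers import AutoModelForCausalLM
#   from llmcompressor import oneshot
#   from llmcompressor.modifiers.quantization import QuantizationModifier
#
#   model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
#   recipe = QuantizationModifier(
#       targets="Linear",
#       scheme="FP8_DYNAMIC",
#       ignore=ignore_patterns,          # the `ignore` list below
#   )
#   oneshot(model=model, recipe=recipe)  # no calibration dataset is passed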
ignore:
  - lm_head
  - "re:.*visual.*"
  - "re:.*linear_attn.*"  # Mamba/SSM blocks stay in bf16, same rationale as the NVFP4A16 build
  - "re:.*mlp.gate$"
  - "re:.*mlp.shared_expert_gate$"
  - "re:.*mtp.*"

export:
  save_compressed: true
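
# `save_compressed: true` is assumed to map onto llmcompressor's
# compressed-tensors serialisation, roughly (output path illustrative):
#
#   model.save_pretrained(output_dir, save_compressed=True)
#   tokenizer.save_pretrained(output_dir)
#
# The resulting checkpoint loads directly in vLLM, which computes the
# per-token activation scales at runtime (hence "dynamic").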