name: fp8_dynamic
scheme: FP8_DYNAMIC # FP8 weights, FP8 dynamic per-token activations — data-free, vLLM-native
engine: llmcompressor
# FP8_DYNAMIC needs no calibration data (the activation scale is recomputed per
# input at inference time). quantise.py still expects something; this is a tiny
# placeholder that the DataFreePipeline ignores anyway.
calibration:
  dataset: neuralmagic/calibration
  config: LLM
  split: train
  num_samples: 4
  max_seq_length: 512
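# Modules matched below are excluded from quantization and stay at their
# original precision; entries prefixed "re:" are treated by llmcompressor
# as regex patterns against module names.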
ignore:
  - lm_head
  - "re:.*visual.*"
  - "re:.*linear_attn.*" # Mamba/SSM block stays in bf16; same rationale as the NVFP4A16 build
  - "re:.*mlp.gate$"
  - "re:.*mlp.shared_expert_gate$"
  - "re:.*mtp.*"
export:
  save_compressed: true
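# For reference, a minimal sketch of what this recipe expresses via the
# public llmcompressor API. This is an assumption about intent, not a copy
# of quantise.py, whose actual wiring may differ:
#
#   from llmcompressor import oneshot
#   from llmcompressor.modifiers.quantization import QuantizationModifier
#
#   recipe = QuantizationModifier(
#       targets="Linear",
#       scheme="FP8_DYNAMIC",
#       ignore=["lm_head", "re:.*visual.*", "re:.*linear_attn.*",
#               "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$", "re:.*mtp.*"],
#   )
#   oneshot(model=model, recipe=recipe)  # no dataset: FP8_DYNAMIC is data-free
#   model.save_pretrained("out", save_compressed=True)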