|
|
from mmengine.config import read_base |
|
|
|
|
|
from opencompass.models import VLLM, HuggingFaceBaseModel, TurboMindModel |
|
|
|
|
|
with read_base(): |
|
|
|
|
|
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ |
|
|
gpqa_datasets |
|
|
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \ |
|
|
gsm8k_datasets |
|
|
from opencompass.configs.datasets.race.race_ppl import \ |
|
|
race_datasets |
|
|
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ |
|
|
winogrande_datasets |
|
|
|
|
|
from ...rjob import eval, infer |
|
|
|
|
|
# Keep only the entry at index 1 of the imported RACE dataset list.
# Presumably this is the 'race-high' subset that the summarizer reports;
# confirm against the imported race_ppl config.
race_datasets = [race_datasets[1]]
|
|
# One entry per inference backend (TurboMindModel, VLLM,
# HuggingFaceBaseModel), all loading the same checkpoint path.
# NOTE(review): every abbr says "qwen3-8b-base" while `path` points at
# 'Qwen/Qwen3-0.6B-Base' -- confirm the mismatch is intentional before
# renaming anything, since abbrs typically end up in result names.
models = [
    # TurboMindModel entry.
    dict(
        type=TurboMindModel,
        abbr='qwen3-8b-base-turbomind',
        path='Qwen/Qwen3-0.6B-Base',
        engine_config=dict(max_batch_size=1, tp=1),
        # top_k=1 with a near-zero temperature: presumably meant to make
        # decoding effectively greedy -- confirm against the engine docs.
        gen_config=dict(
            top_k=1,
            temperature=1e-6,
            top_p=0.9,
            max_new_tokens=2048,
        ),
        max_seq_len=8192,
        max_out_len=2048,
        batch_size=1,
        run_cfg=dict(num_gpus=1),
    ),
    # VLLM entry.
    dict(
        type=VLLM,
        abbr='qwen3-8b-base-vllm',
        path='Qwen/Qwen3-0.6B-Base',
        model_kwargs=dict(
            tensor_parallel_size=1,
            gpu_memory_utilization=0.6,
        ),
        max_seq_len=8192,
        max_out_len=2048,
        batch_size=16,
        generation_kwargs=dict(temperature=0),
        run_cfg=dict(num_gpus=1),
    ),
    # HuggingFaceBaseModel entry; note the shorter max_out_len (1024)
    # compared with the two entries above (2048).
    dict(
        type=HuggingFaceBaseModel,
        abbr='qwen3-8b-base-hf',
        path='Qwen/Qwen3-0.6B-Base',
        max_out_len=1024,
        batch_size=4,
        run_cfg=dict(num_gpus=1),
    ),
]
|
|
# Flatten every variable in scope whose name ends with '_datasets' into a
# single list. Each such variable is expected to be a list, because the
# lists are concatenated onto an empty list via sum().
datasets = sum(
    [
        value
        for name, value in locals().items()
        if name.endswith('_datasets')
    ],
    [],
)

# Set each dataset reader's test_range to '[0:32]' -- presumably limiting
# evaluation to the first 32 samples so the run stays short.
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:32]'
|
|
|
|
|
# Force a batch size of one for entries whose abbr mentions 'turbomind' or
# 'lmdeploy', both in the engine config and on the model entry itself.
for m in models:
    # Entries naming neither backend are left untouched.
    if 'turbomind' not in m['abbr'] and 'lmdeploy' not in m['abbr']:
        continue
    m['engine_config']['max_batch_size'] = 1
    m['batch_size'] = 1

# Order the entries by ascending GPU count. sorted() is stable, so entries
# with the same num_gpus keep their relative order.
models = sorted(models, key=lambda cfg: cfg['run_cfg']['num_gpus'])
|
|
|
|
|
# Summary configuration: report the 'accuracy' metric for each listed
# dataset abbreviation, and collect every variable in scope whose name ends
# with '_summary_groups' into one flat list.
summarizer = dict(
    dataset_abbrs=[
        [abbr, 'accuracy']
        for abbr in ('gsm8k', 'GPQA_diamond', 'race-high', 'winogrande')
    ],
    summary_groups=sum(
        [
            value
            for name, value in locals().items()
            if name.endswith('_summary_groups')
        ],
        [],
    ),
)
|
|
|