studyOverflow committed on
Commit
c8f2a5f
·
verified ·
1 Parent(s): bffee2e

feat: migrate to MBench-V-new + MBench-A-New (V binary + V pairwise + A pairwise tabs)

Browse files
Files changed (3) hide show
  1. README.md +17 -14
  2. app.py +517 -634
  3. sampling/new_task_pools.json +0 -0
README.md CHANGED
@@ -10,23 +10,26 @@ app_file: app.py
10
  pinned: false
11
  ---
12
 
13
- # MBench-V Human Annotation
14
 
15
- Gradio-based annotation UI for the MBench-V video generation benchmark.
 
16
 
17
- - **Video source (read-only)**: [studyOverflow/TempMemoryData](https://huggingface.co/datasets/studyOverflow/TempMemoryData), streamed directly from HF CDN — videos are **not** copied into this Space.
18
- - **Annotation sink (write)**: the same dataset repo, under `annotations/`. Submissions are batched by `CommitScheduler` and pushed every 5 minutes.
19
- - **Models included (6)**: `causal_forcing`, `self_forcing`, `cosmos`, `helios`, `longlive`, `memflow`. `skyreels` and `longcat` are temporarily excluded because their 0422 generation is still in progress.
20
- - **Tasks**: 584 task_ids × 6 models = **3504** `(model, task_id)` pairs.
21
 
22
- ## How to use
 
 
23
 
24
- 1. Enter your annotator name (anything unique — used to tag your submissions).
25
- 2. Watch the video on the left; read the prompt and metadata in the middle.
26
- 3. Give a score (1–5) and an optional note on the right.
27
- 4. Click **Submit & Next** to move on. Your submissions are auto-committed every 5 min.
28
 
29
- ## Notes
 
30
 
31
- - This is a minimal template. Multi-annotator deduplication, per-user task-allocation, and per-dimension scoring are **not** implemented yet — all annotators currently get a randomly shuffled pool and see tasks in their own order.
32
- - The environment variable `HF_TOKEN` must be set in the Space *Settings → Variables and secrets* with **write** access to `studyOverflow/TempMemoryData`.
 
 
 
 
 
 
10
  pinned: false
11
  ---
12
 
13
+ # MBench Annotation Platform (NEW)
14
 
15
+ Adapted to the new dataset layout (`MBench-V-new` + `MBench-A-New`) on
16
+ [`studyOverflow/TempMemoryData`](https://huggingface.co/datasets/studyOverflow/TempMemoryData).
17
 
18
+ ## Tabs
 
 
 
19
 
20
+ 1. **MBench-V Binary** — single video, "is there a memory issue?" (yes/no)
21
+ 2. **MBench-V Pairwise** — two T2V videos, 5 dimensions
22
+ 3. **MBench-A Pairwise** — two world-model videos, ≤6 dimensions
23
 
24
+ ## Annotation Sink
 
 
 
25
 
26
+ Submissions are pushed to `annotations-new/` on the dataset repo every 5 minutes via
27
+ `CommitScheduler`. Old `annotations/` is left untouched (legacy).
28
 
29
+ ## Migrated Historical Data
30
+
31
+ `annotations-new/` already contains:
32
+ - `migrated_v_binary.jsonl` (642 records from old `ann_bc109d66.jsonl`)
33
+ - `migrated_a_pairwise.jsonl` (821 records from old `ann_mbench_a_*.jsonl`)
34
+
35
+ These are read on startup so existing annotators don't see already-completed tasks again.
app.py CHANGED
@@ -1,18 +1,20 @@
1
  """
2
- MBench Annotation Space — 单视频标注 + MBench-V Pairwise + MBench-A Pairwise
3
-
4
- 功能:
5
- - Tab 1 (单视频标注): "该视频是否出现了记忆问题?" (MBench-V)
6
- - Tab 2 (MBench-V Pairwise): 同一 prompt 下两个 T2V 模型视频并排 (MBench-V)
7
- - Tab 3 (MBench-A Pairwise): 世界模型 401f 视频对比,4子集×多维度 (MBench-A)
8
-
9
- 技术栈:
10
- - Gradio 5.9.1 + FastAPI 视频代理
11
- - HuggingFace CommitScheduler 自动推送标注结果
12
- - 数据来源: studyOverflow/TempMemoryData
13
-
14
- 部署:
15
- 直接替换 HuggingFace Space 的 app.py 即可。
 
 
16
  """
17
  from __future__ import annotations
18
 
@@ -34,184 +36,107 @@ from huggingface_hub import CommitScheduler, HfApi, hf_hub_download, hf_hub_url
34
  # ---------------------------------------------------------------------------
35
 
36
  DATASET_REPO = "studyOverflow/TempMemoryData"
37
- MERGED_JSON_PATH = "MBench-V/merged.json"
38
- MODELS: list[str] = [
39
- "causal_forcing",
40
- "self_forcing",
41
- "cosmos",
42
- "helios",
43
- "longlive",
44
- "memflow",
45
- "longcat",
46
- "skyreels",
47
- ]
48
-
49
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
 
 
 
 
 
50
  ANN_DIR = Path("annotations_local")
51
  ANN_DIR.mkdir(exist_ok=True)
52
  PROCESS_ID = uuid.uuid4().hex[:8]
53
 
54
- # Separate files for annotation types
55
- ANN_FILE_BINARY = ANN_DIR / f"ann_binary_{PROCESS_ID}.jsonl"
56
- ANN_FILE_PAIRWISE = ANN_DIR / f"ann_pairwise_{PROCESS_ID}.jsonl"
57
- ANN_FILE_MBENCH_A = ANN_DIR / f"ann_mbench_a_{PROCESS_ID}.jsonl"
58
 
59
  COMMIT_INTERVAL_MIN = 5
60
  PENDING_TIMEOUT_SEC = 30 * 60
61
 
62
- # MBench-V Pairwise config
63
- PAIRWISE_DIMENSIONS = [
64
- ("entity", "实体一致性", "人物/物体离开画面再回来后,哪个视频中实体外观更一致?"),
65
- ("physical", "物理合理性", "哪个视频中的物理过程(水流/碰撞/变形等)更合理自然?"),
66
- ("prompt", "Prompt 忠实度", "哪个视频的内容更符合下方的文字描述?"),
67
- ]
68
- PAIRWISE_SAMPLES_PER_PAIR = 30
69
-
70
- # ---------------------------------------------------------------------------
71
- # MBench-A Config
72
- # ---------------------------------------------------------------------------
73
-
74
- MBENCH_A_MODELS: list[str] = [
75
- "hy_worldplay",
76
- "infinite_world",
77
- "lingbot_world",
78
- "matrix_game_2",
79
- "matrix_game_3",
80
- "yume",
81
- ]
82
- MBENCH_A_ANNOTATORS_PER_TASK = 3
83
- MBENCH_A_CATEGORY_MAP = {
84
- "environment": "Spatial_401f",
85
- "object": "Spatial_401f",
86
- "human": "Human_401f",
87
- "causal": "Casual_401f",
88
- }
89
- MBENCH_A_GT_CATEGORY_MAP = {
90
- "environment": "Spatial",
91
- "object": "Spatial",
92
- "human": "Human",
93
- "causal": "Casual",
94
- }
95
 
96
  # ---------------------------------------------------------------------------
97
- # Load MBench-V merged.json
98
  # ---------------------------------------------------------------------------
99
 
100
- def _load_merged() -> list[dict[str, Any]]:
101
- try:
102
- local = hf_hub_download(
103
- repo_id=DATASET_REPO,
104
- filename=MERGED_JSON_PATH,
105
- repo_type="dataset",
106
- token=HF_TOKEN,
107
- )
108
  with open(local, encoding="utf-8") as f:
109
  return json.load(f)
110
- except Exception as e:
111
- print(f"[mbench-ann] WARNING: Failed to load MBench-V data: {e}")
112
- return []
113
 
114
- TASKS: list[dict[str, Any]] = _load_merged()
115
- TASK_BY_ID: dict[str, dict[str, Any]] = {t["task_id"]: t for t in TASKS}
 
 
 
116
 
117
- # ---------------------------------------------------------------------------
118
- # Load MBench-A task pool
119
- # ---------------------------------------------------------------------------
120
 
121
- def _load_mbench_a_pool() -> dict[str, Any]:
122
- """Load MBench-A task pool from local file or HF."""
123
- local_path = Path(__file__).parent / "sampling" / "task_pool.json"
124
- if local_path.exists():
125
- with open(local_path, encoding="utf-8") as f:
126
- return json.load(f)
127
- # Fallback: try HF
128
- try:
129
- local = hf_hub_download(
130
- repo_id=DATASET_REPO,
131
- filename="MBench-A/task_pool.json",
132
- repo_type="dataset",
133
- token=HF_TOKEN,
134
- )
135
- with open(local, encoding="utf-8") as f:
136
- return json.load(f)
137
- except Exception as e:
138
- print(f"[mbench-ann] WARNING: Failed to load MBench-A task pool: {e}")
139
- return {"tasks": [], "quality_control_tasks": [], "metadata": {}}
140
-
141
- MBENCH_A_POOL = _load_mbench_a_pool()
142
- MBENCH_A_TASKS: list[dict] = MBENCH_A_POOL.get("tasks", []) + MBENCH_A_POOL.get("quality_control_tasks", [])
143
- MBENCH_A_TASK_BY_ID: dict[str, dict] = {t["task_id"]: t for t in MBENCH_A_TASKS}
144
 
145
  # ---------------------------------------------------------------------------
146
- # MBench-V Pool setup
147
  # ---------------------------------------------------------------------------
148
 
149
- BINARY_POOL: list[tuple[str, str]] = [(m, t["task_id"]) for m in MODELS for t in TASKS]
150
- BINARY_POOL_SET: set[tuple[str, str]] = set(BINARY_POOL)
151
 
152
- def _build_pairwise_pool() -> list[tuple[str, str, str, str]]:
153
- pool = []
154
- task_ids = [t["task_id"] for t in TASKS[:PAIRWISE_SAMPLES_PER_PAIR]]
155
- for tid in task_ids:
156
- for i, m_a in enumerate(MODELS):
157
- for m_b in MODELS[i+1:]:
158
- for dim_key, _, _ in PAIRWISE_DIMENSIONS:
159
- pool.append((tid, m_a, m_b, dim_key))
160
- return pool
161
-
162
- PAIRWISE_POOL: list[tuple[str, str, str, str]] = _build_pairwise_pool()
163
- PAIRWISE_POOL_SET: set[tuple[str, str, str, str]] = set(PAIRWISE_POOL)
164
-
165
- print(f"[mbench-ann] MBench-V: {len(TASKS)} tasks × {len(MODELS)} models")
166
- print(f"[mbench-ann] MBench-V binary pool: {len(BINARY_POOL)}, pairwise pool: {len(PAIRWISE_POOL)}")
167
- print(f"[mbench-ann] MBench-A: {len(MBENCH_A_TASKS)} tasks, {len(MBENCH_A_POOL.get('metadata', {}))} metadata")
 
 
 
168
 
169
  # ---------------------------------------------------------------------------
170
- # Video URL helpers
171
  # ---------------------------------------------------------------------------
172
 
173
- def _video_url(model: str, task_id: str) -> str:
174
- return f"/video/{model}/{task_id}.mp4"
175
 
176
- def _hf_video_url(model: str, task_id: str) -> str:
177
  return hf_hub_url(
178
  DATASET_REPO,
179
- filename=f"MBench-V/{model}/videos/{task_id}.mp4",
180
  repo_type="dataset",
181
  )
182
 
183
- def _mbench_a_video_proxy_url(model: str, subset: str, sample_id: str) -> str:
184
- """Build local proxy URL for MBench-A video."""
185
- category = MBENCH_A_CATEGORY_MAP[subset]
186
- return f"/video_a/{model}/{category}/{sample_id}/left_then_right.mp4"
187
-
188
- def _mbench_a_hf_video_url(model: str, category: str, sample_id: str) -> str:
189
- """Build HF upstream URL for MBench-A video."""
190
- return hf_hub_url(
191
- DATASET_REPO,
192
- filename=f"MBench-A/{model}/{category}/{sample_id}/left_then_right.mp4",
193
- repo_type="dataset",
194
- )
195
 
196
- def _mbench_a_asset_hf_url(path: str) -> str:
197
- """Build HF URL for MBench-A assets."""
198
  return hf_hub_url(
199
  DATASET_REPO,
200
- filename=f"MBench-A/assets/{path}",
201
  repo_type="dataset",
202
  )
203
 
204
- def _extract_prompt(task: dict[str, Any]) -> str:
205
- gp = task.get("generation_prompts") or {}
206
- prompts = gp.get("prompts") or {}
207
- for level in ("level_3", "level_4", "level_2", "level_1"):
208
- val = prompts.get(level)
209
- if isinstance(val, list) and val:
210
- n = len(val)
211
- return "\n\n".join(f"— 第 {i}/{n} 段 —\n{seg}" for i, seg in enumerate(val, 1))
212
- if isinstance(val, str) and val:
213
- return val
214
- return "(no prompt found)"
215
 
216
  def _render_video_html(url: str) -> str:
217
  return (
@@ -221,94 +146,7 @@ def _render_video_html(url: str) -> str:
221
  )
222
 
223
  # ---------------------------------------------------------------------------
224
- # MBench-A: Auxiliary info rendering
225
- # ---------------------------------------------------------------------------
226
-
227
- def _render_mbench_a_aux(task: dict) -> str:
228
- """Render auxiliary HTML info based on task subset."""
229
- subset = task["subset"]
230
-
231
- # Use CSS class for guaranteed visibility (Gradio themes can override inline styles)
232
- box = 'class="aux-info-box"'
233
-
234
- # Camera motion info (shown for ALL subsets)
235
- motion = task.get("camera_motion", "left_then_right")
236
- motion_desc = task.get("camera_motion_description", motion)
237
- gif_url = _mbench_a_asset_hf_url(f"camera_diagrams/{motion}.gif")
238
- camera_html = (
239
- f'<div style="flex:0 0 200px">'
240
- f'<p><b>🎬 预期相机运动</b></p>'
241
- f'<p style="margin:0 0 8px">{motion_desc}</p>'
242
- f'<img src="{gif_url}" style="width:180px">'
243
- f'</div>'
244
- )
245
-
246
- # Caption (shown for ALL subsets now)
247
- caption = task.get("caption", "")
248
- caption_html = ""
249
- if caption:
250
- caption_html = (
251
- f'<div style="flex:1;min-width:250px">'
252
- f'<p><b>📝 场景描述</b></p>'
253
- f'<p style="font-size:14px;line-height:1.5">{caption}</p>'
254
- f'</div>'
255
- )
256
-
257
- if subset == "object":
258
- sample_id = task["sample_id"]
259
- mask_url = _mbench_a_asset_hf_url(f"mask_viz/{sample_id}.png")
260
- return (
261
- f'<div {box}>'
262
- f'<p><b>🎯 请关注画面中被标注(高亮)的物体</b></p>'
263
- f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start;margin-top:8px">'
264
- f'<div style="flex:1;min-width:300px">'
265
- f'<img src="{mask_url}" style="max-width:100%;max-height:280px">'
266
- f'</div>'
267
- f'{camera_html}'
268
- f'{caption_html}'
269
- f'</div></div>'
270
- )
271
-
272
- elif subset == "causal":
273
- return (
274
- f'<div {box}>'
275
- f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start">'
276
- f'{camera_html}'
277
- f'{caption_html}'
278
- f'</div></div>'
279
- )
280
-
281
- elif subset == "human":
282
- return (
283
- f'<div {box}>'
284
- f'<p><b>👤 请关注视频中的人物</b>:观察人物离开画面再回来后,面部和外观是否保持一致。</p>'
285
- f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start;margin-top:8px">'
286
- f'{camera_html}'
287
- f'{caption_html}'
288
- f'</div></div>'
289
- )
290
-
291
- else: # environment
292
- return (
293
- f'<div {box}>'
294
- f'<p><b>🏞️ 请关注整体场景</b>:观察相机转回来后,场景的布局、风格、光照是否保持一致。</p>'
295
- f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start;margin-top:8px">'
296
- f'{camera_html}'
297
- f'{caption_html}'
298
- f'</div></div>'
299
- )
300
- return (
301
- f'<div {box}>'
302
- f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start">'
303
- f'<div style="flex:1;min-width:250px">'
304
- f'<p><b>🏞️ 请关注整体场景</b>:观察相机转回来后,场景的布局、风格、光照是否保持一致。</p>'
305
- f'</div>'
306
- f'{camera_html}'
307
- f'</div></div>'
308
- )
309
-
310
- # ---------------------------------------------------------------------------
311
- # CommitScheduler
312
  # ---------------------------------------------------------------------------
313
 
314
  scheduler: CommitScheduler | None = None
@@ -317,7 +155,7 @@ if HF_TOKEN:
317
  repo_id=DATASET_REPO,
318
  repo_type="dataset",
319
  folder_path=str(ANN_DIR),
320
- path_in_repo="annotations",
321
  every=COMMIT_INTERVAL_MIN,
322
  token=HF_TOKEN,
323
  private=False,
@@ -325,20 +163,21 @@ if HF_TOKEN:
325
  )
326
 
327
  # ---------------------------------------------------------------------------
328
- # Historical annotations
329
  # ---------------------------------------------------------------------------
330
 
331
- def _fetch_remote_annotations() -> list[dict[str, Any]]:
332
- records: list[dict[str, Any]] = []
333
  try:
334
  api = HfApi(token=HF_TOKEN)
335
  files = api.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset")
336
  except Exception:
337
  return records
338
- jsonls = [p for p in files if p.startswith("annotations/") and p.endswith(".jsonl")]
339
  for path in jsonls:
340
  try:
341
- local = hf_hub_download(repo_id=DATASET_REPO, filename=path, repo_type="dataset", token=HF_TOKEN)
 
342
  with open(local, encoding="utf-8") as f:
343
  for line in f:
344
  line = line.strip()
@@ -351,7 +190,8 @@ def _fetch_remote_annotations() -> list[dict[str, Any]]:
351
  pass
352
  return records
353
 
354
- HISTORICAL = _fetch_remote_annotations()
 
355
 
356
  # ---------------------------------------------------------------------------
357
  # Shared state
@@ -359,49 +199,43 @@ HISTORICAL = _fetch_remote_annotations()
359
 
360
  STATE_LOCK = threading.Lock()
361
 
362
- # Binary state
363
- BINARY_SUBMITTED: set[tuple[str, str]] = {
364
- (r["model"], r["task_id"]) for r in HISTORICAL
365
- if r.get("type", "binary") == "binary" and "model" in r and "task_id" in r
366
- and (r["model"], r["task_id"]) in BINARY_POOL_SET
367
- }
368
- BINARY_PENDING: dict[tuple[str, str], tuple[str, float]] = {}
369
-
370
- # MBench-V Pairwise state
371
- PAIRWISE_SUBMITTED: set[tuple[str, str, str, str]] = {
372
- (r["task_id"], r["model_a"], r["model_b"], r["dimension"])
373
- for r in HISTORICAL
374
- if r.get("type") == "pairwise"
375
- and all(k in r for k in ("task_id", "model_a", "model_b", "dimension"))
376
- }
377
- PAIRWISE_PENDING: dict[tuple[str, str, str, str], tuple[str, float]] = {}
378
 
379
- # MBench-A state: task_id -> list of annotators who completed it
380
- MBENCH_A_COMPLETED: dict[str, list[str]] = defaultdict(list)
381
  for r in HISTORICAL:
382
- if r.get("type") == "pairwise_mbench_a" and "task_id" in r and "annotator" in r:
383
- tid = r["task_id"]
384
- # Handle old format where task_id might be stored differently
385
- if tid in MBENCH_A_TASK_BY_ID:
386
- MBENCH_A_COMPLETED[tid].append(r["annotator"])
 
 
 
 
 
 
387
 
388
- MBENCH_A_PENDING: dict[str, tuple[str, float]] = {}
 
 
389
 
390
- print(f"[mbench-ann] binary submitted: {len(BINARY_SUBMITTED)}")
391
- print(f"[mbench-ann] pairwise submitted: {len(PAIRWISE_SUBMITTED)}")
392
- print(f"[mbench-ann] MBench-A completed: {sum(len(v) for v in MBENCH_A_COMPLETED.values())} annotations across {len(MBENCH_A_COMPLETED)} tasks")
393
 
394
  # ---------------------------------------------------------------------------
395
- # Queue helpers
396
  # ---------------------------------------------------------------------------
397
 
398
- def _reap_expired(pending_dict):
399
  now = time.time()
400
- expired = [k for k, (_, ts) in pending_dict.items() if now - ts > PENDING_TIMEOUT_SEC]
401
  for k in expired:
402
- pending_dict.pop(k, None)
403
 
404
- def _append_annotation(record: dict[str, Any], ann_file: Path) -> None:
405
  line = json.dumps(record, ensure_ascii=False)
406
  if scheduler is not None:
407
  with scheduler.lock:
@@ -411,394 +245,422 @@ def _append_annotation(record: dict[str, Any], ann_file: Path) -> None:
411
  with ann_file.open("a", encoding="utf-8") as f:
412
  f.write(line + "\n")
413
 
 
 
 
 
 
 
 
 
 
 
 
414
  # ---------------------------------------------------------------------------
415
- # Binary annotation callbacks (MBench-V)
416
  # ---------------------------------------------------------------------------
417
 
418
- def binary_start(annotator: str, state: dict):
419
  annotator = (annotator or "").strip()
420
  if not annotator:
421
- return state, "<p>请输入名字</p>", "", "", "⚠️ 请输入名字", ""
422
- order = list(range(len(BINARY_POOL)))
423
  random.shuffle(order)
424
- state = {"annotator": annotator, "order": order, "idx": 0, "current": None, "count": 0}
425
- return _binary_next(state)
 
 
 
426
 
427
- def _binary_next(state):
428
  annotator = state["annotator"]
429
  order = state["order"]
430
  idx = state.get("idx", 0)
431
  with STATE_LOCK:
432
- _reap_expired(BINARY_PENDING)
433
  while idx < len(order):
434
- mt = BINARY_POOL[order[idx]]
435
- if mt in BINARY_SUBMITTED or mt in BINARY_PENDING:
436
- idx += 1
437
- continue
438
- BINARY_PENDING[mt] = (annotator, time.time())
 
 
 
 
439
  state["idx"] = idx
440
- state["current"] = mt
441
- model, task_id = mt
442
- task = TASK_BY_ID[task_id]
443
- video_html = _render_video_html(_video_url(model, task_id))
444
- meta = f"**模型**: `{model}` | **task_id**: `{task_id}` | **已提交**: {state['count']}"
445
- prompt = _extract_prompt(task)
446
- n_sub = len(BINARY_SUBMITTED)
447
- stats = f"全局进度: {n_sub}/{len(BINARY_POOL)} ({100*n_sub/len(BINARY_POOL):.1f}%)"
448
- return state, video_html, meta, prompt, f"✅ 已加载", stats
 
 
 
 
 
449
  state["current"] = None
450
- return state, "<p>🎉 全部完成!</p>", "全部标注完成", "", "完成", f"已完成 {len(BINARY_SUBMITTED)}/{len(BINARY_POOL)}"
451
 
452
- def binary_submit(state, verdict, note):
453
  if not state or not state.get("current"):
454
- return state, "<p>请先登录</p>", "", "", "", "", "⚠️", ""
455
- mt = state["current"]
456
- model, task_id = mt
457
  record = {
458
- "type": "binary",
459
  "timestamp": time.time(),
460
  "annotator": state["annotator"],
461
- "model": model,
462
- "task_id": task_id,
 
 
 
 
 
463
  "memory_issue": verdict == "是",
464
  "verdict": verdict,
465
  "note": (note or "").strip(),
466
  }
467
- _append_annotation(record, ANN_FILE_BINARY)
468
  with STATE_LOCK:
469
- BINARY_PENDING.pop(mt, None)
470
- BINARY_SUBMITTED.add(mt)
471
  state["count"] = state.get("count", 0) + 1
472
  state["idx"] = state["idx"] + 1
473
  state["current"] = None
474
- result = _binary_next(state)
475
- return result[0], result[1], result[2], result[3], "否", "", f"✅ 已提交 {state['count']}", result[5]
476
 
477
- def binary_skip(state):
478
  if not state or not state.get("current"):
479
- return state, "<p>请先登录</p>", "", "", "", "", "⚠️", ""
480
- mt = state["current"]
481
  with STATE_LOCK:
482
- BINARY_PENDING.pop(mt, None)
483
  state["idx"] = state["idx"] + 1
484
  state["current"] = None
485
- result = _binary_next(state)
486
- return result[0], result[1], result[2], result[3], "否", "", "⏭️ 已跳过", result[5]
487
 
488
  # ---------------------------------------------------------------------------
489
- # MBench-V Pairwise annotation callbacks
490
  # ---------------------------------------------------------------------------
491
 
492
- def pairwise_start(annotator: str, dimension: str, state: dict):
493
  annotator = (annotator or "").strip()
494
  if not annotator:
495
- return state, "<p>请先输入名字。</p>", "<p></p>", "", "", "⚠️ 请输入名字", ""
496
- dim_pool = [(i, item) for i, item in enumerate(PAIRWISE_POOL) if item[3] == dimension]
497
- order = list(range(len(dim_pool)))
 
 
498
  random.shuffle(order)
499
- state = {
500
- "annotator": annotator, "dimension": dimension, "dim_pool": dim_pool,
501
- "order": order, "idx": 0, "current": None, "count": 0,
502
- }
503
- return _pairwise_next(state)
504
 
505
- def _pairwise_next(state):
506
  annotator = state["annotator"]
507
- dim_pool = state["dim_pool"]
508
  order = state["order"]
509
  idx = state.get("idx", 0)
510
- dimension = state["dimension"]
511
- dim_label = dimension
512
- dim_question = ""
513
- for dk, dl, dq in PAIRWISE_DIMENSIONS:
514
- if dk == dimension:
515
- dim_label = dl
516
- dim_question = dq
517
- break
518
  with STATE_LOCK:
519
- _reap_expired(PAIRWISE_PENDING)
520
  while idx < len(order):
521
- pool_idx, item = dim_pool[order[idx]]
522
- tid, m_a, m_b = item[0], item[1], item[2]
523
- if item in PAIRWISE_SUBMITTED or item in PAIRWISE_PENDING:
524
- idx += 1
525
- continue
526
- PAIRWISE_PENDING[item] = (annotator, time.time())
 
 
 
527
  state["idx"] = idx
528
- state["current"] = item
 
 
529
  if random.random() < 0.5:
530
- left_model, right_model = m_a, m_b
531
- state["swapped"] = False
532
  else:
533
- left_model, right_model = m_b, m_a
534
- state["swapped"] = True
535
- task = TASK_BY_ID[tid]
536
- video_a_html = _render_video_html(_video_url(left_model, tid))
537
- video_b_html = _render_video_html(_video_url(right_model, tid))
538
- prompt = _extract_prompt(task)
539
- meta = f"**维度**: {dim_label} | **问题**: {dim_question}\n\n**已提交**: {state['count']}"
540
- n_sub = sum(1 for x in PAIRWISE_SUBMITTED if x[3] == dimension)
541
- n_total = len(dim_pool)
542
- stats = f"维度「{dim_label}」进度: {n_sub}/{n_total} ({100*n_sub/n_total:.1f}%)"
543
- return state, video_a_html, video_b_html, meta, prompt, "✅ 已加载", stats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
544
  state["current"] = None
545
- return state, "<p>🎉 该维度全部完成!</p>", "", "全部完成", "", "完成", ""
 
 
546
 
547
- def pairwise_submit(state, verdict, note):
548
  if not state or not state.get("current"):
549
- return state, "", "", "", "", "⚠️ 请先登录", ""
550
- item = state["current"]
551
- tid, m_a, m_b, dimension = item
552
- swapped = state.get("swapped", False)
553
- if verdict == "左边更好":
554
- winner = m_b if swapped else m_a
555
- elif verdict == "右边更好":
556
- winner = m_a if swapped else m_b
557
- else:
558
- winner = "tie"
 
 
 
 
 
 
 
 
 
559
  record = {
560
- "type": "pairwise",
561
  "timestamp": time.time(),
562
  "annotator": state["annotator"],
563
  "task_id": tid,
564
- "model_a": m_a,
565
- "model_b": m_b,
566
- "dimension": dimension,
567
- "winner": winner,
568
- "verdict_raw": verdict,
 
 
 
 
569
  "swapped": swapped,
570
  "note": (note or "").strip(),
571
  }
572
- _append_annotation(record, ANN_FILE_PAIRWISE)
573
  with STATE_LOCK:
574
- PAIRWISE_PENDING.pop(item, None)
575
- PAIRWISE_SUBMITTED.add(item)
576
  state["count"] = state.get("count", 0) + 1
577
  state["idx"] = state["idx"] + 1
578
  state["current"] = None
579
- result = _pairwise_next(state)
580
- return result[0], result[1], result[2], result[3], result[4], f"✅ 已提交第 {state['count']} 条", result[6]
581
 
582
- def pairwise_skip(state):
583
  if not state or not state.get("current"):
584
- return state, "", "", "", "", "⚠️ 请先登录", ""
585
- item = state["current"]
 
 
586
  with STATE_LOCK:
587
- PAIRWISE_PENDING.pop(item, None)
588
  state["idx"] = state["idx"] + 1
589
  state["current"] = None
590
- result = _pairwise_next(state)
591
- return result[0], result[1], result[2], result[3], result[4], "⏭️ 已跳过", result[6]
592
 
593
  # ---------------------------------------------------------------------------
594
- # MBench-A Pairwise annotation callbacks
595
  # ---------------------------------------------------------------------------
596
 
597
- def mbench_a_start(annotator: str, state: dict):
598
- """Login for MBench-A annotation."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
  annotator = (annotator or "").strip()
600
  if not annotator:
 
601
  return (state, "⚠️ 请输入名字", "", "", "", "",
602
- gr.update(visible=False), gr.update(visible=False),
603
- gr.update(visible=False), gr.update(visible=False),
604
- gr.update(visible=False),
605
- "", "")
606
- # Count how many tasks this annotator has already completed.
607
- # Check both:
608
- # 1. MBENCH_A_COMPLETED (loaded from HF at startup + updated in-memory during this session)
609
- # 2. The local annotation file (captures annotations made this session before any push)
610
- historical_count = sum(
611
- 1 for anns in MBENCH_A_COMPLETED.values()
612
- if annotator in anns
613
- )
614
- # Also scan the local file in case this session's annotations haven't been pushed yet
615
- if ANN_FILE_MBENCH_A.exists():
616
- with ANN_FILE_MBENCH_A.open() as f:
617
- for line in f:
618
- line = line.strip()
619
- if not line:
620
- continue
621
- try:
622
- r = json.loads(line)
623
- if r.get("annotator") == annotator and r.get("type") == "pairwise_mbench_a":
624
- tid = r.get("task_id", "")
625
- # Only count if not already counted in MBENCH_A_COMPLETED
626
- if tid in MBENCH_A_TASK_BY_ID and annotator not in MBENCH_A_COMPLETED.get(tid, []):
627
- historical_count += 1
628
- except Exception:
629
- pass
630
-
631
- # Shuffle task order for this annotator
632
- order = list(range(len(MBENCH_A_TASKS)))
633
  random.shuffle(order)
634
- state = {
635
- "annotator": annotator,
636
- "order": order,
637
- "idx": 0,
638
- "current_task_id": None,
639
- "swapped": False,
640
- "left_model": None,
641
- "right_model": None,
642
- "count": historical_count,
643
- }
644
- return _mbench_a_next(state)
645
-
646
 
647
- def _mbench_a_next(state: dict):
648
- """Find and load the next available MBench-A task."""
649
  annotator = state["annotator"]
650
  order = state["order"]
651
  idx = state.get("idx", 0)
652
-
653
  with STATE_LOCK:
654
- _reap_expired(MBENCH_A_PENDING)
655
  while idx < len(order):
656
- task = MBENCH_A_TASKS[order[idx]]
657
  tid = task["task_id"]
658
-
659
- # Skip if already fully annotated
660
- if len(MBENCH_A_COMPLETED.get(tid, [])) >= MBENCH_A_ANNOTATORS_PER_TASK:
661
- idx += 1
662
- continue
663
- # Skip if this annotator already did it
664
- if annotator in MBENCH_A_COMPLETED.get(tid, []):
665
- idx += 1
666
- continue
667
- # Skip if currently pending by someone else
668
- if tid in MBENCH_A_PENDING and MBENCH_A_PENDING[tid][0] != annotator:
669
- idx += 1
670
- continue
671
-
672
- # Assign this task
673
- MBENCH_A_PENDING[tid] = (annotator, time.time())
674
  state["idx"] = idx
675
- state["current_task_id"] = tid
676
 
677
- # Randomly swap A/B
678
- m_a, m_b = task["model_a"], task["model_b"]
679
  if random.random() < 0.5:
680
- state["left_model"], state["right_model"] = m_a, m_b
681
- state["swapped"] = False
682
  else:
683
- state["left_model"], state["right_model"] = m_b, m_a
684
- state["swapped"] = True
685
-
686
- # Build UI outputs
687
- subset = task["subset"]
688
- video_left = _render_video_html(
689
- _mbench_a_video_proxy_url(state["left_model"], subset, task["sample_id"]))
690
- video_right = _render_video_html(
691
- _mbench_a_video_proxy_url(state["right_model"], subset, task["sample_id"]))
692
-
693
- aux_html = _render_mbench_a_aux(task)
694
 
695
- # Dimension questions
696
  dimensions = task["dimensions"]
697
- dim_questions = task.get("dimension_questions", {})
698
-
699
- # Build question radio updates (max 5)
700
  q_updates = []
701
  for i in range(6):
702
  if i < len(dimensions):
703
- dim_key = dimensions[i]
704
- question_text = dim_questions.get(dim_key, dim_key)
705
- q_updates.append(gr.update(
706
- visible=True,
707
- label=question_text,
708
- value="差不多",
709
- ))
710
  else:
711
  q_updates.append(gr.update(visible=False, value="差不多"))
712
 
713
- # Meta info
714
- subset_names = {"environment": "🏞️ Environment", "object": "🎯 Object",
715
- "human": "👤 Human", "causal": "⚡ Causal"}
716
- n_done = sum(1 for t in MBENCH_A_TASKS
717
- if len(MBENCH_A_COMPLETED.get(t["task_id"], [])) >= MBENCH_A_ANNOTATORS_PER_TASK)
718
- meta = (f"**子集**: {subset_names.get(subset, subset)} | "
719
- f"**已提交**: {state['count']}")
720
- stats = (f"全局进度: {n_done}/{len(MBENCH_A_TASKS)} tasks 完成 | "
721
- f"你已标注: {state['count']}")
722
-
723
- return (state, "✅ 已加载", aux_html, video_left, video_right, meta,
724
  *q_updates, "", stats)
 
 
 
 
725
 
726
- # All done
727
- state["current_task_id"] = None
728
- empty_q = gr.update(visible=False, value="差不多")
729
- return (state, "🎉 全部完成!", "", "<p>所有任务已完成</p>", "", "全部完成",
730
- empty_q, empty_q, empty_q, empty_q, empty_q, empty_q, "", "")
731
-
732
-
733
- def mbench_a_submit(state, q1_val, q2_val, q3_val, q4_val, q5_val, q6_val, note):
734
- """Submit MBench-A multi-dimension annotation."""
735
- if not state or not state.get("current_task_id"):
736
- empty_q = gr.update(visible=False, value="差不多")
737
  return (state, "⚠️ 请先登录", "", "", "", "",
738
- empty_q, empty_q, empty_q, empty_q, empty_q, empty_q, "", "")
739
-
740
- tid = state["current_task_id"]
741
- task = MBENCH_A_TASK_BY_ID[tid]
742
- dimensions = task["dimensions"]
743
  swapped = state["swapped"]
744
- m_a, m_b = task["model_a"], task["model_b"]
745
-
746
- # Map verdicts to winners
747
- verdicts = [q1_val, q2_val, q3_val, q4_val, q5_val, q6_val]
748
  dim_results = {}
749
- for i, dim_key in enumerate(dimensions):
750
  v = verdicts[i]
751
  if v == "A更好":
752
- # A is left; if swapped, left is model_b
753
- winner = m_b if swapped else m_a
754
  elif v == "B更好":
755
- winner = m_a if swapped else m_b
756
  else:
757
  winner = "tie"
758
- dim_results[dim_key] = winner
759
 
 
 
760
  record = {
761
- "type": "pairwise_mbench_a",
762
  "timestamp": time.time(),
763
  "annotator": state["annotator"],
764
  "task_id": tid,
 
765
  "subset": task["subset"],
766
  "sample_id": task["sample_id"],
767
- "camera_motion": task.get("camera_motion", "left_then_right"),
768
- "model_a": m_a,
769
- "model_b": m_b,
 
 
 
770
  "dimensions": dim_results,
771
  "swapped": swapped,
772
  "note": (note or "").strip(),
773
  }
774
- _append_annotation(record, ANN_FILE_MBENCH_A)
775
-
776
  with STATE_LOCK:
777
- MBENCH_A_PENDING.pop(tid, None)
778
- MBENCH_A_COMPLETED[tid].append(state["annotator"])
779
-
780
  state["count"] = state.get("count", 0) + 1
781
  state["idx"] = state["idx"] + 1
782
- state["current_task_id"] = None
783
-
784
- return _mbench_a_next(state)
785
-
786
 
787
- def mbench_a_skip(state):
788
- """Skip current MBench-A task."""
789
- if not state or not state.get("current_task_id"):
790
- empty_q = gr.update(visible=False, value="差不多")
791
  return (state, "⚠️ 请先登录", "", "", "", "",
792
- empty_q, empty_q, empty_q, empty_q, empty_q, empty_q, "", "")
793
-
794
- tid = state["current_task_id"]
795
  with STATE_LOCK:
796
- MBENCH_A_PENDING.pop(tid, None)
797
-
798
  state["idx"] = state["idx"] + 1
799
- state["current_task_id"] = None
800
- return _mbench_a_next(state)
801
-
802
 
803
  # ---------------------------------------------------------------------------
804
  # UI
@@ -806,62 +668,101 @@ def mbench_a_skip(state):
806
 
807
  CUSTOM_CSS = """
808
  #prompt_box textarea { height: 300px !important; overflow-y: auto !important; }
809
- .video-pair { display: flex; gap: 12px; }
810
- .video-pair > div { flex: 1; }
811
- /* Force aux info box to be visible regardless of Gradio theme */
812
  .aux-info-box {
813
- background: #e3e8ef !important;
814
- color: #111 !important;
815
- padding: 14px !important;
816
- border-radius: 8px !important;
817
- margin-bottom: 12px !important;
818
- border: 1px solid #b0b8c4 !important;
819
- }
820
- .aux-info-box * {
821
- color: #111 !important;
822
- }
823
- .aux-info-box img {
824
- border: 1px solid #999;
825
- border-radius: 4px;
826
  }
 
 
827
  """
828
 
829
- with gr.Blocks(title="MBench 标注", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
830
- gr.Markdown("# 🎬 MBench 视频标注平台")
831
 
832
  with gr.Tabs():
833
- # ═══════════════ MBench-A Pairwise ═══════════════
834
- with gr.Tab("MBench-A 对比 (World Models)"):
835
- gr.Markdown(
836
- "## 🌍 MBench-A — 世界模型记忆能力评测\n\n"
837
- "比较两个世界模型生成的长视频(~25 秒),评估相机转走再转回来后的记忆一致性。\n\n"
838
- "**视频 A/B 的模型身份已匿名随机分配。请对每个维度独立判断。**"
839
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840
  a_stats = gr.Markdown("")
841
  a_state = gr.State({})
842
-
843
  with gr.Row():
844
- a_name = gr.Textbox(label="标注员名字", placeholder="例如: charlie", scale=4)
845
  a_login = gr.Button("开始标注", variant="primary", scale=1)
846
-
847
  a_status = gr.Markdown("")
848
-
849
- # Auxiliary info (mask image / camera GIF + caption / instructions)
850
  a_aux = gr.HTML("")
851
-
852
- # Video pair
853
  with gr.Row(equal_height=True):
854
  with gr.Column(scale=1, min_width=360):
855
  gr.Markdown("### 视频 A")
856
- a_video_left = gr.HTML("<p>请先登录</p>")
857
  with gr.Column(scale=1, min_width=360):
858
  gr.Markdown("### 视频 B")
859
- a_video_right = gr.HTML("<p>请先登录</p>")
860
-
861
- # Task info
862
- a_meta = gr.Markdown("")
863
-
864
- # Multi-dimension questions (max 6, dynamically shown/hidden)
865
  gr.Markdown("---\n### 请对以下每个维度分别判断:")
866
  a_q1 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 1", visible=False)
867
  a_q2 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 2", visible=False)
@@ -869,22 +770,17 @@ with gr.Blocks(title="MBench 标注", theme=gr.themes.Soft(), css=CUSTOM_CSS) as
869
  a_q4 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 4", visible=False)
870
  a_q5 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 5", visible=False)
871
  a_q6 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 6", visible=False)
872
-
873
  a_note = gr.Textbox(label="备注(可选)", lines=1)
874
-
875
  with gr.Row():
876
  a_submit = gr.Button("✅ 提交并下一组", variant="primary")
877
  a_skip = gr.Button("⏭️ 跳过")
878
-
879
- # Wiring
880
- a_all_outs = [a_state, a_status, a_aux, a_video_left, a_video_right, a_meta,
881
- a_q1, a_q2, a_q3, a_q4, a_q5, a_q6, a_note, a_stats]
882
-
883
- a_login.click(mbench_a_start, [a_name, a_state], a_all_outs)
884
- a_name.submit(mbench_a_start, [a_name, a_state], a_all_outs)
885
- a_submit.click(mbench_a_submit,
886
- [a_state, a_q1, a_q2, a_q3, a_q4, a_q5, a_q6, a_note], a_all_outs)
887
- a_skip.click(mbench_a_skip, [a_state], a_all_outs)
888
 
889
  # ---------------------------------------------------------------------------
890
  # Video proxy
@@ -899,7 +795,6 @@ if __name__ == "__main__":
899
  _video_client = httpx.AsyncClient(timeout=30.0, follow_redirects=True)
900
 
901
  async def _do_proxy(upstream: str, request: Request):
902
- """Generic proxy for HF video/asset URLs."""
903
  req_headers = {}
904
  if (rng := request.headers.get("range")):
905
  req_headers["range"] = rng
@@ -910,13 +805,13 @@ if __name__ == "__main__":
910
  )
911
  except Exception as e:
912
  raise HTTPException(502, f"upstream fetch failed: {e}")
913
- passthrough_headers = {}
914
  for h in ("content-type", "content-length", "accept-ranges",
915
  "content-range", "etag", "last-modified"):
916
  if h in upstream_resp.headers:
917
- passthrough_headers[h] = upstream_resp.headers[h]
918
- passthrough_headers.setdefault("content-type", "video/mp4")
919
- passthrough_headers["cache-control"] = "public, max-age=300"
920
 
921
  async def _body():
922
  try:
@@ -924,43 +819,31 @@ if __name__ == "__main__":
924
  yield chunk
925
  finally:
926
  await upstream_resp.aclose()
 
927
 
928
- return StreamingResponse(_body(), status_code=upstream_resp.status_code, headers=passthrough_headers)
929
-
930
- async def _proxy_video(model: str, task_id: str, request: Request):
931
- """Proxy MBench-V videos."""
932
- if model not in MODELS or task_id not in TASK_BY_ID:
933
- raise HTTPException(404, "unknown (model, task_id)")
934
- upstream = _hf_video_url(model, task_id)
935
  return await _do_proxy(upstream, request)
936
 
937
- async def _proxy_mbench_a_video(model: str, category: str, sample_id: str, request: Request):
938
- """Proxy MBench-A videos."""
939
- if model not in MBENCH_A_MODELS:
940
- raise HTTPException(404, f"unknown model: {model}")
941
- upstream = _mbench_a_hf_video_url(model, category, sample_id)
942
  return await _do_proxy(upstream, request)
943
 
944
- _orig_create_app = _GradioApp.create_app
945
-
946
- def _patched_create_app(*args, **kwargs):
947
- app = _orig_create_app(*args, **kwargs)
948
- # MBench-V video proxy
949
- app.add_api_route(
950
- "/video/{model}/{task_id}.mp4",
951
- _proxy_video,
952
- methods=["GET", "HEAD"],
953
- include_in_schema=False,
954
- )
955
- # MBench-A video proxy
956
- app.add_api_route(
957
- "/video_a/{model}/{category}/{sample_id}/left_then_right.mp4",
958
- _proxy_mbench_a_video,
959
- methods=["GET", "HEAD"],
960
- include_in_schema=False,
961
- )
962
- print("[mbench-ann] video proxy routes registered (MBench-V + MBench-A)")
963
  return app
 
964
 
965
- _GradioApp.create_app = staticmethod(_patched_create_app)
966
  demo.queue(default_concurrency_limit=16).launch(ssr_mode=False)
 
1
  """
2
+ MBench Annotation Space (NEW) adapted for MBench-V-new + MBench-A-New.
3
+
4
+ Tabs:
5
+ 1. MBench-V Binary ─ "该视频是否出现了记忆问题?" (单视频, 1 标注员/任务)
6
+ 2. MBench-V Pairwise ─ 双视频, 5 维度对比 (3 标注员/任务)
7
+ 3. MBench-A Pairwise ─ 双视频, ≤6 维度对比 (3 标注员/任务)
8
+
9
+ Data sources:
10
+ - Videos: streamed from studyOverflow/TempMemoryData (MBench-V-new + MBench-A-New).
11
+ - Task pools: sampling/new_task_pools.json
12
+ - Sample metadata: sample.json under MBench-{V,A}-New/samples/{subset}/{sid}/
13
+ - Annotation sink: annotations-new/ on the dataset repo (CommitScheduler, 5 min cadence).
14
+
15
+ Notes:
16
+ - All paths use the new structure (subset names: environment/object/human/causal).
17
+ - Old annotations in annotations/ are preserved; this app writes only to annotations-new/.
18
  """
19
  from __future__ import annotations
20
 
 
36
  # ---------------------------------------------------------------------------
37
 
38
  DATASET_REPO = "studyOverflow/TempMemoryData"
 
 
 
 
 
 
 
 
 
 
 
 
39
  HF_TOKEN = os.environ.get("HF_TOKEN")
40
+
41
+ V_MODELS = ["causal_forcing", "self_forcing", "cosmos", "helios",
42
+ "longlive", "memflow", "skyreels", "longcat"]
43
+ A_MODELS = ["hy_worldplay", "infinite_world", "lingbot_world",
44
+ "matrix_game_2", "matrix_game_3", "yume"]
45
+
46
  ANN_DIR = Path("annotations_local")
47
  ANN_DIR.mkdir(exist_ok=True)
48
  PROCESS_ID = uuid.uuid4().hex[:8]
49
 
50
+ ANN_FILE_V_BINARY = ANN_DIR / f"v_binary_{PROCESS_ID}.jsonl"
51
+ ANN_FILE_V_PAIRWISE = ANN_DIR / f"v_pairwise_{PROCESS_ID}.jsonl"
52
+ ANN_FILE_A_PAIRWISE = ANN_DIR / f"a_pairwise_{PROCESS_ID}.jsonl"
 
53
 
54
  COMMIT_INTERVAL_MIN = 5
55
  PENDING_TIMEOUT_SEC = 30 * 60
56
 
57
+ V_BINARY_ANNOTATORS_PER_TASK = 1
58
+ V_PAIRWISE_ANNOTATORS_PER_TASK = 3
59
+ A_PAIRWISE_ANNOTATORS_PER_TASK = 3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  # ---------------------------------------------------------------------------
62
+ # Load task pools
63
  # ---------------------------------------------------------------------------
64
 
65
+ def _load_pools() -> dict:
66
+ local = Path(__file__).parent / "sampling" / "new_task_pools.json"
67
+ if local.exists():
 
 
 
 
 
68
  with open(local, encoding="utf-8") as f:
69
  return json.load(f)
70
+ raise RuntimeError(f"Task pool not found at {local}")
 
 
71
 
72
# Task pools, loaded once at import time.
POOLS = _load_pools()
_a_pool = POOLS["a_pairwise"]

V_BINARY_TASKS: list[dict] = POOLS["v_binary"]["tasks"]
V_PAIRWISE_TASKS: list[dict] = POOLS["v_pairwise"]["tasks"]
# The A pool is the regular tasks followed by the quality-control tasks.
A_PAIRWISE_TASKS: list[dict] = _a_pool["tasks"] + _a_pool["quality_control_tasks"]

# task_id -> task lookup tables used by the submit handlers.
V_BINARY_BY_ID = {task["task_id"]: task for task in V_BINARY_TASKS}
V_PAIRWISE_BY_ID = {task["task_id"]: task for task in V_PAIRWISE_TASKS}
A_PAIRWISE_BY_ID = {task["task_id"]: task for task in A_PAIRWISE_TASKS}

for _label, _tasks in (
    ("V binary", V_BINARY_TASKS),
    ("V pairwise", V_PAIRWISE_TASKS),
    ("A pairwise", A_PAIRWISE_TASKS),
):
    print(f"[ann-new] {_label} tasks: {len(_tasks)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  # ---------------------------------------------------------------------------
87
+ # Sample metadata cache (sample.json)
88
  # ---------------------------------------------------------------------------
89
 
90
+ _sample_cache: dict[tuple[str, str, str], dict] = {}
91
+ _sample_cache_lock = threading.Lock()
92
 
93
+ def _load_sample_meta(dataset: str, subset: str, sample_id: str) -> dict:
94
+ key = (dataset, subset, sample_id)
95
+ with _sample_cache_lock:
96
+ if key in _sample_cache:
97
+ return _sample_cache[key]
98
+ if dataset == "mbenchv":
99
+ path = f"MBench-V-new/samples/{subset}/{sample_id}/sample.json"
100
+ else:
101
+ path = f"MBench-A-New/samples/{subset}/{sample_id}/sample.json"
102
+ try:
103
+ local = hf_hub_download(DATASET_REPO, path, repo_type="dataset", token=HF_TOKEN)
104
+ with open(local, encoding="utf-8") as f:
105
+ data = json.load(f)
106
+ except Exception as e:
107
+ print(f"[ann-new] sample.json load failed for {key}: {e}")
108
+ data = {}
109
+ with _sample_cache_lock:
110
+ _sample_cache[key] = data
111
+ return data
112
 
113
  # ---------------------------------------------------------------------------
114
+ # Video URL helpers (proxy)
115
  # ---------------------------------------------------------------------------
116
 
117
+ def _v_video_proxy_url(model: str, subset: str, sample_id: str) -> str:
118
+ return f"/video_v/{model}/{subset}/{sample_id}.mp4"
119
 
120
def _v_video_hf_url(model: str, subset: str, sample_id: str) -> str:
    """Return the Hub URL of one MBench-V video (``text`` condition)."""
    repo_path = f"MBench-V-new/models/{model}/outputs/{subset}/{sample_id}/text/video.mp4"
    return hf_hub_url(DATASET_REPO, filename=repo_path, repo_type="dataset")
126
 
127
+ def _a_video_proxy_url(model: str, subset: str, sample_id: str, condition_id: str) -> str:
128
+ return f"/video_a/{model}/{subset}/{sample_id}/{condition_id}.mp4"
 
 
 
 
 
 
 
 
 
 
129
 
130
def _a_video_hf_url(model: str, subset: str, sample_id: str, condition_id: str) -> str:
    """Return the Hub URL of one MBench-A video for the given condition."""
    repo_path = (
        f"MBench-A-New/models/{model}/outputs/{subset}/{sample_id}/{condition_id}/video.mp4"
    )
    return hf_hub_url(DATASET_REPO, filename=repo_path, repo_type="dataset")
136
 
137
def _a_asset_hf_url(path: str) -> str:
    """Return the Hub URL of a file under the old ``MBench-A/assets/`` directory.

    The camera diagrams and mask visualisations are reused from that directory.
    """
    asset_path = f"MBench-A/assets/{path}"
    return hf_hub_url(DATASET_REPO, filename=asset_path, repo_type="dataset")
 
 
 
 
 
 
 
 
140
 
141
  def _render_video_html(url: str) -> str:
142
  return (
 
146
  )
147
 
148
  # ---------------------------------------------------------------------------
149
+ # CommitScheduler -> annotations-new/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  # ---------------------------------------------------------------------------
151
 
152
  scheduler: CommitScheduler | None = None
 
155
  repo_id=DATASET_REPO,
156
  repo_type="dataset",
157
  folder_path=str(ANN_DIR),
158
+ path_in_repo="annotations-new",
159
  every=COMMIT_INTERVAL_MIN,
160
  token=HF_TOKEN,
161
  private=False,
 
163
  )
164
 
165
  # ---------------------------------------------------------------------------
166
+ # Load historical annotations (from annotations-new/)
167
  # ---------------------------------------------------------------------------
168
 
169
+ def _fetch_annotations_new() -> list[dict]:
170
+ records = []
171
  try:
172
  api = HfApi(token=HF_TOKEN)
173
  files = api.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset")
174
  except Exception:
175
  return records
176
+ jsonls = [p for p in files if p.startswith("annotations-new/") and p.endswith(".jsonl")]
177
  for path in jsonls:
178
  try:
179
+ local = hf_hub_download(repo_id=DATASET_REPO, filename=path,
180
+ repo_type="dataset", token=HF_TOKEN)
181
  with open(local, encoding="utf-8") as f:
182
  for line in f:
183
  line = line.strip()
 
190
  pass
191
  return records
192
 
193
+ HISTORICAL = _fetch_annotations_new()
194
+ print(f"[ann-new] historical records loaded: {len(HISTORICAL)}")
195
 
196
  # ---------------------------------------------------------------------------
197
  # Shared state
 
199
 
200
  STATE_LOCK = threading.Lock()
201
 
202
# Per tab: task_id -> set of annotators who have completed that task.
V_BINARY_COMPLETED: dict[str, set[str]] = defaultdict(set)
V_PAIRWISE_COMPLETED: dict[str, set[str]] = defaultdict(set)
A_PAIRWISE_COMPLETED: dict[str, set[str]] = defaultdict(set)

# Record "type" -> (known tasks of that tab, completion table to fill).
_REPLAY_TARGETS = {
    "v_binary": (V_BINARY_BY_ID, V_BINARY_COMPLETED),
    "v_pairwise": (V_PAIRWISE_BY_ID, V_PAIRWISE_COMPLETED),
    "a_pairwise": (A_PAIRWISE_BY_ID, A_PAIRWISE_COMPLETED),
}

# Replay previously committed records so finished work is not handed out again.
for _rec in HISTORICAL:
    _kind = _rec.get("type")
    _tid = _rec.get("task_id")
    _who = _rec.get("annotator")
    if not (_tid and _who):
        continue
    _target = _REPLAY_TARGETS.get(_kind) if isinstance(_kind, str) else None
    if _target is None:
        continue
    _known_tasks, _completed = _target
    # Records for tasks no longer in the pool are ignored.
    if _tid in _known_tasks:
        _completed[_tid].add(_who)

# Per tab: task_id -> (annotator holding the claim, claim timestamp).
V_BINARY_PENDING: dict[str, tuple[str, float]] = {}
V_PAIRWISE_PENDING: dict[str, tuple[str, float]] = {}
A_PAIRWISE_PENDING: dict[str, tuple[str, float]] = {}

print(f"[ann-new] V binary: {sum(len(v) for v in V_BINARY_COMPLETED.values())} annotations on {len(V_BINARY_COMPLETED)} tasks")
print(f"[ann-new] V pairwise: {sum(len(v) for v in V_PAIRWISE_COMPLETED.values())} on {len(V_PAIRWISE_COMPLETED)} tasks")
print(f"[ann-new] A pairwise: {sum(len(v) for v in A_PAIRWISE_COMPLETED.values())} on {len(A_PAIRWISE_COMPLETED)} tasks")
227
 
228
  # ---------------------------------------------------------------------------
229
+ # Helpers
230
  # ---------------------------------------------------------------------------
231
 
232
def _reap_expired(pending):
    """Drop claims from *pending* that are older than ``PENDING_TIMEOUT_SEC``.

    *pending* maps task_id -> (annotator, claim timestamp) and is modified in
    place.
    """
    now = time.time()
    # Iterate over a snapshot because entries are removed during the walk.
    for task_id, (_owner, claimed_at) in list(pending.items()):
        if now - claimed_at > PENDING_TIMEOUT_SEC:
            pending.pop(task_id, None)
237
 
238
+ def _append(record: dict, ann_file: Path):
239
  line = json.dumps(record, ensure_ascii=False)
240
  if scheduler is not None:
241
  with scheduler.lock:
 
245
  with ann_file.open("a", encoding="utf-8") as f:
246
  f.write(line + "\n")
247
 
248
+ def _format_caption(meta: dict) -> str:
249
+ """Render caption(_segments) as readable text."""
250
+ if not meta:
251
+ return ""
252
+ if meta.get("caption"):
253
+ return meta["caption"]
254
+ segs = meta.get("caption_segments")
255
+ if segs:
256
+ return "\n\n".join(f"— 第 {i}/{len(segs)} 段 —\n{s}" for i, s in enumerate(segs, 1))
257
+ return ""
258
+
259
  # ---------------------------------------------------------------------------
260
+ # V Binary
261
  # ---------------------------------------------------------------------------
262
 
263
def v_binary_start(annotator: str, state: dict):
    """Log an annotator into the V-binary tab and serve their first task.

    Args:
        annotator: Name typed into the login box; surrounding whitespace is
            stripped.
        state: Current per-session state; returned untouched when no name was
            given.

    Returns:
        A 6-tuple ``(state, video_html, info, prompt, status, stats)``.
    """
    annotator = (annotator or "").strip()
    if not annotator:
        return state, "<p>请输入名字</p>", "", "", "⚠️", ""
    order = list(range(len(V_BINARY_TASKS)))
    random.shuffle(order)
    # Submit handlers insert keys into V_BINARY_COMPLETED under STATE_LOCK.
    # Iterating it unlocked can raise "dictionary changed size during
    # iteration", so count under the same lock.
    with STATE_LOCK:
        n_done = sum(1 for done_by in V_BINARY_COMPLETED.values()
                     if annotator in done_by)
    state = {"annotator": annotator, "order": order, "idx": 0,
             "current": None, "count": n_done}
    return _v_binary_next(state)
274
 
275
def _v_binary_next(state):
    """Claim the next available V-binary task for this session and render it.

    Starting at ``state["idx"]``, walks the session's shuffled order and skips
    tasks that already have enough annotations, that this annotator has
    already done, or that another annotator currently holds.

    Returns:
        A 6-tuple ``(state, video_html, info, prompt, status, stats)``. When
        nothing is left, ``state["current"]`` is cleared and a completion
        message is returned.
    """
    annotator = state["annotator"]
    order = state["order"]
    idx = state.get("idx", 0)

    claimed = None
    with STATE_LOCK:
        _reap_expired(V_BINARY_PENDING)
        while idx < len(order):
            task = V_BINARY_TASKS[order[idx]]
            tid = task["task_id"]
            done_by = V_BINARY_COMPLETED.get(tid, ())
            holder = V_BINARY_PENDING.get(tid)
            if (len(done_by) >= V_BINARY_ANNOTATORS_PER_TASK
                    or annotator in done_by
                    or (holder is not None and holder[0] != annotator)):
                idx += 1
                continue
            V_BINARY_PENDING[tid] = (annotator, time.time())
            claimed = task
            break
        n_done = sum(1 for v in V_BINARY_COMPLETED.values() if v)

    if claimed is None:
        state["current"] = None
        return state, "<p>🎉 全部完成!</p>", "", "", "完成", ""

    state["idx"] = idx
    state["current"] = claimed["task_id"]
    model = claimed["model_id"]
    subset = claimed["subset"]
    sid = claimed["sample_id"]

    # Rendering happens after STATE_LOCK is released: _load_sample_meta may
    # block on a network download, and holding the global lock meanwhile
    # would stall every other annotator.
    video_html = _render_video_html(_v_video_proxy_url(model, subset, sid))
    meta = _load_sample_meta("mbenchv", subset, sid)
    prompt = _format_caption(meta)
    info = (f"**模型**: `{model}` | **子集**: `{subset}` | "
            f"**sample**: `{sid[:24]}...` | **已提交**: {state['count']}")
    stats = f"全局进度: {n_done}/{len(V_BINARY_TASKS)} ({100*n_done/len(V_BINARY_TASKS):.1f}%)"
    return state, video_html, info, prompt, "✅ 已加载", stats
308
 
309
def v_binary_submit(state, verdict, note):
    """Record the verdict for the current V-binary task and load the next one.

    Args:
        state: Per-session state; must hold a claimed task in ``"current"``.
        verdict: Selected radio value; ``"是"`` means a memory issue was seen.
        note: Optional free-text remark.

    Returns:
        An 8-tuple ``(state, video_html, info, prompt, status, stats,
        verdict_reset, note_reset)``.
    """
    if not state or not state.get("current"):
        # Reset the radio to its default "否". An empty string is not one of
        # its choices.
        return state, "<p>请先登录</p>", "", "", "⚠️", "", "否", ""
    tid = state["current"]
    task = V_BINARY_BY_ID[tid]
    record = {
        "type": "v_binary",
        "timestamp": time.time(),
        "annotator": state["annotator"],
        "task_id": tid,
        "dataset_id": "mbenchv",
        "model_id": task["model_id"],
        "subset": task["subset"],
        "sample_id": task["sample_id"],
        "condition_id": "text",
        "item_id": f'{task["subset"]}:{task["sample_id"]}:text',
        "memory_issue": verdict == "是",
        "verdict": verdict,
        "note": (note or "").strip(),
    }
    _append(record, ANN_FILE_V_BINARY)
    with STATE_LOCK:
        V_BINARY_PENDING.pop(tid, None)
        V_BINARY_COMPLETED[tid].add(state["annotator"])
    state["count"] = state.get("count", 0) + 1
    state["idx"] = state["idx"] + 1
    state["current"] = None
    res = _v_binary_next(state)
    # Replace the loader's status with a submit confirmation and reset inputs.
    return res[0], res[1], res[2], res[3], f"✅ 已提交 {state['count']}", res[5], "否", ""
338
 
339
def v_binary_skip(state):
    """Release the current V-binary task without annotating and load the next.

    Returns:
        An 8-tuple ``(state, video_html, info, prompt, status, stats,
        verdict_reset, note_reset)``.
    """
    if not state or not state.get("current"):
        # Match v_binary_submit: keep the login hint in the video pane and
        # reset the radio to a valid choice rather than an empty string.
        return state, "<p>请先登录</p>", "", "", "⚠️", "", "否", ""
    tid = state["current"]
    with STATE_LOCK:
        # Drop the claim so another annotator can pick the task up.
        V_BINARY_PENDING.pop(tid, None)
    state["idx"] = state["idx"] + 1
    state["current"] = None
    res = _v_binary_next(state)
    return res[0], res[1], res[2], res[3], "⏭️ 已跳过", res[5], "否", ""
349
 
350
  # ---------------------------------------------------------------------------
351
+ # V Pairwise
352
  # ---------------------------------------------------------------------------
353
 
354
def v_pairwise_start(annotator: str, state: dict):
    """Log an annotator into the V-pairwise tab and serve their first task.

    Returns:
        A 13-tuple ``(state, status, video_left, video_right, info, prompt,
        q1..q5, note, stats)``.
    """
    annotator = (annotator or "").strip()
    if not annotator:
        empty = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请输入名字", "", "", "", "",
                empty, empty, empty, empty, empty, "", "")
    # Submit handlers add keys to V_PAIRWISE_COMPLETED under STATE_LOCK.
    # Count under the same lock so the dict cannot change size mid-iteration.
    with STATE_LOCK:
        n_done = sum(1 for done_by in V_PAIRWISE_COMPLETED.values()
                     if annotator in done_by)
    order = list(range(len(V_PAIRWISE_TASKS)))
    random.shuffle(order)
    state = {"annotator": annotator, "order": order, "idx": 0,
             "current": None, "swapped": False, "count": n_done}
    return _v_pairwise_next(state)
 
 
366
 
367
def _v_pairwise_next(state):
    """Claim the next available V-pairwise task for this session and render it.

    The two models are assigned to the left/right player at random, and the
    choice is stored in ``state["swapped"]`` so the submit handler can map
    "A"/"B" verdicts back to models.

    Returns:
        A 13-tuple ``(state, status, video_left, video_right, info, prompt,
        q1..q5, note, stats)``.
    """
    annotator = state["annotator"]
    order = state["order"]
    idx = state.get("idx", 0)

    claimed = None
    with STATE_LOCK:
        _reap_expired(V_PAIRWISE_PENDING)
        while idx < len(order):
            task = V_PAIRWISE_TASKS[order[idx]]
            tid = task["task_id"]
            done_by = V_PAIRWISE_COMPLETED.get(tid, ())
            holder = V_PAIRWISE_PENDING.get(tid)
            if (len(done_by) >= V_PAIRWISE_ANNOTATORS_PER_TASK
                    or annotator in done_by
                    or (holder is not None and holder[0] != annotator)):
                idx += 1
                continue
            V_PAIRWISE_PENDING[tid] = (annotator, time.time())
            claimed = task
            break
        n_done = sum(1 for v in V_PAIRWISE_COMPLETED.values()
                     if len(v) >= V_PAIRWISE_ANNOTATORS_PER_TASK)

    if claimed is None:
        state["current"] = None
        empty = gr.update(visible=False, value="差不多")
        return (state, "🎉 全部完成", "", "", "全部完成", "",
                empty, empty, empty, empty, empty, "", "")

    state["idx"] = idx
    state["current"] = claimed["task_id"]

    ma, mb = claimed["model_a"], claimed["model_b"]
    if random.random() < 0.5:
        left, right = ma, mb
        state["swapped"] = False
    else:
        left, right = mb, ma
        state["swapped"] = True

    subset = claimed["subset"]
    sid = claimed["sample_id"]
    # Rendering happens after STATE_LOCK is released: _load_sample_meta may
    # block on a network download and must not stall other annotators.
    video_l = _render_video_html(_v_video_proxy_url(left, subset, sid))
    video_r = _render_video_html(_v_video_proxy_url(right, subset, sid))
    meta = _load_sample_meta("mbenchv", subset, sid)
    prompt = _format_caption(meta)

    # A missing question text falls back to the dimension key, as in _a_next.
    dim_questions = claimed.get("dimension_questions", {})
    dimensions = claimed["dimensions"]
    q_updates = []
    for i in range(5):
        if i < len(dimensions):
            qtext = dim_questions.get(dimensions[i], dimensions[i])
            q_updates.append(gr.update(visible=True, label=qtext, value="差不多"))
        else:
            q_updates.append(gr.update(visible=False, value="差不多"))

    subset_emoji = {"environment": "🏞️", "object": "🎯", "human": "👤", "causal": "⚡"}
    info = (f"**子集**: {subset_emoji.get(subset, '')} {subset} | "
            f"**已提交**: {state['count']}")
    stats = f"全局进度: {n_done}/{len(V_PAIRWISE_TASKS)} 任务完成"
    return (state, "✅ 已加载", video_l, video_r, info, prompt,
            *q_updates, "", stats)
419
 
420
def v_pairwise_submit(state, q1, q2, q3, q4, q5, note):
    """Record per-dimension verdicts for the current V-pairwise task.

    Verdicts refer to the on-screen videos A (left) and B (right), and
    ``state["swapped"]`` says whether left was ``model_b``. Each verdict is
    stored as the winning model id, or ``"tie"``.

    Returns:
        The 13-tuple produced by ``_v_pairwise_next`` for the following task.
    """
    if not state or not state.get("current"):
        empty = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请先登录", "", "", "", "",
                empty, empty, empty, empty, empty, "", "")
    tid = state["current"]
    task = V_PAIRWISE_BY_ID[tid]
    swapped = state["swapped"]
    ma, mb = task["model_a"], task["model_b"]
    left_model, right_model = (mb, ma) if swapped else (ma, mb)
    winner_of = {"A更好": left_model, "B更好": right_model}

    # zip() stops at the number of radios the UI has. A task listing more
    # dimensions than that no longer raises IndexError; only rendered
    # dimensions are recorded.
    dim_results = {
        dim: winner_of.get(verdict, "tie")
        for dim, verdict in zip(task["dimensions"], (q1, q2, q3, q4, q5))
    }

    record = {
        "type": "v_pairwise",
        "timestamp": time.time(),
        "annotator": state["annotator"],
        "task_id": tid,
        "dataset_id": "mbenchv",
        "subset": task["subset"],
        "sample_id": task["sample_id"],
        "condition_id": "text",
        "model_a": ma,
        "model_b": mb,
        "item_a": f'{task["subset"]}:{task["sample_id"]}:text|{ma}',
        "item_b": f'{task["subset"]}:{task["sample_id"]}:text|{mb}',
        "dimensions": dim_results,
        "swapped": swapped,
        "note": (note or "").strip(),
    }
    _append(record, ANN_FILE_V_PAIRWISE)
    with STATE_LOCK:
        V_PAIRWISE_PENDING.pop(tid, None)
        V_PAIRWISE_COMPLETED[tid].add(state["annotator"])
    state["count"] = state.get("count", 0) + 1
    state["idx"] = state["idx"] + 1
    state["current"] = None
    return _v_pairwise_next(state)
 
466
 
467
def v_pairwise_skip(state):
    """Release the current V-pairwise task unannotated and move to the next."""
    current = state.get("current") if state else None
    if not current:
        hidden = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请先登录", "", "", "", "",
                *([hidden] * 5), "", "")
    with STATE_LOCK:
        # Give the claim back so someone else can take the task.
        V_PAIRWISE_PENDING.pop(current, None)
    state["idx"] += 1
    state["current"] = None
    return _v_pairwise_next(state)
 
478
 
479
  # ---------------------------------------------------------------------------
480
+ # A Pairwise (adapted from old app, with new paths)
481
  # ---------------------------------------------------------------------------
482
 
483
def _render_a_aux(task: dict) -> str:
    """Build the auxiliary HTML panel shown above an MBench-A video pair.

    The panel always holds the expected-camera-motion diagram and, when the
    task has one, the scene caption. ``object`` tasks add the mask
    visualisation, while ``human`` and ``environment`` tasks add a focus hint.
    Any other subset value gets the environment hint.

    Text and URLs taken from the task are HTML-escaped, so a caption
    containing ``<``, ``&`` or quotes cannot break or inject markup.
    """
    import html  # local import: only this function needs it

    def esc(value) -> str:
        return html.escape(str(value), quote=True)

    subset = task["subset"]
    box = 'class="aux-info-box"'
    row_style = "display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start"

    motion = task.get("camera_motion", "left_then_right")
    motion_desc = task.get("camera_motion_description", motion)
    gif_url = _a_asset_hf_url(f"camera_diagrams/{motion}.gif")
    camera_html = (
        f'<div style="flex:0 0 200px">'
        f'<p><b>🎬 预期相机运动</b></p>'
        f'<p style="margin:0 0 8px">{esc(motion_desc)}</p>'
        f'<img src="{esc(gif_url)}" style="width:180px">'
        f'</div>'
    )
    caption = task.get("caption", "")
    caption_html = (
        f'<div style="flex:1;min-width:250px">'
        f'<p><b>📝 场景描述</b></p>'
        f'<p style="font-size:14px;line-height:1.5">{esc(caption)}</p>'
        f'</div>'
    ) if caption else ""

    if subset == "object":
        # Mask visualisations live in the old MBench-A asset directory.
        mask_url = _a_asset_hf_url(f"mask_viz/{task['sample_id']}.png")
        return (
            f'<div {box}>'
            f'<p><b>🎯 请关注画面中被标注(高亮)的物体</b></p>'
            f'<div style="{row_style};margin-top:8px">'
            f'<div style="flex:1;min-width:300px">'
            f'<img src="{esc(mask_url)}" style="max-width:100%;max-height:280px"></div>'
            f'{camera_html}{caption_html}</div></div>'
        )
    if subset == "human":
        return (
            f'<div {box}>'
            f'<p><b>👤 请关注视频中的人物</b>:观察人物离开画面再回来后,面部和外观是否保持一致。</p>'
            f'<div style="{row_style};margin-top:8px">'
            f'{camera_html}{caption_html}</div></div>'
        )
    if subset == "causal":
        # No focus hint for causal tasks, so no top margin on the row.
        return (
            f'<div {box}>'
            f'<div style="{row_style}">'
            f'{camera_html}{caption_html}</div></div>'
        )
    # environment (and any unrecognised subset)
    return (
        f'<div {box}>'
        f'<p><b>🏞️ 请关注整体场景</b>:观察相机转回来后,场景的布局/风格/光照是否保持一致。</p>'
        f'<div style="{row_style};margin-top:8px">'
        f'{camera_html}{caption_html}</div></div>'
    )
536
+
537
def a_start(annotator: str, state: dict):
    """Log an annotator into the A-pairwise tab and serve their first task.

    Returns:
        A 14-tuple ``(state, status, aux, video_left, video_right, info,
        q1..q6, note, stats)``.
    """
    annotator = (annotator or "").strip()
    if not annotator:
        empty = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请输入名字", "", "", "", "",
                empty, empty, empty, empty, empty, empty, "", "")
    # Submit handlers add keys to A_PAIRWISE_COMPLETED under STATE_LOCK.
    # Count under the same lock so the dict cannot change size mid-iteration.
    with STATE_LOCK:
        n_done = sum(1 for done_by in A_PAIRWISE_COMPLETED.values()
                     if annotator in done_by)
    order = list(range(len(A_PAIRWISE_TASKS)))
    random.shuffle(order)
    state = {"annotator": annotator, "order": order, "idx": 0,
             "current": None, "swapped": False, "count": n_done}
    return _a_next(state)
 
 
 
 
 
 
 
 
 
549
 
550
def _a_next(state):
    """Claim the next available A-pairwise task for this session and render it.

    The two models are assigned to the left/right player at random, and the
    choice is stored in ``state["swapped"]``. The video condition id is the
    task's camera motion followed by ``_25s``.

    Returns:
        A 14-tuple ``(state, status, aux, video_left, video_right, info,
        q1..q6, note, stats)``.
    """
    annotator = state["annotator"]
    order = state["order"]
    idx = state.get("idx", 0)

    claimed = None
    with STATE_LOCK:
        _reap_expired(A_PAIRWISE_PENDING)
        while idx < len(order):
            task = A_PAIRWISE_TASKS[order[idx]]
            tid = task["task_id"]
            done_by = A_PAIRWISE_COMPLETED.get(tid, ())
            holder = A_PAIRWISE_PENDING.get(tid)
            if (len(done_by) >= A_PAIRWISE_ANNOTATORS_PER_TASK
                    or annotator in done_by
                    or (holder is not None and holder[0] != annotator)):
                idx += 1
                continue
            A_PAIRWISE_PENDING[tid] = (annotator, time.time())
            claimed = task
            break
        n_done = sum(1 for v in A_PAIRWISE_COMPLETED.values()
                     if len(v) >= A_PAIRWISE_ANNOTATORS_PER_TASK)

    if claimed is None:
        state["current"] = None
        empty = gr.update(visible=False, value="差不多")
        return (state, "🎉 全部完成", "", "", "", "全部完成",
                empty, empty, empty, empty, empty, empty, "", "")

    state["idx"] = idx
    state["current"] = claimed["task_id"]

    ma, mb = claimed["model_a"], claimed["model_b"]
    if random.random() < 0.5:
        left, right = ma, mb
        state["swapped"] = False
    else:
        left, right = mb, ma
        state["swapped"] = True

    # Rendering is done outside STATE_LOCK. Only the claim needs the lock.
    subset = claimed["subset"]
    sid = claimed["sample_id"]
    motion = claimed.get("camera_motion", "left_then_right")
    cond = f"{motion}_25s"
    video_l = _render_video_html(_a_video_proxy_url(left, subset, sid, cond))
    video_r = _render_video_html(_a_video_proxy_url(right, subset, sid, cond))
    aux = _render_a_aux(claimed)

    dimensions = claimed["dimensions"]
    dim_q = claimed.get("dimension_questions", {})
    q_updates = []
    for i in range(6):
        if i < len(dimensions):
            qtext = dim_q.get(dimensions[i], dimensions[i])
            q_updates.append(gr.update(visible=True, label=qtext, value="差不多"))
        else:
            q_updates.append(gr.update(visible=False, value="差不多"))

    subset_emoji = {"environment": "🏞️", "object": "🎯", "human": "👤", "causal": "⚡"}
    info = f"**子集**: {subset_emoji.get(subset, '')} {subset} | **已提交**: {state['count']}"
    stats = f"全局进度: {n_done}/{len(A_PAIRWISE_TASKS)} 任务完成"
    # Same "loaded" status text as the V tabs.
    return (state, "✅ 已加载", aux, video_l, video_r, info,
            *q_updates, "", stats)
602
 
603
def a_submit(state, q1, q2, q3, q4, q5, q6, note):
    """Record per-dimension verdicts for the current A-pairwise task.

    Verdicts refer to the on-screen videos A (left) and B (right), and
    ``state["swapped"]`` says whether left was ``model_b``. Each verdict is
    stored as the winning model id, or ``"tie"``.

    Returns:
        The 14-tuple produced by ``_a_next`` for the following task.
    """
    if not state or not state.get("current"):
        empty = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请先登录", "", "", "", "",
                empty, empty, empty, empty, empty, empty, "", "")
    tid = state["current"]
    task = A_PAIRWISE_BY_ID[tid]
    swapped = state["swapped"]
    ma, mb = task["model_a"], task["model_b"]
    left_model, right_model = (mb, ma) if swapped else (ma, mb)
    winner_of = {"A更好": left_model, "B更好": right_model}

    # zip() stops at the six radios the UI has. A task listing more
    # dimensions no longer raises IndexError; only rendered dimensions are
    # recorded.
    dim_results = {
        dim: winner_of.get(verdict, "tie")
        for dim, verdict in zip(task["dimensions"], (q1, q2, q3, q4, q5, q6))
    }

    motion = task.get("camera_motion", "left_then_right")
    cond = f"{motion}_25s"
    record = {
        "type": "a_pairwise",
        "timestamp": time.time(),
        "annotator": state["annotator"],
        "task_id": tid,
        "dataset_id": "mbencha",
        "subset": task["subset"],
        "sample_id": task["sample_id"],
        "condition_id": cond,
        "model_a": ma,
        "model_b": mb,
        "item_a": f'{task["subset"]}:{task["sample_id"]}:{cond}|{ma}',
        "item_b": f'{task["subset"]}:{task["sample_id"]}:{cond}|{mb}',
        "camera_motion": motion,
        "dimensions": dim_results,
        "swapped": swapped,
        "note": (note or "").strip(),
    }
    _append(record, ANN_FILE_A_PAIRWISE)
    with STATE_LOCK:
        A_PAIRWISE_PENDING.pop(tid, None)
        A_PAIRWISE_COMPLETED[tid].add(state["annotator"])
    state["count"] = state.get("count", 0) + 1
    state["idx"] = state["idx"] + 1
    state["current"] = None
    return _a_next(state)
 
 
652
 
653
def a_skip(state):
    """Release the current A-pairwise task unannotated and move to the next."""
    current = state.get("current") if state else None
    if not current:
        hidden = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请先登录", "", "", "", "",
                *([hidden] * 6), "", "")
    with STATE_LOCK:
        # Give the claim back so someone else can take the task.
        A_PAIRWISE_PENDING.pop(current, None)
    state["idx"] += 1
    state["current"] = None
    return _a_next(state)
 
664
 
665
  # ---------------------------------------------------------------------------
666
  # UI
 
668
 
669
  CUSTOM_CSS = """
670
  #prompt_box textarea { height: 300px !important; overflow-y: auto !important; }
 
 
 
671
  .aux-info-box {
672
+ background: #e3e8ef !important; color: #111 !important;
673
+ padding: 14px !important; border-radius: 8px !important;
674
+ margin-bottom: 12px !important; border: 1px solid #b0b8c4 !important;
 
 
 
 
 
 
 
 
 
 
675
  }
676
+ .aux-info-box * { color: #111 !important; }
677
+ .aux-info-box img { border: 1px solid #999; border-radius: 4px; }
678
  """
679
 
680
+ with gr.Blocks(title="MBench 标注 (NEW)", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
681
+ gr.Markdown("# 🎬 MBench 视频标注平台 (新结构)")
682
 
683
  with gr.Tabs():
684
+ # ───── V Binary ─────
685
+ with gr.Tab("MBench-V Binary"):
686
+ gr.Markdown("## 📺 MBench-V — 单视频记忆问题判断\n\n"
687
+ "请观看视频并阅读 prompt,判断是否出现了**记忆问题**(场景/物体/人物前后不一致)。")
688
+ vb_stats = gr.Markdown("")
689
+ vb_state = gr.State({})
690
+ with gr.Row():
691
+ vb_name = gr.Textbox(label="标注员名字", placeholder="例如: charlie", scale=4)
692
+ vb_login = gr.Button("开始标注", variant="primary", scale=1)
693
+ vb_status = gr.Markdown("")
694
+ vb_video = gr.HTML("<p>请先登录</p>")
695
+ vb_info = gr.Markdown("")
696
+ vb_prompt = gr.Textbox(label="Prompt / 文本描述", lines=10, elem_id="prompt_box")
697
+ vb_verdict = gr.Radio(["是", "否"], value="否", label="是否出现了记忆问题?")
698
+ vb_note = gr.Textbox(label="备注(可选)", lines=1)
699
+ with gr.Row():
700
+ vb_submit = gr.Button("✅ 提交并下一组", variant="primary")
701
+ vb_skip = gr.Button("⏭️ 跳过")
702
+ vb_outs = [vb_state, vb_video, vb_info, vb_prompt, vb_status, vb_stats, vb_verdict, vb_note]
703
+ vb_login.click(v_binary_start, [vb_name, vb_state],
704
+ [vb_state, vb_video, vb_info, vb_prompt, vb_status, vb_stats])
705
+ vb_name.submit(v_binary_start, [vb_name, vb_state],
706
+ [vb_state, vb_video, vb_info, vb_prompt, vb_status, vb_stats])
707
+ vb_submit.click(v_binary_submit, [vb_state, vb_verdict, vb_note], vb_outs)
708
+ vb_skip.click(v_binary_skip, [vb_state], vb_outs)
709
+
710
+ # ───── V Pairwise ─────
711
+ with gr.Tab("MBench-V Pairwise"):
712
+ gr.Markdown("## 🎬 MBench-V — 双视频对比 (5 维度)\n\n"
713
+ "比较两个 T2V 模型生成的视频,从 5 个维度独立判断哪个更好。")
714
+ vp_stats = gr.Markdown("")
715
+ vp_state = gr.State({})
716
+ with gr.Row():
717
+ vp_name = gr.Textbox(label="标注员名字", scale=4)
718
+ vp_login = gr.Button("开始标注", variant="primary", scale=1)
719
+ vp_status = gr.Markdown("")
720
+ with gr.Row(equal_height=True):
721
+ with gr.Column(scale=1, min_width=360):
722
+ gr.Markdown("### 视频 A")
723
+ vp_video_l = gr.HTML("<p>请先登录</p>")
724
+ with gr.Column(scale=1, min_width=360):
725
+ gr.Markdown("### 视频 B")
726
+ vp_video_r = gr.HTML("<p>请先登录</p>")
727
+ vp_info = gr.Markdown("")
728
+ vp_prompt = gr.Textbox(label="Prompt / 文本描述", lines=8, elem_id="prompt_box")
729
+ gr.Markdown("---\n### 请对以下每个维度分别判断:")
730
+ vp_q1 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 1", visible=False)
731
+ vp_q2 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 2", visible=False)
732
+ vp_q3 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 3", visible=False)
733
+ vp_q4 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 4", visible=False)
734
+ vp_q5 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 5", visible=False)
735
+ vp_note = gr.Textbox(label="备注(可选)", lines=1)
736
+ with gr.Row():
737
+ vp_submit = gr.Button("✅ 提交并下一组", variant="primary")
738
+ vp_skip = gr.Button("⏭️ 跳过")
739
+ vp_outs = [vp_state, vp_status, vp_video_l, vp_video_r, vp_info, vp_prompt,
740
+ vp_q1, vp_q2, vp_q3, vp_q4, vp_q5, vp_note, vp_stats]
741
+ vp_login.click(v_pairwise_start, [vp_name, vp_state], vp_outs)
742
+ vp_name.submit(v_pairwise_start, [vp_name, vp_state], vp_outs)
743
+ vp_submit.click(v_pairwise_submit,
744
+ [vp_state, vp_q1, vp_q2, vp_q3, vp_q4, vp_q5, vp_note], vp_outs)
745
+ vp_skip.click(v_pairwise_skip, [vp_state], vp_outs)
746
+
747
+ # ───── A Pairwise ─────
748
+ with gr.Tab("MBench-A Pairwise"):
749
+ gr.Markdown("## 🌍 MBench-A — 世界模型双视频对比 (≤6 维度)\n\n"
750
+ "比较两个世界模型的长视频(25 秒),评估相机运动结束后的记忆一致性。")
751
  a_stats = gr.Markdown("")
752
  a_state = gr.State({})
 
753
  with gr.Row():
754
+ a_name = gr.Textbox(label="标注员名字", scale=4)
755
  a_login = gr.Button("开始标注", variant="primary", scale=1)
 
756
  a_status = gr.Markdown("")
 
 
757
  a_aux = gr.HTML("")
 
 
758
  with gr.Row(equal_height=True):
759
  with gr.Column(scale=1, min_width=360):
760
  gr.Markdown("### 视频 A")
761
+ a_video_l = gr.HTML("<p>请先登录</p>")
762
  with gr.Column(scale=1, min_width=360):
763
  gr.Markdown("### 视频 B")
764
+ a_video_r = gr.HTML("<p>请先登录</p>")
765
+ a_info = gr.Markdown("")
 
 
 
 
766
  gr.Markdown("---\n### 请对以下每个维度分别判断:")
767
  a_q1 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 1", visible=False)
768
  a_q2 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 2", visible=False)
 
770
  a_q4 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 4", visible=False)
771
  a_q5 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 5", visible=False)
772
  a_q6 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 6", visible=False)
 
773
  a_note = gr.Textbox(label="备注(可选)", lines=1)
 
774
  with gr.Row():
775
  a_submit = gr.Button("✅ 提交并下一组", variant="primary")
776
  a_skip = gr.Button("⏭️ 跳过")
777
+ a_outs = [a_state, a_status, a_aux, a_video_l, a_video_r, a_info,
778
+ a_q1, a_q2, a_q3, a_q4, a_q5, a_q6, a_note, a_stats]
779
+ a_login.click(a_start, [a_name, a_state], a_outs)
780
+ a_name.submit(a_start, [a_name, a_state], a_outs)
781
+ a_submit.click(a_submit,
782
+ [a_state, a_q1, a_q2, a_q3, a_q4, a_q5, a_q6, a_note], a_outs)
783
+ a_skip.click(a_skip, [a_state], a_outs)
 
 
 
784
 
785
  # ---------------------------------------------------------------------------
786
  # Video proxy
 
795
  _video_client = httpx.AsyncClient(timeout=30.0, follow_redirects=True)
796
 
797
  async def _do_proxy(upstream: str, request: Request):
 
798
  req_headers = {}
799
  if (rng := request.headers.get("range")):
800
  req_headers["range"] = rng
 
805
  )
806
  except Exception as e:
807
  raise HTTPException(502, f"upstream fetch failed: {e}")
808
+ passthrough = {}
809
  for h in ("content-type", "content-length", "accept-ranges",
810
  "content-range", "etag", "last-modified"):
811
  if h in upstream_resp.headers:
812
+ passthrough[h] = upstream_resp.headers[h]
813
+ passthrough.setdefault("content-type", "video/mp4")
814
+ passthrough["cache-control"] = "public, max-age=300"
815
 
816
  async def _body():
817
  try:
 
819
  yield chunk
820
  finally:
821
  await upstream_resp.aclose()
822
+ return StreamingResponse(_body(), status_code=upstream_resp.status_code, headers=passthrough)
823
 
824
async def _proxy_v_video(model: str, subset: str, sample_id: str, request: Request):
    """Proxy a request for one MBench-V video to its upstream URL.

    Raises ``HTTPException(404)`` when ``model`` is not in ``V_MODELS``.
    Any ``.mp4`` in ``sample_id`` is removed before the upstream URL is
    built with ``_v_video_hf_url``; the request is then handed to
    ``_do_proxy``.
    """
    if model not in V_MODELS:
        raise HTTPException(404, f"unknown V model: {model}")
    bare_id = sample_id.replace(".mp4", "")
    return await _do_proxy(_v_video_hf_url(model, subset, bare_id), request)
830
 
831
async def _proxy_a_video(model: str, subset: str, sample_id: str, condition_id: str, request: Request):
    """Proxy a request for one MBench-A video to its upstream URL.

    Raises ``HTTPException(404)`` when ``model`` is not in ``A_MODELS``.
    Any ``.mp4`` in ``condition_id`` is removed before the upstream URL is
    built with ``_a_video_hf_url``; the request is then handed to
    ``_do_proxy``.
    """
    if model not in A_MODELS:
        raise HTTPException(404, f"unknown A model: {model}")
    bare_condition = condition_id.replace(".mp4", "")
    target_url = _a_video_hf_url(model, subset, sample_id, bare_condition)
    return await _do_proxy(target_url, request)
837
 
838
# Keep a handle on the original factory so the wrapper can delegate to it.
_orig = _GradioApp.create_app


def _patched(*args, **kwargs):
    """Build the app via the original factory, then attach the video proxy routes."""
    app = _orig(*args, **kwargs)
    proxy_routes = (
        ("/video_v/{model}/{subset}/{sample_id}", _proxy_v_video),
        ("/video_a/{model}/{subset}/{sample_id}/{condition_id}", _proxy_a_video),
    )
    for route_path, handler in proxy_routes:
        app.add_api_route(route_path, handler,
                          methods=["GET", "HEAD"], include_in_schema=False)
    print("[ann-new] video proxy routes registered")
    return app


# Install the wrapper before launching so the app created by launch() carries
# the extra routes.
_GradioApp.create_app = staticmethod(_patched)

demo.queue(default_concurrency_limit=16).launch(ssr_mode=False)
sampling/new_task_pools.json ADDED
The diff for this file is too large to render. See raw diff