Spaces:
Sleeping
Sleeping
feat: migrate to MBench-V-new + MBench-A-New (V binary + V pairwise + A pairwise tabs)
Browse files- README.md +17 -14
- app.py +517 -634
- sampling/new_task_pools.json +0 -0
README.md
CHANGED
|
@@ -10,23 +10,26 @@ app_file: app.py
|
|
| 10 |
pinned: false
|
| 11 |
---
|
| 12 |
|
| 13 |
-
# MBench
|
| 14 |
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
- **Annotation sink (write)**: the same dataset repo, under `annotations/`. Submissions are batched by `CommitScheduler` and pushed every 5 minutes.
|
| 19 |
-
- **Models included (6)**: `causal_forcing`, `self_forcing`, `cosmos`, `helios`, `longlive`, `memflow`. `skyreels` and `longcat` are temporarily excluded because their 0422 generation is still in progress.
|
| 20 |
-
- **Tasks**: 584 task_ids × 6 models = **3504** `(model, task_id)` pairs.
|
| 21 |
|
| 22 |
-
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
2. Watch the video on the left; read the prompt and metadata in the middle.
|
| 26 |
-
3. Give a score (1–5) and an optional note on the right.
|
| 27 |
-
4. Click **Submit & Next** to move on. Your submissions are auto-committed every 5 min.
|
| 28 |
|
| 29 |
-
|
|
|
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
pinned: false
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# MBench Annotation Platform (NEW)
|
| 14 |
|
| 15 |
+
Adapted to the new dataset layout (`MBench-V-new` + `MBench-A-New`) on
|
| 16 |
+
[`studyOverflow/TempMemoryData`](https://huggingface.co/datasets/studyOverflow/TempMemoryData).
|
| 17 |
|
| 18 |
+
## Tabs
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
+
1. **MBench-V Binary** — single video, "is there a memory issue?" (yes/no)
|
| 21 |
+
2. **MBench-V Pairwise** — two T2V videos, 5 dimensions
|
| 22 |
+
3. **MBench-A Pairwise** — two world-model videos, ≤6 dimensions
|
| 23 |
|
| 24 |
+
## Annotation Sink
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
+
Submissions are pushed to `annotations-new/` on the dataset repo every 5 minutes via
|
| 27 |
+
`CommitScheduler`. Old `annotations/` is left untouched (legacy).
|
| 28 |
|
| 29 |
+
## Migrated Historical Data
|
| 30 |
+
|
| 31 |
+
`annotations-new/` already contains:
|
| 32 |
+
- `migrated_v_binary.jsonl` (642 records from old `ann_bc109d66.jsonl`)
|
| 33 |
+
- `migrated_a_pairwise.jsonl` (821 records from old `ann_mbench_a_*.jsonl`)
|
| 34 |
+
|
| 35 |
+
These are read on startup so existing annotators don't see already-completed tasks again.
|
app.py
CHANGED
|
@@ -1,18 +1,20 @@
|
|
| 1 |
"""
|
| 2 |
-
MBench Annotation Space —
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
-
|
| 11 |
-
-
|
| 12 |
-
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
| 16 |
"""
|
| 17 |
from __future__ import annotations
|
| 18 |
|
|
@@ -34,184 +36,107 @@ from huggingface_hub import CommitScheduler, HfApi, hf_hub_download, hf_hub_url
|
|
| 34 |
# ---------------------------------------------------------------------------
|
| 35 |
|
| 36 |
DATASET_REPO = "studyOverflow/TempMemoryData"
|
| 37 |
-
MERGED_JSON_PATH = "MBench-V/merged.json"
|
| 38 |
-
MODELS: list[str] = [
|
| 39 |
-
"causal_forcing",
|
| 40 |
-
"self_forcing",
|
| 41 |
-
"cosmos",
|
| 42 |
-
"helios",
|
| 43 |
-
"longlive",
|
| 44 |
-
"memflow",
|
| 45 |
-
"longcat",
|
| 46 |
-
"skyreels",
|
| 47 |
-
]
|
| 48 |
-
|
| 49 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
ANN_DIR = Path("annotations_local")
|
| 51 |
ANN_DIR.mkdir(exist_ok=True)
|
| 52 |
PROCESS_ID = uuid.uuid4().hex[:8]
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
ANN_FILE_MBENCH_A = ANN_DIR / f"ann_mbench_a_{PROCESS_ID}.jsonl"
|
| 58 |
|
| 59 |
COMMIT_INTERVAL_MIN = 5
|
| 60 |
PENDING_TIMEOUT_SEC = 30 * 60
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
("physical", "物理合理性", "哪个视频中的物理过程(水流/碰撞/变形等)更合理自然?"),
|
| 66 |
-
("prompt", "Prompt 忠实度", "哪个视频的内容更符合下方的文字描述?"),
|
| 67 |
-
]
|
| 68 |
-
PAIRWISE_SAMPLES_PER_PAIR = 30
|
| 69 |
-
|
| 70 |
-
# ---------------------------------------------------------------------------
|
| 71 |
-
# MBench-A Config
|
| 72 |
-
# ---------------------------------------------------------------------------
|
| 73 |
-
|
| 74 |
-
MBENCH_A_MODELS: list[str] = [
|
| 75 |
-
"hy_worldplay",
|
| 76 |
-
"infinite_world",
|
| 77 |
-
"lingbot_world",
|
| 78 |
-
"matrix_game_2",
|
| 79 |
-
"matrix_game_3",
|
| 80 |
-
"yume",
|
| 81 |
-
]
|
| 82 |
-
MBENCH_A_ANNOTATORS_PER_TASK = 3
|
| 83 |
-
MBENCH_A_CATEGORY_MAP = {
|
| 84 |
-
"environment": "Spatial_401f",
|
| 85 |
-
"object": "Spatial_401f",
|
| 86 |
-
"human": "Human_401f",
|
| 87 |
-
"causal": "Casual_401f",
|
| 88 |
-
}
|
| 89 |
-
MBENCH_A_GT_CATEGORY_MAP = {
|
| 90 |
-
"environment": "Spatial",
|
| 91 |
-
"object": "Spatial",
|
| 92 |
-
"human": "Human",
|
| 93 |
-
"causal": "Casual",
|
| 94 |
-
}
|
| 95 |
|
| 96 |
# ---------------------------------------------------------------------------
|
| 97 |
-
# Load
|
| 98 |
# ---------------------------------------------------------------------------
|
| 99 |
|
| 100 |
-
def
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
repo_id=DATASET_REPO,
|
| 104 |
-
filename=MERGED_JSON_PATH,
|
| 105 |
-
repo_type="dataset",
|
| 106 |
-
token=HF_TOKEN,
|
| 107 |
-
)
|
| 108 |
with open(local, encoding="utf-8") as f:
|
| 109 |
return json.load(f)
|
| 110 |
-
|
| 111 |
-
print(f"[mbench-ann] WARNING: Failed to load MBench-V data: {e}")
|
| 112 |
-
return []
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
if local_path.exists():
|
| 125 |
-
with open(local_path, encoding="utf-8") as f:
|
| 126 |
-
return json.load(f)
|
| 127 |
-
# Fallback: try HF
|
| 128 |
-
try:
|
| 129 |
-
local = hf_hub_download(
|
| 130 |
-
repo_id=DATASET_REPO,
|
| 131 |
-
filename="MBench-A/task_pool.json",
|
| 132 |
-
repo_type="dataset",
|
| 133 |
-
token=HF_TOKEN,
|
| 134 |
-
)
|
| 135 |
-
with open(local, encoding="utf-8") as f:
|
| 136 |
-
return json.load(f)
|
| 137 |
-
except Exception as e:
|
| 138 |
-
print(f"[mbench-ann] WARNING: Failed to load MBench-A task pool: {e}")
|
| 139 |
-
return {"tasks": [], "quality_control_tasks": [], "metadata": {}}
|
| 140 |
-
|
| 141 |
-
MBENCH_A_POOL = _load_mbench_a_pool()
|
| 142 |
-
MBENCH_A_TASKS: list[dict] = MBENCH_A_POOL.get("tasks", []) + MBENCH_A_POOL.get("quality_control_tasks", [])
|
| 143 |
-
MBENCH_A_TASK_BY_ID: dict[str, dict] = {t["task_id"]: t for t in MBENCH_A_TASKS}
|
| 144 |
|
| 145 |
# ---------------------------------------------------------------------------
|
| 146 |
-
#
|
| 147 |
# ---------------------------------------------------------------------------
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
|
| 152 |
-
def
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
print(f"[
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
# ---------------------------------------------------------------------------
|
| 170 |
-
# Video URL helpers
|
| 171 |
# ---------------------------------------------------------------------------
|
| 172 |
|
| 173 |
-
def
|
| 174 |
-
return f"/
|
| 175 |
|
| 176 |
-
def
|
| 177 |
return hf_hub_url(
|
| 178 |
DATASET_REPO,
|
| 179 |
-
filename=f"MBench-V/{model}/
|
| 180 |
repo_type="dataset",
|
| 181 |
)
|
| 182 |
|
| 183 |
-
def
|
| 184 |
-
|
| 185 |
-
category = MBENCH_A_CATEGORY_MAP[subset]
|
| 186 |
-
return f"/video_a/{model}/{category}/{sample_id}/left_then_right.mp4"
|
| 187 |
-
|
| 188 |
-
def _mbench_a_hf_video_url(model: str, category: str, sample_id: str) -> str:
|
| 189 |
-
"""Build HF upstream URL for MBench-A video."""
|
| 190 |
-
return hf_hub_url(
|
| 191 |
-
DATASET_REPO,
|
| 192 |
-
filename=f"MBench-A/{model}/{category}/{sample_id}/left_then_right.mp4",
|
| 193 |
-
repo_type="dataset",
|
| 194 |
-
)
|
| 195 |
|
| 196 |
-
def
|
| 197 |
-
"""Build HF URL for MBench-A assets."""
|
| 198 |
return hf_hub_url(
|
| 199 |
DATASET_REPO,
|
| 200 |
-
filename=f"MBench-A/
|
| 201 |
repo_type="dataset",
|
| 202 |
)
|
| 203 |
|
| 204 |
-
def
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
for level in ("level_3", "level_4", "level_2", "level_1"):
|
| 208 |
-
val = prompts.get(level)
|
| 209 |
-
if isinstance(val, list) and val:
|
| 210 |
-
n = len(val)
|
| 211 |
-
return "\n\n".join(f"— 第 {i}/{n} 段 —\n{seg}" for i, seg in enumerate(val, 1))
|
| 212 |
-
if isinstance(val, str) and val:
|
| 213 |
-
return val
|
| 214 |
-
return "(no prompt found)"
|
| 215 |
|
| 216 |
def _render_video_html(url: str) -> str:
|
| 217 |
return (
|
|
@@ -221,94 +146,7 @@ def _render_video_html(url: str) -> str:
|
|
| 221 |
)
|
| 222 |
|
| 223 |
# ---------------------------------------------------------------------------
|
| 224 |
-
#
|
| 225 |
-
# ---------------------------------------------------------------------------
|
| 226 |
-
|
| 227 |
-
def _render_mbench_a_aux(task: dict) -> str:
|
| 228 |
-
"""Render auxiliary HTML info based on task subset."""
|
| 229 |
-
subset = task["subset"]
|
| 230 |
-
|
| 231 |
-
# Use CSS class for guaranteed visibility (Gradio themes can override inline styles)
|
| 232 |
-
box = 'class="aux-info-box"'
|
| 233 |
-
|
| 234 |
-
# Camera motion info (shown for ALL subsets)
|
| 235 |
-
motion = task.get("camera_motion", "left_then_right")
|
| 236 |
-
motion_desc = task.get("camera_motion_description", motion)
|
| 237 |
-
gif_url = _mbench_a_asset_hf_url(f"camera_diagrams/{motion}.gif")
|
| 238 |
-
camera_html = (
|
| 239 |
-
f'<div style="flex:0 0 200px">'
|
| 240 |
-
f'<p><b>🎬 预期相机运动</b></p>'
|
| 241 |
-
f'<p style="margin:0 0 8px">{motion_desc}</p>'
|
| 242 |
-
f'<img src="{gif_url}" style="width:180px">'
|
| 243 |
-
f'</div>'
|
| 244 |
-
)
|
| 245 |
-
|
| 246 |
-
# Caption (shown for ALL subsets now)
|
| 247 |
-
caption = task.get("caption", "")
|
| 248 |
-
caption_html = ""
|
| 249 |
-
if caption:
|
| 250 |
-
caption_html = (
|
| 251 |
-
f'<div style="flex:1;min-width:250px">'
|
| 252 |
-
f'<p><b>📝 场景描述</b></p>'
|
| 253 |
-
f'<p style="font-size:14px;line-height:1.5">{caption}</p>'
|
| 254 |
-
f'</div>'
|
| 255 |
-
)
|
| 256 |
-
|
| 257 |
-
if subset == "object":
|
| 258 |
-
sample_id = task["sample_id"]
|
| 259 |
-
mask_url = _mbench_a_asset_hf_url(f"mask_viz/{sample_id}.png")
|
| 260 |
-
return (
|
| 261 |
-
f'<div {box}>'
|
| 262 |
-
f'<p><b>🎯 请关注画面中被标注(高亮)的物体</b></p>'
|
| 263 |
-
f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start;margin-top:8px">'
|
| 264 |
-
f'<div style="flex:1;min-width:300px">'
|
| 265 |
-
f'<img src="{mask_url}" style="max-width:100%;max-height:280px">'
|
| 266 |
-
f'</div>'
|
| 267 |
-
f'{camera_html}'
|
| 268 |
-
f'{caption_html}'
|
| 269 |
-
f'</div></div>'
|
| 270 |
-
)
|
| 271 |
-
|
| 272 |
-
elif subset == "causal":
|
| 273 |
-
return (
|
| 274 |
-
f'<div {box}>'
|
| 275 |
-
f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start">'
|
| 276 |
-
f'{camera_html}'
|
| 277 |
-
f'{caption_html}'
|
| 278 |
-
f'</div></div>'
|
| 279 |
-
)
|
| 280 |
-
|
| 281 |
-
elif subset == "human":
|
| 282 |
-
return (
|
| 283 |
-
f'<div {box}>'
|
| 284 |
-
f'<p><b>👤 请关注视频中的人物</b>:观察人物离开画面再回来后,面部和外观是否保持一致。</p>'
|
| 285 |
-
f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start;margin-top:8px">'
|
| 286 |
-
f'{camera_html}'
|
| 287 |
-
f'{caption_html}'
|
| 288 |
-
f'</div></div>'
|
| 289 |
-
)
|
| 290 |
-
|
| 291 |
-
else: # environment
|
| 292 |
-
return (
|
| 293 |
-
f'<div {box}>'
|
| 294 |
-
f'<p><b>🏞️ 请关注整体场景</b>:观察相机转回来后,场景的布局、风格、光照是否保持一致。</p>'
|
| 295 |
-
f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start;margin-top:8px">'
|
| 296 |
-
f'{camera_html}'
|
| 297 |
-
f'{caption_html}'
|
| 298 |
-
f'</div></div>'
|
| 299 |
-
)
|
| 300 |
-
return (
|
| 301 |
-
f'<div {box}>'
|
| 302 |
-
f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start">'
|
| 303 |
-
f'<div style="flex:1;min-width:250px">'
|
| 304 |
-
f'<p><b>🏞️ 请关注整体场景</b>:观察相机转回来后,场景的布局、风格、光照是否保持一致。</p>'
|
| 305 |
-
f'</div>'
|
| 306 |
-
f'{camera_html}'
|
| 307 |
-
f'</div></div>'
|
| 308 |
-
)
|
| 309 |
-
|
| 310 |
-
# ---------------------------------------------------------------------------
|
| 311 |
-
# CommitScheduler
|
| 312 |
# ---------------------------------------------------------------------------
|
| 313 |
|
| 314 |
scheduler: CommitScheduler | None = None
|
|
@@ -317,7 +155,7 @@ if HF_TOKEN:
|
|
| 317 |
repo_id=DATASET_REPO,
|
| 318 |
repo_type="dataset",
|
| 319 |
folder_path=str(ANN_DIR),
|
| 320 |
-
path_in_repo="annotations",
|
| 321 |
every=COMMIT_INTERVAL_MIN,
|
| 322 |
token=HF_TOKEN,
|
| 323 |
private=False,
|
|
@@ -325,20 +163,21 @@ if HF_TOKEN:
|
|
| 325 |
)
|
| 326 |
|
| 327 |
# ---------------------------------------------------------------------------
|
| 328 |
-
#
|
| 329 |
# ---------------------------------------------------------------------------
|
| 330 |
|
| 331 |
-
def
|
| 332 |
-
records
|
| 333 |
try:
|
| 334 |
api = HfApi(token=HF_TOKEN)
|
| 335 |
files = api.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset")
|
| 336 |
except Exception:
|
| 337 |
return records
|
| 338 |
-
jsonls = [p for p in files if p.startswith("annotations/") and p.endswith(".jsonl")]
|
| 339 |
for path in jsonls:
|
| 340 |
try:
|
| 341 |
-
local = hf_hub_download(repo_id=DATASET_REPO, filename=path,
|
|
|
|
| 342 |
with open(local, encoding="utf-8") as f:
|
| 343 |
for line in f:
|
| 344 |
line = line.strip()
|
|
@@ -351,7 +190,8 @@ def _fetch_remote_annotations() -> list[dict[str, Any]]:
|
|
| 351 |
pass
|
| 352 |
return records
|
| 353 |
|
| 354 |
-
HISTORICAL =
|
|
|
|
| 355 |
|
| 356 |
# ---------------------------------------------------------------------------
|
| 357 |
# Shared state
|
|
@@ -359,49 +199,43 @@ HISTORICAL = _fetch_remote_annotations()
|
|
| 359 |
|
| 360 |
STATE_LOCK = threading.Lock()
|
| 361 |
|
| 362 |
-
#
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
and (r["model"], r["task_id"]) in BINARY_POOL_SET
|
| 367 |
-
}
|
| 368 |
-
BINARY_PENDING: dict[tuple[str, str], tuple[str, float]] = {}
|
| 369 |
-
|
| 370 |
-
# MBench-V Pairwise state
|
| 371 |
-
PAIRWISE_SUBMITTED: set[tuple[str, str, str, str]] = {
|
| 372 |
-
(r["task_id"], r["model_a"], r["model_b"], r["dimension"])
|
| 373 |
-
for r in HISTORICAL
|
| 374 |
-
if r.get("type") == "pairwise"
|
| 375 |
-
and all(k in r for k in ("task_id", "model_a", "model_b", "dimension"))
|
| 376 |
-
}
|
| 377 |
-
PAIRWISE_PENDING: dict[tuple[str, str, str, str], tuple[str, float]] = {}
|
| 378 |
|
| 379 |
-
# MBench-A state: task_id -> list of annotators who completed it
|
| 380 |
-
MBENCH_A_COMPLETED: dict[str, list[str]] = defaultdict(list)
|
| 381 |
for r in HISTORICAL:
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
|
| 388 |
-
|
|
|
|
|
|
|
| 389 |
|
| 390 |
-
print(f"[
|
| 391 |
-
print(f"[
|
| 392 |
-
print(f"[
|
| 393 |
|
| 394 |
# ---------------------------------------------------------------------------
|
| 395 |
-
#
|
| 396 |
# ---------------------------------------------------------------------------
|
| 397 |
|
| 398 |
-
def _reap_expired(
|
| 399 |
now = time.time()
|
| 400 |
-
expired = [k for k, (_, ts) in
|
| 401 |
for k in expired:
|
| 402 |
-
|
| 403 |
|
| 404 |
-
def
|
| 405 |
line = json.dumps(record, ensure_ascii=False)
|
| 406 |
if scheduler is not None:
|
| 407 |
with scheduler.lock:
|
|
@@ -411,394 +245,422 @@ def _append_annotation(record: dict[str, Any], ann_file: Path) -> None:
|
|
| 411 |
with ann_file.open("a", encoding="utf-8") as f:
|
| 412 |
f.write(line + "\n")
|
| 413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
# ---------------------------------------------------------------------------
|
| 415 |
-
#
|
| 416 |
# ---------------------------------------------------------------------------
|
| 417 |
|
| 418 |
-
def
|
| 419 |
annotator = (annotator or "").strip()
|
| 420 |
if not annotator:
|
| 421 |
-
return state, "<p>请
|
| 422 |
-
order = list(range(len(
|
| 423 |
random.shuffle(order)
|
| 424 |
-
|
| 425 |
-
|
|
|
|
|
|
|
|
|
|
| 426 |
|
| 427 |
-
def
|
| 428 |
annotator = state["annotator"]
|
| 429 |
order = state["order"]
|
| 430 |
idx = state.get("idx", 0)
|
| 431 |
with STATE_LOCK:
|
| 432 |
-
_reap_expired(
|
| 433 |
while idx < len(order):
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
continue
|
| 438 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
state["idx"] = idx
|
| 440 |
-
state["current"] =
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
state["current"] = None
|
| 450 |
-
return state, "<p>🎉 全部完成!</p>", "
|
| 451 |
|
| 452 |
-
def
|
| 453 |
if not state or not state.get("current"):
|
| 454 |
-
return state, "<p>请先登录</p>", "", "", "
|
| 455 |
-
|
| 456 |
-
|
| 457 |
record = {
|
| 458 |
-
"type": "
|
| 459 |
"timestamp": time.time(),
|
| 460 |
"annotator": state["annotator"],
|
| 461 |
-
"
|
| 462 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 463 |
"memory_issue": verdict == "是",
|
| 464 |
"verdict": verdict,
|
| 465 |
"note": (note or "").strip(),
|
| 466 |
}
|
| 467 |
-
|
| 468 |
with STATE_LOCK:
|
| 469 |
-
|
| 470 |
-
|
| 471 |
state["count"] = state.get("count", 0) + 1
|
| 472 |
state["idx"] = state["idx"] + 1
|
| 473 |
state["current"] = None
|
| 474 |
-
|
| 475 |
-
return
|
| 476 |
|
| 477 |
-
def
|
| 478 |
if not state or not state.get("current"):
|
| 479 |
-
return state, "
|
| 480 |
-
|
| 481 |
with STATE_LOCK:
|
| 482 |
-
|
| 483 |
state["idx"] = state["idx"] + 1
|
| 484 |
state["current"] = None
|
| 485 |
-
|
| 486 |
-
return
|
| 487 |
|
| 488 |
# ---------------------------------------------------------------------------
|
| 489 |
-
#
|
| 490 |
# ---------------------------------------------------------------------------
|
| 491 |
|
| 492 |
-
def
|
| 493 |
annotator = (annotator or "").strip()
|
| 494 |
if not annotator:
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
|
|
|
|
|
|
| 498 |
random.shuffle(order)
|
| 499 |
-
state = {
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
}
|
| 503 |
-
return _pairwise_next(state)
|
| 504 |
|
| 505 |
-
def
|
| 506 |
annotator = state["annotator"]
|
| 507 |
-
dim_pool = state["dim_pool"]
|
| 508 |
order = state["order"]
|
| 509 |
idx = state.get("idx", 0)
|
| 510 |
-
dimension = state["dimension"]
|
| 511 |
-
dim_label = dimension
|
| 512 |
-
dim_question = ""
|
| 513 |
-
for dk, dl, dq in PAIRWISE_DIMENSIONS:
|
| 514 |
-
if dk == dimension:
|
| 515 |
-
dim_label = dl
|
| 516 |
-
dim_question = dq
|
| 517 |
-
break
|
| 518 |
with STATE_LOCK:
|
| 519 |
-
_reap_expired(
|
| 520 |
while idx < len(order):
|
| 521 |
-
|
| 522 |
-
tid
|
| 523 |
-
if
|
| 524 |
-
idx += 1
|
| 525 |
-
|
| 526 |
-
|
|
|
|
|
|
|
|
|
|
| 527 |
state["idx"] = idx
|
| 528 |
-
state["current"] =
|
|
|
|
|
|
|
| 529 |
if random.random() < 0.5:
|
| 530 |
-
|
| 531 |
-
state["swapped"] = False
|
| 532 |
else:
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
prompt =
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
state["current"] = None
|
| 545 |
-
|
|
|
|
|
|
|
| 546 |
|
| 547 |
-
def
|
| 548 |
if not state or not state.get("current"):
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
record = {
|
| 560 |
-
"type": "
|
| 561 |
"timestamp": time.time(),
|
| 562 |
"annotator": state["annotator"],
|
| 563 |
"task_id": tid,
|
| 564 |
-
"
|
| 565 |
-
"
|
| 566 |
-
"
|
| 567 |
-
"
|
| 568 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
"swapped": swapped,
|
| 570 |
"note": (note or "").strip(),
|
| 571 |
}
|
| 572 |
-
|
| 573 |
with STATE_LOCK:
|
| 574 |
-
|
| 575 |
-
|
| 576 |
state["count"] = state.get("count", 0) + 1
|
| 577 |
state["idx"] = state["idx"] + 1
|
| 578 |
state["current"] = None
|
| 579 |
-
|
| 580 |
-
return result[0], result[1], result[2], result[3], result[4], f"✅ 已提交第 {state['count']} 条", result[6]
|
| 581 |
|
| 582 |
-
def
|
| 583 |
if not state or not state.get("current"):
|
| 584 |
-
|
| 585 |
-
|
|
|
|
|
|
|
| 586 |
with STATE_LOCK:
|
| 587 |
-
|
| 588 |
state["idx"] = state["idx"] + 1
|
| 589 |
state["current"] = None
|
| 590 |
-
|
| 591 |
-
return result[0], result[1], result[2], result[3], result[4], "⏭️ 已跳过", result[6]
|
| 592 |
|
| 593 |
# ---------------------------------------------------------------------------
|
| 594 |
-
#
|
| 595 |
# ---------------------------------------------------------------------------
|
| 596 |
|
| 597 |
-
def
|
| 598 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
annotator = (annotator or "").strip()
|
| 600 |
if not annotator:
|
|
|
|
| 601 |
return (state, "⚠️ 请输入名字", "", "", "", "",
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
"", "")
|
| 606 |
-
# Count how many tasks this annotator has already completed.
|
| 607 |
-
# Check both:
|
| 608 |
-
# 1. MBENCH_A_COMPLETED (loaded from HF at startup + updated in-memory during this session)
|
| 609 |
-
# 2. The local annotation file (captures annotations made this session before any push)
|
| 610 |
-
historical_count = sum(
|
| 611 |
-
1 for anns in MBENCH_A_COMPLETED.values()
|
| 612 |
-
if annotator in anns
|
| 613 |
-
)
|
| 614 |
-
# Also scan the local file in case this session's annotations haven't been pushed yet
|
| 615 |
-
if ANN_FILE_MBENCH_A.exists():
|
| 616 |
-
with ANN_FILE_MBENCH_A.open() as f:
|
| 617 |
-
for line in f:
|
| 618 |
-
line = line.strip()
|
| 619 |
-
if not line:
|
| 620 |
-
continue
|
| 621 |
-
try:
|
| 622 |
-
r = json.loads(line)
|
| 623 |
-
if r.get("annotator") == annotator and r.get("type") == "pairwise_mbench_a":
|
| 624 |
-
tid = r.get("task_id", "")
|
| 625 |
-
# Only count if not already counted in MBENCH_A_COMPLETED
|
| 626 |
-
if tid in MBENCH_A_TASK_BY_ID and annotator not in MBENCH_A_COMPLETED.get(tid, []):
|
| 627 |
-
historical_count += 1
|
| 628 |
-
except Exception:
|
| 629 |
-
pass
|
| 630 |
-
|
| 631 |
-
# Shuffle task order for this annotator
|
| 632 |
-
order = list(range(len(MBENCH_A_TASKS)))
|
| 633 |
random.shuffle(order)
|
| 634 |
-
state = {
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
"idx": 0,
|
| 638 |
-
"current_task_id": None,
|
| 639 |
-
"swapped": False,
|
| 640 |
-
"left_model": None,
|
| 641 |
-
"right_model": None,
|
| 642 |
-
"count": historical_count,
|
| 643 |
-
}
|
| 644 |
-
return _mbench_a_next(state)
|
| 645 |
-
|
| 646 |
|
| 647 |
-
def
|
| 648 |
-
"""Find and load the next available MBench-A task."""
|
| 649 |
annotator = state["annotator"]
|
| 650 |
order = state["order"]
|
| 651 |
idx = state.get("idx", 0)
|
| 652 |
-
|
| 653 |
with STATE_LOCK:
|
| 654 |
-
_reap_expired(
|
| 655 |
while idx < len(order):
|
| 656 |
-
task =
|
| 657 |
tid = task["task_id"]
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
if
|
| 661 |
-
idx += 1
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
idx += 1
|
| 666 |
-
continue
|
| 667 |
-
# Skip if currently pending by someone else
|
| 668 |
-
if tid in MBENCH_A_PENDING and MBENCH_A_PENDING[tid][0] != annotator:
|
| 669 |
-
idx += 1
|
| 670 |
-
continue
|
| 671 |
-
|
| 672 |
-
# Assign this task
|
| 673 |
-
MBENCH_A_PENDING[tid] = (annotator, time.time())
|
| 674 |
state["idx"] = idx
|
| 675 |
-
state["
|
| 676 |
|
| 677 |
-
|
| 678 |
-
m_a, m_b = task["model_a"], task["model_b"]
|
| 679 |
if random.random() < 0.5:
|
| 680 |
-
|
| 681 |
-
state["swapped"] = False
|
| 682 |
else:
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
video_right = _render_video_html(
|
| 691 |
-
_mbench_a_video_proxy_url(state["right_model"], subset, task["sample_id"]))
|
| 692 |
-
|
| 693 |
-
aux_html = _render_mbench_a_aux(task)
|
| 694 |
|
| 695 |
-
# Dimension questions
|
| 696 |
dimensions = task["dimensions"]
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
# Build question radio updates (max 5)
|
| 700 |
q_updates = []
|
| 701 |
for i in range(6):
|
| 702 |
if i < len(dimensions):
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
q_updates.append(gr.update(
|
| 706 |
-
visible=True,
|
| 707 |
-
label=question_text,
|
| 708 |
-
value="差不多",
|
| 709 |
-
))
|
| 710 |
else:
|
| 711 |
q_updates.append(gr.update(visible=False, value="差不多"))
|
| 712 |
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
f"**已提交**: {state['count']}")
|
| 720 |
-
stats = (f"全局进度: {n_done}/{len(MBENCH_A_TASKS)} tasks 完成 | "
|
| 721 |
-
f"你已标注: {state['count']}")
|
| 722 |
-
|
| 723 |
-
return (state, "✅ 已加载", aux_html, video_left, video_right, meta,
|
| 724 |
*q_updates, "", stats)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 725 |
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
return (state, "🎉 全部完成!", "", "<p>所有任务已完成</p>", "", "全部完成",
|
| 730 |
-
empty_q, empty_q, empty_q, empty_q, empty_q, empty_q, "", "")
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
def mbench_a_submit(state, q1_val, q2_val, q3_val, q4_val, q5_val, q6_val, note):
|
| 734 |
-
"""Submit MBench-A multi-dimension annotation."""
|
| 735 |
-
if not state or not state.get("current_task_id"):
|
| 736 |
-
empty_q = gr.update(visible=False, value="差不多")
|
| 737 |
return (state, "⚠️ 请先登录", "", "", "", "",
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
task = MBENCH_A_TASK_BY_ID[tid]
|
| 742 |
-
dimensions = task["dimensions"]
|
| 743 |
swapped = state["swapped"]
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
# Map verdicts to winners
|
| 747 |
-
verdicts = [q1_val, q2_val, q3_val, q4_val, q5_val, q6_val]
|
| 748 |
dim_results = {}
|
| 749 |
-
for i,
|
| 750 |
v = verdicts[i]
|
| 751 |
if v == "A更好":
|
| 752 |
-
|
| 753 |
-
winner = m_b if swapped else m_a
|
| 754 |
elif v == "B更好":
|
| 755 |
-
winner =
|
| 756 |
else:
|
| 757 |
winner = "tie"
|
| 758 |
-
dim_results[
|
| 759 |
|
|
|
|
|
|
|
| 760 |
record = {
|
| 761 |
-
"type": "
|
| 762 |
"timestamp": time.time(),
|
| 763 |
"annotator": state["annotator"],
|
| 764 |
"task_id": tid,
|
|
|
|
| 765 |
"subset": task["subset"],
|
| 766 |
"sample_id": task["sample_id"],
|
| 767 |
-
"
|
| 768 |
-
"model_a":
|
| 769 |
-
"model_b":
|
|
|
|
|
|
|
|
|
|
| 770 |
"dimensions": dim_results,
|
| 771 |
"swapped": swapped,
|
| 772 |
"note": (note or "").strip(),
|
| 773 |
}
|
| 774 |
-
|
| 775 |
-
|
| 776 |
with STATE_LOCK:
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
state["count"] = state.get("count", 0) + 1
|
| 781 |
state["idx"] = state["idx"] + 1
|
| 782 |
-
state["
|
| 783 |
-
|
| 784 |
-
return _mbench_a_next(state)
|
| 785 |
-
|
| 786 |
|
| 787 |
-
def
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
empty_q = gr.update(visible=False, value="差不多")
|
| 791 |
return (state, "⚠️ 请先登录", "", "", "", "",
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
tid = state["current_task_id"]
|
| 795 |
with STATE_LOCK:
|
| 796 |
-
|
| 797 |
-
|
| 798 |
state["idx"] = state["idx"] + 1
|
| 799 |
-
state["
|
| 800 |
-
return
|
| 801 |
-
|
| 802 |
|
| 803 |
# ---------------------------------------------------------------------------
|
| 804 |
# UI
|
|
@@ -806,62 +668,101 @@ def mbench_a_skip(state):
|
|
| 806 |
|
| 807 |
CUSTOM_CSS = """
|
| 808 |
#prompt_box textarea { height: 300px !important; overflow-y: auto !important; }
|
| 809 |
-
.video-pair { display: flex; gap: 12px; }
|
| 810 |
-
.video-pair > div { flex: 1; }
|
| 811 |
-
/* Force aux info box to be visible regardless of Gradio theme */
|
| 812 |
.aux-info-box {
|
| 813 |
-
background: #e3e8ef !important;
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
border-radius: 8px !important;
|
| 817 |
-
margin-bottom: 12px !important;
|
| 818 |
-
border: 1px solid #b0b8c4 !important;
|
| 819 |
-
}
|
| 820 |
-
.aux-info-box * {
|
| 821 |
-
color: #111 !important;
|
| 822 |
-
}
|
| 823 |
-
.aux-info-box img {
|
| 824 |
-
border: 1px solid #999;
|
| 825 |
-
border-radius: 4px;
|
| 826 |
}
|
|
|
|
|
|
|
| 827 |
"""
|
| 828 |
|
| 829 |
-
with gr.Blocks(title="MBench 标注", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
|
| 830 |
-
gr.Markdown("# 🎬 MBench 视频标注平台")
|
| 831 |
|
| 832 |
with gr.Tabs():
|
| 833 |
-
#
|
| 834 |
-
with gr.Tab("MBench-
|
| 835 |
-
gr.Markdown(
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 840 |
a_stats = gr.Markdown("")
|
| 841 |
a_state = gr.State({})
|
| 842 |
-
|
| 843 |
with gr.Row():
|
| 844 |
-
a_name = gr.Textbox(label="标注员名字",
|
| 845 |
a_login = gr.Button("开始标注", variant="primary", scale=1)
|
| 846 |
-
|
| 847 |
a_status = gr.Markdown("")
|
| 848 |
-
|
| 849 |
-
# Auxiliary info (mask image / camera GIF + caption / instructions)
|
| 850 |
a_aux = gr.HTML("")
|
| 851 |
-
|
| 852 |
-
# Video pair
|
| 853 |
with gr.Row(equal_height=True):
|
| 854 |
with gr.Column(scale=1, min_width=360):
|
| 855 |
gr.Markdown("### 视频 A")
|
| 856 |
-
|
| 857 |
with gr.Column(scale=1, min_width=360):
|
| 858 |
gr.Markdown("### 视频 B")
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
# Task info
|
| 862 |
-
a_meta = gr.Markdown("")
|
| 863 |
-
|
| 864 |
-
# Multi-dimension questions (max 6, dynamically shown/hidden)
|
| 865 |
gr.Markdown("---\n### 请对以下每个维度分别判断:")
|
| 866 |
a_q1 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 1", visible=False)
|
| 867 |
a_q2 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 2", visible=False)
|
|
@@ -869,22 +770,17 @@ with gr.Blocks(title="MBench 标注", theme=gr.themes.Soft(), css=CUSTOM_CSS) as
|
|
| 869 |
a_q4 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 4", visible=False)
|
| 870 |
a_q5 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 5", visible=False)
|
| 871 |
a_q6 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 6", visible=False)
|
| 872 |
-
|
| 873 |
a_note = gr.Textbox(label="备注(可选)", lines=1)
|
| 874 |
-
|
| 875 |
with gr.Row():
|
| 876 |
a_submit = gr.Button("✅ 提交并下一组", variant="primary")
|
| 877 |
a_skip = gr.Button("⏭️ 跳过")
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
a_submit.click(mbench_a_submit,
|
| 886 |
-
[a_state, a_q1, a_q2, a_q3, a_q4, a_q5, a_q6, a_note], a_all_outs)
|
| 887 |
-
a_skip.click(mbench_a_skip, [a_state], a_all_outs)
|
| 888 |
|
| 889 |
# ---------------------------------------------------------------------------
|
| 890 |
# Video proxy
|
|
@@ -899,7 +795,6 @@ if __name__ == "__main__":
|
|
| 899 |
_video_client = httpx.AsyncClient(timeout=30.0, follow_redirects=True)
|
| 900 |
|
| 901 |
async def _do_proxy(upstream: str, request: Request):
|
| 902 |
-
"""Generic proxy for HF video/asset URLs."""
|
| 903 |
req_headers = {}
|
| 904 |
if (rng := request.headers.get("range")):
|
| 905 |
req_headers["range"] = rng
|
|
@@ -910,13 +805,13 @@ if __name__ == "__main__":
|
|
| 910 |
)
|
| 911 |
except Exception as e:
|
| 912 |
raise HTTPException(502, f"upstream fetch failed: {e}")
|
| 913 |
-
|
| 914 |
for h in ("content-type", "content-length", "accept-ranges",
|
| 915 |
"content-range", "etag", "last-modified"):
|
| 916 |
if h in upstream_resp.headers:
|
| 917 |
-
|
| 918 |
-
|
| 919 |
-
|
| 920 |
|
| 921 |
async def _body():
|
| 922 |
try:
|
|
@@ -924,43 +819,31 @@ if __name__ == "__main__":
|
|
| 924 |
yield chunk
|
| 925 |
finally:
|
| 926 |
await upstream_resp.aclose()
|
|
|
|
| 927 |
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
|
| 931 |
-
|
| 932 |
-
|
| 933 |
-
raise HTTPException(404, "unknown (model, task_id)")
|
| 934 |
-
upstream = _hf_video_url(model, task_id)
|
| 935 |
return await _do_proxy(upstream, request)
|
| 936 |
|
| 937 |
-
async def
|
| 938 |
-
|
| 939 |
-
if model not in
|
| 940 |
-
raise HTTPException(404, f"unknown model: {model}")
|
| 941 |
-
upstream =
|
| 942 |
return await _do_proxy(upstream, request)
|
| 943 |
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
|
| 947 |
-
app
|
| 948 |
-
|
| 949 |
-
app.add_api_route(
|
| 950 |
-
|
| 951 |
-
|
| 952 |
-
methods=["GET", "HEAD"],
|
| 953 |
-
include_in_schema=False,
|
| 954 |
-
)
|
| 955 |
-
# MBench-A video proxy
|
| 956 |
-
app.add_api_route(
|
| 957 |
-
"/video_a/{model}/{category}/{sample_id}/left_then_right.mp4",
|
| 958 |
-
_proxy_mbench_a_video,
|
| 959 |
-
methods=["GET", "HEAD"],
|
| 960 |
-
include_in_schema=False,
|
| 961 |
-
)
|
| 962 |
-
print("[mbench-ann] video proxy routes registered (MBench-V + MBench-A)")
|
| 963 |
return app
|
|
|
|
| 964 |
|
| 965 |
-
_GradioApp.create_app = staticmethod(_patched_create_app)
|
| 966 |
demo.queue(default_concurrency_limit=16).launch(ssr_mode=False)
|
|
|
|
| 1 |
"""
|
| 2 |
+
MBench Annotation Space (NEW) — adapted for MBench-V-new + MBench-A-New.
|
| 3 |
+
|
| 4 |
+
Tabs:
|
| 5 |
+
1. MBench-V Binary ─ "该视频是否出现了记忆问题?" (单视频, 1 标注员/任务)
|
| 6 |
+
2. MBench-V Pairwise ─ 双视频, 5 维度对比 (3 标注员/任务)
|
| 7 |
+
3. MBench-A Pairwise ─ 双视频, ≤6 维度对比 (3 标注员/任务)
|
| 8 |
+
|
| 9 |
+
Data sources:
|
| 10 |
+
- Videos: streamed from studyOverflow/TempMemoryData (MBench-V-new + MBench-A-New).
|
| 11 |
+
- Task pools: sampling/new_task_pools.json
|
| 12 |
+
- Sample metadata: sample.json under MBench-V-new/samples/{subset}/{sid}/ and MBench-A-New/samples/{subset}/{sid}/
|
| 13 |
+
- Annotation sink: annotations-new/ on the dataset repo (CommitScheduler, 5 min cadence).
|
| 14 |
+
|
| 15 |
+
Notes:
|
| 16 |
+
- All paths use the new structure (subset names: environment/object/human/causal).
|
| 17 |
+
- Old annotations in annotations/ are preserved; this app writes only to annotations-new/.
|
| 18 |
"""
|
| 19 |
from __future__ import annotations
|
| 20 |
|
|
|
|
| 36 |
# ---------------------------------------------------------------------------
|
| 37 |
|
| 38 |
# Hugging Face dataset repo used for videos, sample metadata and annotations.
DATASET_REPO = "studyOverflow/TempMemoryData"
# Access token taken from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Model identifiers for the MBench-V tabs.
V_MODELS = [
    "causal_forcing",
    "self_forcing",
    "cosmos",
    "helios",
    "longlive",
    "memflow",
    "skyreels",
    "longcat",
]
# Model identifiers for the MBench-A tab.
A_MODELS = [
    "hy_worldplay",
    "infinite_world",
    "lingbot_world",
    "matrix_game_2",
    "matrix_game_3",
    "yume",
]

# Local folder where annotation JSONL files are written.
ANN_DIR = Path("annotations_local")
ANN_DIR.mkdir(exist_ok=True)
# Short random id so each process writes to its own set of files.
PROCESS_ID = uuid.uuid4().hex[:8]

ANN_FILE_V_BINARY = ANN_DIR / f"v_binary_{PROCESS_ID}.jsonl"
ANN_FILE_V_PAIRWISE = ANN_DIR / f"v_pairwise_{PROCESS_ID}.jsonl"
ANN_FILE_A_PAIRWISE = ANN_DIR / f"a_pairwise_{PROCESS_ID}.jsonl"

# Commit cadence (minutes) handed to the CommitScheduler.
COMMIT_INTERVAL_MIN = 5
# A pending claim older than this many seconds is released by _reap_expired.
PENDING_TIMEOUT_SEC = 30 * 60

# Number of distinct annotators required before a task counts as complete.
V_BINARY_ANNOTATORS_PER_TASK = 1
V_PAIRWISE_ANNOTATORS_PER_TASK = 3
A_PAIRWISE_ANNOTATORS_PER_TASK = 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
# ---------------------------------------------------------------------------
|
| 62 |
+
# Load task pools
|
| 63 |
# ---------------------------------------------------------------------------
|
| 64 |
|
| 65 |
+
def _load_pools() -> dict:
|
| 66 |
+
local = Path(__file__).parent / "sampling" / "new_task_pools.json"
|
| 67 |
+
if local.exists():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
with open(local, encoding="utf-8") as f:
|
| 69 |
return json.load(f)
|
| 70 |
+
raise RuntimeError(f"Task pool not found at {local}")
|
|
|
|
|
|
|
| 71 |
|
| 72 |
+
# Parsed task pools, loaded once at import time.
POOLS = _load_pools()

_a_pool = POOLS["a_pairwise"]
V_BINARY_TASKS: list[dict] = POOLS["v_binary"]["tasks"]
V_PAIRWISE_TASKS: list[dict] = POOLS["v_pairwise"]["tasks"]
# Quality-control tasks are appended after the regular A pairwise tasks.
A_PAIRWISE_TASKS: list[dict] = _a_pool["tasks"] + _a_pool["quality_control_tasks"]

# task_id -> task lookup tables for each pool.
V_BINARY_BY_ID = dict((task["task_id"], task) for task in V_BINARY_TASKS)
V_PAIRWISE_BY_ID = dict((task["task_id"], task) for task in V_PAIRWISE_TASKS)
A_PAIRWISE_BY_ID = dict((task["task_id"], task) for task in A_PAIRWISE_TASKS)

print(f"[ann-new] V binary tasks: {len(V_BINARY_TASKS)}")
print(f"[ann-new] V pairwise tasks: {len(V_PAIRWISE_TASKS)}")
print(f"[ann-new] A pairwise tasks: {len(A_PAIRWISE_TASKS)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
# ---------------------------------------------------------------------------
|
| 87 |
+
# Sample metadata cache (sample.json)
|
| 88 |
# ---------------------------------------------------------------------------
|
| 89 |
|
| 90 |
+
_sample_cache: dict[tuple[str, str, str], dict] = {}
|
| 91 |
+
_sample_cache_lock = threading.Lock()
|
| 92 |
|
| 93 |
+
def _load_sample_meta(dataset: str, subset: str, sample_id: str) -> dict:
    """Return the parsed ``sample.json`` for one sample, memoised per process.

    Args:
        dataset: ``"mbenchv"`` selects the ``MBench-V-new`` tree; any other
            value selects ``MBench-A-New``.
        subset: subset directory name inside ``samples/``.
        sample_id: sample directory name inside the subset.

    Returns:
        The decoded JSON object, or ``{}`` when the download or parse fails.
        Failures are logged and cached just like successes.
    """
    cache_key = (dataset, subset, sample_id)
    with _sample_cache_lock:
        try:
            return _sample_cache[cache_key]
        except KeyError:
            pass

    root = "MBench-V-new" if dataset == "mbenchv" else "MBench-A-New"
    repo_path = f"{root}/samples/{subset}/{sample_id}/sample.json"
    try:
        downloaded = hf_hub_download(DATASET_REPO, repo_path, repo_type="dataset", token=HF_TOKEN)
        with open(downloaded, encoding="utf-8") as fh:
            meta = json.load(fh)
    except Exception as exc:
        print(f"[ann-new] sample.json load failed for {cache_key}: {exc}")
        meta = {}

    # The download runs outside the lock; only the cache write is guarded.
    with _sample_cache_lock:
        _sample_cache[cache_key] = meta
    return meta
|
| 112 |
|
| 113 |
# ---------------------------------------------------------------------------
|
| 114 |
+
# Video URL helpers (proxy)
|
| 115 |
# ---------------------------------------------------------------------------
|
| 116 |
|
| 117 |
+
def _v_video_proxy_url(model: str, subset: str, sample_id: str) -> str:
|
| 118 |
+
return f"/video_v/{model}/{subset}/{sample_id}.mp4"
|
| 119 |
|
| 120 |
+
def _v_video_hf_url(model: str, subset: str, sample_id: str) -> str:
    """Return the Hub URL of an MBench-V ``text`` condition video in the dataset repo."""
    repo_file = f"MBench-V-new/models/{model}/outputs/{subset}/{sample_id}/text/video.mp4"
    return hf_hub_url(DATASET_REPO, filename=repo_file, repo_type="dataset")
|
| 126 |
|
| 127 |
+
def _a_video_proxy_url(model: str, subset: str, sample_id: str, condition_id: str) -> str:
|
| 128 |
+
return f"/video_a/{model}/{subset}/{sample_id}/{condition_id}.mp4"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
+
def _a_video_hf_url(model: str, subset: str, sample_id: str, condition_id: str) -> str:
    """Return the Hub URL of an MBench-A video for the given condition."""
    repo_file = (
        f"MBench-A-New/models/{model}/outputs/{subset}/{sample_id}/{condition_id}/video.mp4"
    )
    return hf_hub_url(DATASET_REPO, filename=repo_file, repo_type="dataset")
|
| 136 |
|
| 137 |
+
def _a_asset_hf_url(path: str) -> str:
    """Return the Hub URL of a file under the legacy ``MBench-A/assets/`` folder.

    The old asset directory (camera diagrams and mask visualisations) is reused
    by the new layout.
    """
    asset_file = f"MBench-A/assets/{path}"
    return hf_hub_url(DATASET_REPO, filename=asset_file, repo_type="dataset")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
def _render_video_html(url: str) -> str:
|
| 142 |
return (
|
|
|
|
| 146 |
)
|
| 147 |
|
| 148 |
# ---------------------------------------------------------------------------
|
| 149 |
+
# CommitScheduler → annotations-new/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
# ---------------------------------------------------------------------------
|
| 151 |
|
| 152 |
scheduler: CommitScheduler | None = None
|
|
|
|
| 155 |
repo_id=DATASET_REPO,
|
| 156 |
repo_type="dataset",
|
| 157 |
folder_path=str(ANN_DIR),
|
| 158 |
+
path_in_repo="annotations-new",
|
| 159 |
every=COMMIT_INTERVAL_MIN,
|
| 160 |
token=HF_TOKEN,
|
| 161 |
private=False,
|
|
|
|
| 163 |
)
|
| 164 |
|
| 165 |
# ---------------------------------------------------------------------------
|
| 166 |
+
# Load historical annotations (from annotations-new/)
|
| 167 |
# ---------------------------------------------------------------------------
|
| 168 |
|
| 169 |
+
def _fetch_annotations_new() -> list[dict]:
|
| 170 |
+
records = []
|
| 171 |
try:
|
| 172 |
api = HfApi(token=HF_TOKEN)
|
| 173 |
files = api.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset")
|
| 174 |
except Exception:
|
| 175 |
return records
|
| 176 |
+
jsonls = [p for p in files if p.startswith("annotations-new/") and p.endswith(".jsonl")]
|
| 177 |
for path in jsonls:
|
| 178 |
try:
|
| 179 |
+
local = hf_hub_download(repo_id=DATASET_REPO, filename=path,
|
| 180 |
+
repo_type="dataset", token=HF_TOKEN)
|
| 181 |
with open(local, encoding="utf-8") as f:
|
| 182 |
for line in f:
|
| 183 |
line = line.strip()
|
|
|
|
| 190 |
pass
|
| 191 |
return records
|
| 192 |
|
| 193 |
+
HISTORICAL = _fetch_annotations_new()
|
| 194 |
+
print(f"[ann-new] historical records loaded: {len(HISTORICAL)}")
|
| 195 |
|
| 196 |
# ---------------------------------------------------------------------------
|
| 197 |
# Shared state
|
|
|
|
| 199 |
|
| 200 |
STATE_LOCK = threading.Lock()
|
| 201 |
|
| 202 |
+
# Per tab: task_id -> set of annotator names that already submitted that task.
V_BINARY_COMPLETED: dict[str, set[str]] = defaultdict(set)
V_PAIRWISE_COMPLETED: dict[str, set[str]] = defaultdict(set)
A_PAIRWISE_COMPLETED: dict[str, set[str]] = defaultdict(set)

# Replay previously committed records so finished work is not handed out again.
for r in HISTORICAL:
    t = r.get("type")
    tid = r.get("task_id")
    ann = r.get("annotator")
    if not tid or not ann:
        continue
    # Records whose task_id is not in the current pool are ignored.
    if t == "v_binary":
        if tid in V_BINARY_BY_ID:
            V_BINARY_COMPLETED[tid].add(ann)
    elif t == "v_pairwise":
        if tid in V_PAIRWISE_BY_ID:
            V_PAIRWISE_COMPLETED[tid].add(ann)
    elif t == "a_pairwise":
        if tid in A_PAIRWISE_BY_ID:
            A_PAIRWISE_COMPLETED[tid].add(ann)

# Per tab: task_id -> (annotator, claim timestamp) for tasks currently on screen.
V_BINARY_PENDING: dict[str, tuple[str, float]] = {}
V_PAIRWISE_PENDING: dict[str, tuple[str, float]] = {}
A_PAIRWISE_PENDING: dict[str, tuple[str, float]] = {}

print(f"[ann-new] V binary: {sum(len(v) for v in V_BINARY_COMPLETED.values())} annotations on {len(V_BINARY_COMPLETED)} tasks")
print(f"[ann-new] V pairwise: {sum(len(v) for v in V_PAIRWISE_COMPLETED.values())} on {len(V_PAIRWISE_COMPLETED)} tasks")
print(f"[ann-new] A pairwise: {sum(len(v) for v in A_PAIRWISE_COMPLETED.values())} on {len(A_PAIRWISE_COMPLETED)} tasks")
|
| 227 |
|
| 228 |
# ---------------------------------------------------------------------------
|
| 229 |
+
# Helpers
|
| 230 |
# ---------------------------------------------------------------------------
|
| 231 |
|
| 232 |
+
def _reap_expired(pending):
    """Drop claims from *pending* that are older than ``PENDING_TIMEOUT_SEC``.

    Args:
        pending: mapping of task_id -> (annotator, claim timestamp); mutated
            in place.
    """
    now = time.time()
    stale = []
    for key, (_owner, claimed_at) in pending.items():
        if now - claimed_at > PENDING_TIMEOUT_SEC:
            stale.append(key)
    # Removal is done in a second pass so the dict is not mutated mid-iteration.
    for key in stale:
        pending.pop(key, None)
|
| 237 |
|
| 238 |
+
def _append(record: dict, ann_file: Path):
|
| 239 |
line = json.dumps(record, ensure_ascii=False)
|
| 240 |
if scheduler is not None:
|
| 241 |
with scheduler.lock:
|
|
|
|
| 245 |
with ann_file.open("a", encoding="utf-8") as f:
|
| 246 |
f.write(line + "\n")
|
| 247 |
|
| 248 |
+
def _format_caption(meta: dict) -> str:
|
| 249 |
+
"""Render caption(_segments) as readable text."""
|
| 250 |
+
if not meta:
|
| 251 |
+
return ""
|
| 252 |
+
if meta.get("caption"):
|
| 253 |
+
return meta["caption"]
|
| 254 |
+
segs = meta.get("caption_segments")
|
| 255 |
+
if segs:
|
| 256 |
+
return "\n\n".join(f"— 第 {i}/{len(segs)} 段 —\n{s}" for i, s in enumerate(segs, 1))
|
| 257 |
+
return ""
|
| 258 |
+
|
| 259 |
# ---------------------------------------------------------------------------
|
| 260 |
+
# V Binary
|
| 261 |
# ---------------------------------------------------------------------------
|
| 262 |
|
| 263 |
+
def v_binary_start(annotator: str, state: dict):
    """Start a V-binary session for *annotator* and load the first task.

    Args:
        annotator: name typed by the user; surrounding whitespace is ignored.
        state: the current session state, returned unchanged when the name is
            empty.

    Returns:
        The 6-tuple ``(state, video_html, info, prompt, status, stats)``.
    """
    annotator = (annotator or "").strip()
    if not annotator:
        return state, "<p>请输入名字</p>", "", "", "⚠️", ""
    order = list(range(len(V_BINARY_TASKS)))
    random.shuffle(order)
    # Count under the lock: submit handlers insert keys into this dict from
    # other workers, and iterating it unguarded can raise
    # "dictionary changed size during iteration".
    with STATE_LOCK:
        n_done = sum(1 for done_by in V_BINARY_COMPLETED.values() if annotator in done_by)
    state = {
        "annotator": annotator,
        "order": order,
        "idx": 0,
        "current": None,
        "count": n_done,
    }
    # The lock is released before this call because _v_binary_next takes it itself.
    return _v_binary_next(state)
|
| 274 |
|
| 275 |
+
def _v_binary_next(state):
    """Claim the next available V-binary task for this session and render it.

    Walks ``state["order"]`` from ``state["idx"]`` and skips tasks that are
    already fully annotated, already done by this annotator, or currently
    claimed by someone else. The first remaining task is marked pending for
    this annotator.

    Returns:
        The 6-tuple ``(state, video_html, info, prompt, status, stats)``; a
        completion message is returned when no task is left.
    """
    annotator = state["annotator"]
    order = state["order"]
    idx = state.get("idx", 0)
    task = None
    with STATE_LOCK:
        _reap_expired(V_BINARY_PENDING)
        while idx < len(order):
            candidate = V_BINARY_TASKS[order[idx]]
            tid = candidate["task_id"]
            done_by = V_BINARY_COMPLETED.get(tid, set())
            holder = V_BINARY_PENDING.get(tid)
            if (
                len(done_by) >= V_BINARY_ANNOTATORS_PER_TASK
                or annotator in done_by
                or (holder is not None and holder[0] != annotator)
            ):
                idx += 1
                continue
            V_BINARY_PENDING[tid] = (annotator, time.time())
            task = candidate
            break
        # Snapshot global progress while the lock still guards the dict.
        n_done = sum(
            1 for v in V_BINARY_COMPLETED.values()
            if len(v) >= V_BINARY_ANNOTATORS_PER_TASK
        )

    if task is None:
        state["current"] = None
        return state, "<p>🎉 全部完成!</p>", "", "", "完成", ""

    state["idx"] = idx
    state["current"] = task["task_id"]

    # Rendering happens after the lock is released: the metadata lookup may
    # download sample.json and should not stall other annotators.
    model = task["model_id"]
    subset = task["subset"]
    sid = task["sample_id"]
    video_html = _render_video_html(_v_video_proxy_url(model, subset, sid))
    meta = _load_sample_meta("mbenchv", subset, sid)
    prompt = _format_caption(meta)
    info = (f"**模型**: `{model}` | **子集**: `{subset}` | "
            f"**sample**: `{sid[:24]}...` | **已提交**: {state['count']}")
    stats = f"全局进度: {n_done}/{len(V_BINARY_TASKS)} ({100*n_done/len(V_BINARY_TASKS):.1f}%)"
    return state, video_html, info, prompt, "✅ 已加载", stats
|
| 308 |
|
| 309 |
+
def v_binary_submit(state, verdict, note):
    """Persist the verdict for the current V-binary task and load the next one.

    Args:
        state: session state; must hold a ``current`` task id.
        verdict: radio value; ``"是"`` is stored as ``memory_issue=True``.
        note: optional free-text note, stripped before storing.

    Returns:
        The 8-tuple ``(state, video_html, info, prompt, status, stats,
        verdict_reset, note_reset)``.
    """
    if not state or not state.get("current"):
        return state, "<p>请先登录</p>", "", "", "⚠️", "", "否", ""

    tid = state["current"]
    task = V_BINARY_BY_ID[tid]
    who = state["annotator"]
    subset = task["subset"]
    sample_id = task["sample_id"]
    record = {
        "type": "v_binary",
        "timestamp": time.time(),
        "annotator": who,
        "task_id": tid,
        "dataset_id": "mbenchv",
        "model_id": task["model_id"],
        "subset": subset,
        "sample_id": sample_id,
        "condition_id": "text",
        "item_id": f'{subset}:{sample_id}:text',
        "memory_issue": verdict == "是",
        "verdict": verdict,
        "note": (note or "").strip(),
    }
    _append(record, ANN_FILE_V_BINARY)

    with STATE_LOCK:
        V_BINARY_PENDING.pop(tid, None)
        V_BINARY_COMPLETED[tid].add(who)

    state["count"] = state.get("count", 0) + 1
    state["idx"] = state["idx"] + 1
    state["current"] = None

    next_state, video_html, info, prompt, _status, stats = _v_binary_next(state)
    # The status from _v_binary_next is replaced by the submit confirmation.
    return next_state, video_html, info, prompt, f"✅ 已提交 {state['count']}", stats, "否", ""
|
| 338 |
|
| 339 |
+
def v_binary_skip(state):
    """Release the current V-binary task without recording anything and move on.

    Returns:
        The same 8-tuple shape as ``v_binary_submit``.
    """
    has_task = bool(state) and bool(state.get("current"))
    if not has_task:
        return state, "", "", "", "⚠️", "", "否", ""

    skipped_tid = state["current"]
    with STATE_LOCK:
        # Drop the claim so another annotator can pick the task up.
        V_BINARY_PENDING.pop(skipped_tid, None)
    state["idx"] = state["idx"] + 1
    state["current"] = None

    next_state, video_html, info, prompt, _status, stats = _v_binary_next(state)
    return next_state, video_html, info, prompt, "⏭️ 已跳过", stats, "否", ""
|
| 349 |
|
| 350 |
# ---------------------------------------------------------------------------
|
| 351 |
+
# V Pairwise
|
| 352 |
# ---------------------------------------------------------------------------
|
| 353 |
|
| 354 |
+
def v_pairwise_start(annotator: str, state: dict):
    """Start a V-pairwise session for *annotator* and load the first pair.

    Args:
        annotator: name typed by the user; surrounding whitespace is ignored.
        state: the current session state, returned unchanged when the name is
            empty.

    Returns:
        The 13-tuple ``(state, status, video_left, video_right, info, prompt,
        q1..q5 updates, note, stats)``.
    """
    annotator = (annotator or "").strip()
    if not annotator:
        empty = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请输入名字", "", "", "", "",
                empty, empty, empty, empty, empty, "", "")
    # Count under the lock: submit handlers insert keys into this dict from
    # other workers, and iterating it unguarded can raise
    # "dictionary changed size during iteration".
    with STATE_LOCK:
        n_done = sum(1 for done_by in V_PAIRWISE_COMPLETED.values() if annotator in done_by)
    order = list(range(len(V_PAIRWISE_TASKS)))
    random.shuffle(order)
    state = {
        "annotator": annotator,
        "order": order,
        "idx": 0,
        "current": None,
        "swapped": False,
        "count": n_done,
    }
    # The lock is released before this call because _v_pairwise_next takes it itself.
    return _v_pairwise_next(state)
|
|
|
|
|
|
|
| 366 |
|
| 367 |
+
def _v_pairwise_next(state):
    """Claim the next available V-pairwise task and render both videos.

    Tasks that are fully annotated, already done by this annotator, or claimed
    by someone else are skipped. The left/right placement of the two models is
    randomised and remembered in ``state["swapped"]`` so the submit handler can
    map "A"/"B" back to model ids.

    Returns:
        The 13-tuple ``(state, status, video_left, video_right, info, prompt,
        q1..q5 updates, note, stats)``.
    """
    annotator = state["annotator"]
    order = state["order"]
    idx = state.get("idx", 0)
    task = None
    with STATE_LOCK:
        _reap_expired(V_PAIRWISE_PENDING)
        while idx < len(order):
            candidate = V_PAIRWISE_TASKS[order[idx]]
            tid = candidate["task_id"]
            done_by = V_PAIRWISE_COMPLETED.get(tid, set())
            holder = V_PAIRWISE_PENDING.get(tid)
            if (
                len(done_by) >= V_PAIRWISE_ANNOTATORS_PER_TASK
                or annotator in done_by
                or (holder is not None and holder[0] != annotator)
            ):
                idx += 1
                continue
            V_PAIRWISE_PENDING[tid] = (annotator, time.time())
            task = candidate
            break
        # Snapshot global progress while the lock still guards the dict.
        n_done = sum(
            1 for v in V_PAIRWISE_COMPLETED.values()
            if len(v) >= V_PAIRWISE_ANNOTATORS_PER_TASK
        )

    if task is None:
        state["current"] = None
        empty = gr.update(visible=False, value="差不多")
        return (state, "🎉 全部完成", "", "", "全部完成", "",
                empty, empty, empty, empty, empty, "", "")

    state["idx"] = idx
    state["current"] = task["task_id"]

    # Rendering happens after the lock is released: the metadata lookup may
    # download sample.json and should not stall other annotators.
    ma, mb = task["model_a"], task["model_b"]
    swapped = not (random.random() < 0.5)
    state["swapped"] = swapped
    left, right = (mb, ma) if swapped else (ma, mb)

    subset = task["subset"]
    sid = task["sample_id"]
    video_l = _render_video_html(_v_video_proxy_url(left, subset, sid))
    video_r = _render_video_html(_v_video_proxy_url(right, subset, sid))
    meta = _load_sample_meta("mbenchv", subset, sid)
    prompt = _format_caption(meta)

    dim_questions = task["dimension_questions"]
    dimensions = task["dimensions"]
    q_updates = []
    # The tab has exactly five radios; unused ones are hidden and reset.
    for i in range(5):
        if i < len(dimensions):
            qtext = dim_questions.get(dimensions[i], dimensions[i])
            q_updates.append(gr.update(visible=True, label=qtext, value="差不多"))
        else:
            q_updates.append(gr.update(visible=False, value="差不多"))

    subset_emoji = {"environment": "🏞️", "object": "🎯", "human": "👤", "causal": "⚡"}
    info = (f"**子集**: {subset_emoji.get(subset, '')} {subset} | "
            f"**已提交**: {state['count']}")
    stats = f"全局进度: {n_done}/{len(V_PAIRWISE_TASKS)} 任务完成"
    return (state, "✅ 已加载", video_l, video_r, info, prompt,
            *q_updates, "", stats)
|
| 419 |
|
| 420 |
+
def v_pairwise_submit(state, q1, q2, q3, q4, q5, note):
    """Persist the per-dimension verdicts for the current V-pairwise task.

    Args:
        state: session state; must hold a ``current`` task id.
        q1..q5: radio values (``"A更好"`` / ``"差不多"`` / ``"B更好"``) in the
            order of the task's ``dimensions``.
        note: optional free-text note, stripped before storing.

    Returns:
        The 13-tuple produced by ``_v_pairwise_next`` for the following task.
    """
    if not state or not state.get("current"):
        empty = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请先登录", "", "", "", "",
                empty, empty, empty, empty, empty, "", "")

    tid = state["current"]
    task = V_PAIRWISE_BY_ID[tid]
    swapped = state["swapped"]
    ma, mb = task["model_a"], task["model_b"]

    # "A" is whatever was shown on the left, which is model_b when swapped.
    shown_a, shown_b = (mb, ma) if swapped else (ma, mb)
    winner_of = {"A更好": shown_a, "B更好": shown_b}
    verdicts = [q1, q2, q3, q4, q5]
    # zip() pairs each dimension with the radio that displayed it; a task
    # listing more dimensions than there are radios no longer raises IndexError.
    dim_results = {
        dim: winner_of.get(verdict, "tie")
        for dim, verdict in zip(task["dimensions"], verdicts)
    }

    record = {
        "type": "v_pairwise",
        "timestamp": time.time(),
        "annotator": state["annotator"],
        "task_id": tid,
        "dataset_id": "mbenchv",
        "subset": task["subset"],
        "sample_id": task["sample_id"],
        "condition_id": "text",
        "model_a": ma,
        "model_b": mb,
        "item_a": f'{task["subset"]}:{task["sample_id"]}:text|{ma}',
        "item_b": f'{task["subset"]}:{task["sample_id"]}:text|{mb}',
        "dimensions": dim_results,
        "swapped": swapped,
        "note": (note or "").strip(),
    }
    _append(record, ANN_FILE_V_PAIRWISE)
    with STATE_LOCK:
        V_PAIRWISE_PENDING.pop(tid, None)
        V_PAIRWISE_COMPLETED[tid].add(state["annotator"])
    state["count"] = state.get("count", 0) + 1
    state["idx"] = state["idx"] + 1
    state["current"] = None
    return _v_pairwise_next(state)
|
|
|
|
| 466 |
|
| 467 |
+
def v_pairwise_skip(state):
    """Release the current V-pairwise task without recording anything and move on.

    Returns:
        The 13-tuple produced by ``_v_pairwise_next``, or a "please log in"
        tuple of the same shape when no task is active.
    """
    has_task = bool(state) and bool(state.get("current"))
    if not has_task:
        hidden = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请先登录", "", "", "", "",
                hidden, hidden, hidden, hidden, hidden, "", "")

    skipped_tid = state["current"]
    with STATE_LOCK:
        # Drop the claim so another annotator can pick the task up.
        V_PAIRWISE_PENDING.pop(skipped_tid, None)
    state["idx"] = state["idx"] + 1
    state["current"] = None
    return _v_pairwise_next(state)
|
|
|
|
| 478 |
|
| 479 |
# ---------------------------------------------------------------------------
|
| 480 |
+
# A Pairwise (adapted from old app, with new paths)
|
| 481 |
# ---------------------------------------------------------------------------
|
| 482 |
|
| 483 |
+
def _render_a_aux(task: dict) -> str:
    """Render the auxiliary info box (focus hint, camera diagram, caption) for an A task.

    Args:
        task: an A-pairwise task dict. ``subset`` is required;
            ``camera_motion``, ``camera_motion_description`` and ``caption``
            are optional. ``sample_id`` is read only for the ``object`` subset.

    Returns:
        An HTML fragment. Text taken from the task (motion description and
        caption) and the asset URLs are HTML-escaped, so characters such as
        ``<`` or ``&`` in a caption cannot break the surrounding markup.
    """
    import html  # local import keeps this block self-contained

    subset = task["subset"]
    box = 'class="aux-info-box"'
    motion = task.get("camera_motion", "left_then_right")
    motion_desc = html.escape(str(task.get("camera_motion_description", motion)))
    gif_url = html.escape(_a_asset_hf_url(f"camera_diagrams/{motion}.gif"), quote=True)
    camera_html = (
        f'<div style="flex:0 0 200px">'
        f'<p><b>🎬 预期相机运动</b></p>'
        f'<p style="margin:0 0 8px">{motion_desc}</p>'
        f'<img src="{gif_url}" style="width:180px">'
        f'</div>'
    )
    caption = task.get("caption", "")
    caption_html = (
        f'<div style="flex:1;min-width:250px">'
        f'<p><b>📝 场景描述</b></p>'
        f'<p style="font-size:14px;line-height:1.5">{html.escape(str(caption))}</p>'
        f'</div>'
    ) if caption else ""

    if subset == "object":
        sample_id = task["sample_id"]
        # Mask visualisations are still read from the legacy MBench-A/assets/mask_viz folder.
        mask_url = html.escape(_a_asset_hf_url(f"mask_viz/{sample_id}.png"), quote=True)
        return (
            f'<div {box}>'
            f'<p><b>🎯 请关注画面中被标注(高亮)的物体</b></p>'
            f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start;margin-top:8px">'
            f'<div style="flex:1;min-width:300px">'
            f'<img src="{mask_url}" style="max-width:100%;max-height:280px"></div>'
            f'{camera_html}{caption_html}</div></div>'
        )
    elif subset == "human":
        return (
            f'<div {box}>'
            f'<p><b>👤 请关注视频中的人物</b>:观察人物离开画面再回来后,面部和外观是否保持一致。</p>'
            f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start;margin-top:8px">'
            f'{camera_html}{caption_html}</div></div>'
        )
    elif subset == "causal":
        return (
            f'<div {box}>'
            f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start">'
            f'{camera_html}{caption_html}</div></div>'
        )
    else:  # environment
        return (
            f'<div {box}>'
            f'<p><b>🏞️ 请关注整体场景</b>:观察相机转回来后,场景的布局/风格/光照是否保持一致。</p>'
            f'<div style="display:flex;gap:16px;flex-wrap:wrap;align-items:flex-start;margin-top:8px">'
            f'{camera_html}{caption_html}</div></div>'
        )
|
| 536 |
+
|
| 537 |
+
def a_start(annotator: str, state: dict):
    """Start an A-pairwise session for *annotator* and load the first pair.

    Args:
        annotator: name typed by the user; surrounding whitespace is ignored.
        state: the current session state, returned unchanged when the name is
            empty.

    Returns:
        The 14-tuple ``(state, status, aux_html, video_left, video_right, info,
        q1..q6 updates, note, stats)``.
    """
    annotator = (annotator or "").strip()
    if not annotator:
        empty = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请输入名字", "", "", "", "",
                empty, empty, empty, empty, empty, empty, "", "")
    # Count under the lock: submit handlers insert keys into this dict from
    # other workers, and iterating it unguarded can raise
    # "dictionary changed size during iteration".
    with STATE_LOCK:
        n_done = sum(1 for done_by in A_PAIRWISE_COMPLETED.values() if annotator in done_by)
    order = list(range(len(A_PAIRWISE_TASKS)))
    random.shuffle(order)
    state = {
        "annotator": annotator,
        "order": order,
        "idx": 0,
        "current": None,
        "swapped": False,
        "count": n_done,
    }
    # The lock is released before this call because _a_next takes it itself.
    return _a_next(state)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
|
| 550 |
+
def _a_next(state):
    """Claim the next available A-pairwise task and render both videos.

    Tasks that are fully annotated, already done by this annotator, or claimed
    by someone else are skipped. The left/right placement of the two models is
    randomised and remembered in ``state["swapped"]``. The video condition id
    is the task's ``camera_motion`` (default ``left_then_right``) suffixed
    with ``_25s``.

    Returns:
        The 14-tuple ``(state, status, aux_html, video_left, video_right, info,
        q1..q6 updates, note, stats)``.
    """
    annotator = state["annotator"]
    order = state["order"]
    idx = state.get("idx", 0)
    task = None
    with STATE_LOCK:
        _reap_expired(A_PAIRWISE_PENDING)
        while idx < len(order):
            candidate = A_PAIRWISE_TASKS[order[idx]]
            tid = candidate["task_id"]
            done_by = A_PAIRWISE_COMPLETED.get(tid, set())
            holder = A_PAIRWISE_PENDING.get(tid)
            if (
                len(done_by) >= A_PAIRWISE_ANNOTATORS_PER_TASK
                or annotator in done_by
                or (holder is not None and holder[0] != annotator)
            ):
                idx += 1
                continue
            A_PAIRWISE_PENDING[tid] = (annotator, time.time())
            task = candidate
            break
        # Snapshot global progress while the lock still guards the dict.
        n_done = sum(
            1 for v in A_PAIRWISE_COMPLETED.values()
            if len(v) >= A_PAIRWISE_ANNOTATORS_PER_TASK
        )

    if task is None:
        state["current"] = None
        empty = gr.update(visible=False, value="差不多")
        return (state, "🎉 全部完成", "", "", "", "全部完成",
                empty, empty, empty, empty, empty, empty, "", "")

    state["idx"] = idx
    state["current"] = task["task_id"]

    # Rendering happens after the lock is released, matching the V tabs.
    ma, mb = task["model_a"], task["model_b"]
    swapped = not (random.random() < 0.5)
    state["swapped"] = swapped
    left, right = (mb, ma) if swapped else (ma, mb)

    subset = task["subset"]
    sid = task["sample_id"]
    motion = task.get("camera_motion", "left_then_right")
    cond = f"{motion}_25s"
    video_l = _render_video_html(_a_video_proxy_url(left, subset, sid, cond))
    video_r = _render_video_html(_a_video_proxy_url(right, subset, sid, cond))
    aux = _render_a_aux(task)

    dimensions = task["dimensions"]
    dim_q = task.get("dimension_questions", {})
    q_updates = []
    # The tab has exactly six radios; unused ones are hidden and reset.
    for i in range(6):
        if i < len(dimensions):
            qtext = dim_q.get(dimensions[i], dimensions[i])
            q_updates.append(gr.update(visible=True, label=qtext, value="差不多"))
        else:
            q_updates.append(gr.update(visible=False, value="差不多"))

    subset_emoji = {"environment": "🏞️", "object": "🎯", "human": "👤", "causal": "⚡"}
    info = f"**子集**: {subset_emoji.get(subset, '')} {subset} | **已提交**: {state['count']}"
    stats = f"全局进度: {n_done}/{len(A_PAIRWISE_TASKS)} 任务完成"
    return (state, "✅ 已加载", aux, video_l, video_r, info,
            *q_updates, "", stats)
|
| 602 |
|
| 603 |
+
def a_submit(state, q1, q2, q3, q4, q5, q6, note):
    """Persist the per-dimension verdicts for the current A-pairwise task.

    Args:
        state: session state; must hold a ``current`` task id.
        q1..q6: radio values (``"A更好"`` / ``"差不多"`` / ``"B更好"``) in the
            order of the task's ``dimensions``.
        note: optional free-text note, stripped before storing.

    Returns:
        The 14-tuple produced by ``_a_next`` for the following task.
    """
    if not state or not state.get("current"):
        empty = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请先登录", "", "", "", "",
                empty, empty, empty, empty, empty, empty, "", "")

    tid = state["current"]
    task = A_PAIRWISE_BY_ID[tid]
    swapped = state["swapped"]
    ma, mb = task["model_a"], task["model_b"]

    # "A" is whatever was shown on the left, which is model_b when swapped.
    shown_a, shown_b = (mb, ma) if swapped else (ma, mb)
    winner_of = {"A更好": shown_a, "B更好": shown_b}
    verdicts = [q1, q2, q3, q4, q5, q6]
    # zip() pairs each dimension with the radio that displayed it; a task
    # listing more dimensions than there are radios no longer raises IndexError.
    dim_results = {
        dim: winner_of.get(verdict, "tie")
        for dim, verdict in zip(task["dimensions"], verdicts)
    }

    motion = task.get("camera_motion", "left_then_right")
    cond = f"{motion}_25s"
    record = {
        "type": "a_pairwise",
        "timestamp": time.time(),
        "annotator": state["annotator"],
        "task_id": tid,
        "dataset_id": "mbencha",
        "subset": task["subset"],
        "sample_id": task["sample_id"],
        "condition_id": cond,
        "model_a": ma,
        "model_b": mb,
        "item_a": f'{task["subset"]}:{task["sample_id"]}:{cond}|{ma}',
        "item_b": f'{task["subset"]}:{task["sample_id"]}:{cond}|{mb}',
        "camera_motion": motion,
        "dimensions": dim_results,
        "swapped": swapped,
        "note": (note or "").strip(),
    }
    _append(record, ANN_FILE_A_PAIRWISE)
    with STATE_LOCK:
        A_PAIRWISE_PENDING.pop(tid, None)
        A_PAIRWISE_COMPLETED[tid].add(state["annotator"])
    state["count"] = state.get("count", 0) + 1
    state["idx"] = state["idx"] + 1
    state["current"] = None
    return _a_next(state)
|
|
|
|
|
|
|
| 652 |
|
| 653 |
+
def a_skip(state):
    """Release the current A-pairwise task without recording anything and move on.

    Returns:
        The 14-tuple produced by ``_a_next``, or a "please log in" tuple of the
        same shape when no task is active.
    """
    has_task = bool(state) and bool(state.get("current"))
    if not has_task:
        hidden = gr.update(visible=False, value="差不多")
        return (state, "⚠️ 请先登录", "", "", "", "",
                hidden, hidden, hidden, hidden, hidden, hidden, "", "")

    skipped_tid = state["current"]
    with STATE_LOCK:
        # Drop the claim so another annotator can pick the task up.
        A_PAIRWISE_PENDING.pop(skipped_tid, None)
    state["idx"] = state["idx"] + 1
    state["current"] = None
    return _a_next(state)
|
|
|
|
| 664 |
|
| 665 |
# ---------------------------------------------------------------------------
|
| 666 |
# UI
|
|
|
|
| 668 |
|
| 669 |
CUSTOM_CSS = """
|
| 670 |
#prompt_box textarea { height: 300px !important; overflow-y: auto !important; }
|
|
|
|
|
|
|
|
|
|
| 671 |
.aux-info-box {
|
| 672 |
+
background: #e3e8ef !important; color: #111 !important;
|
| 673 |
+
padding: 14px !important; border-radius: 8px !important;
|
| 674 |
+
margin-bottom: 12px !important; border: 1px solid #b0b8c4 !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 675 |
}
|
| 676 |
+
.aux-info-box * { color: #111 !important; }
|
| 677 |
+
.aux-info-box img { border: 1px solid #999; border-radius: 4px; }
|
| 678 |
"""
|
| 679 |
|
| 680 |
+
with gr.Blocks(title="MBench 标注 (NEW)", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
|
| 681 |
+
gr.Markdown("# 🎬 MBench 视频标注平台 (新结构)")
|
| 682 |
|
| 683 |
with gr.Tabs():
|
| 684 |
+
# ───── V Binary ─────
|
| 685 |
+
with gr.Tab("MBench-V Binary"):
|
| 686 |
+
gr.Markdown("## 📺 MBench-V — 单视频记忆问题判断\n\n"
|
| 687 |
+
"请观看视频并阅读 prompt,判断是否出现了**记忆问题**(场景/物体/人物前后不一致)。")
|
| 688 |
+
vb_stats = gr.Markdown("")
|
| 689 |
+
vb_state = gr.State({})
|
| 690 |
+
with gr.Row():
|
| 691 |
+
vb_name = gr.Textbox(label="标注员名字", placeholder="例如: charlie", scale=4)
|
| 692 |
+
vb_login = gr.Button("开始标注", variant="primary", scale=1)
|
| 693 |
+
vb_status = gr.Markdown("")
|
| 694 |
+
vb_video = gr.HTML("<p>请先登录</p>")
|
| 695 |
+
vb_info = gr.Markdown("")
|
| 696 |
+
vb_prompt = gr.Textbox(label="Prompt / 文本描述", lines=10, elem_id="prompt_box")
|
| 697 |
+
vb_verdict = gr.Radio(["是", "否"], value="否", label="是否出现了记忆问题?")
|
| 698 |
+
vb_note = gr.Textbox(label="备注(可选)", lines=1)
|
| 699 |
+
with gr.Row():
|
| 700 |
+
vb_submit = gr.Button("✅ 提交并下一组", variant="primary")
|
| 701 |
+
vb_skip = gr.Button("⏭️ 跳过")
|
| 702 |
+
vb_outs = [vb_state, vb_video, vb_info, vb_prompt, vb_status, vb_stats, vb_verdict, vb_note]
|
| 703 |
+
vb_login.click(v_binary_start, [vb_name, vb_state],
|
| 704 |
+
[vb_state, vb_video, vb_info, vb_prompt, vb_status, vb_stats])
|
| 705 |
+
vb_name.submit(v_binary_start, [vb_name, vb_state],
|
| 706 |
+
[vb_state, vb_video, vb_info, vb_prompt, vb_status, vb_stats])
|
| 707 |
+
vb_submit.click(v_binary_submit, [vb_state, vb_verdict, vb_note], vb_outs)
|
| 708 |
+
vb_skip.click(v_binary_skip, [vb_state], vb_outs)
|
| 709 |
+
|
| 710 |
+
# ───── V Pairwise ─────
|
| 711 |
+
with gr.Tab("MBench-V Pairwise"):
|
| 712 |
+
gr.Markdown("## 🎬 MBench-V — 双视频对比 (5 维度)\n\n"
|
| 713 |
+
"比较两个 T2V 模型生成的视频,从 5 个维度独立判断哪个更好。")
|
| 714 |
+
vp_stats = gr.Markdown("")
|
| 715 |
+
vp_state = gr.State({})
|
| 716 |
+
with gr.Row():
|
| 717 |
+
vp_name = gr.Textbox(label="标注员名字", scale=4)
|
| 718 |
+
vp_login = gr.Button("开始标注", variant="primary", scale=1)
|
| 719 |
+
vp_status = gr.Markdown("")
|
| 720 |
+
with gr.Row(equal_height=True):
|
| 721 |
+
with gr.Column(scale=1, min_width=360):
|
| 722 |
+
gr.Markdown("### 视频 A")
|
| 723 |
+
vp_video_l = gr.HTML("<p>请先登录</p>")
|
| 724 |
+
with gr.Column(scale=1, min_width=360):
|
| 725 |
+
gr.Markdown("### 视频 B")
|
| 726 |
+
vp_video_r = gr.HTML("<p>请先登录</p>")
|
| 727 |
+
vp_info = gr.Markdown("")
|
| 728 |
+
vp_prompt = gr.Textbox(label="Prompt / 文本描述", lines=8, elem_id="prompt_box")
|
| 729 |
+
gr.Markdown("---\n### 请对以下每个维度分别判断:")
|
| 730 |
+
vp_q1 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 1", visible=False)
|
| 731 |
+
vp_q2 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 2", visible=False)
|
| 732 |
+
vp_q3 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 3", visible=False)
|
| 733 |
+
vp_q4 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 4", visible=False)
|
| 734 |
+
vp_q5 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 5", visible=False)
|
| 735 |
+
vp_note = gr.Textbox(label="备注(可选)", lines=1)
|
| 736 |
+
with gr.Row():
|
| 737 |
+
vp_submit = gr.Button("✅ 提交并下一组", variant="primary")
|
| 738 |
+
vp_skip = gr.Button("⏭️ 跳过")
|
| 739 |
+
vp_outs = [vp_state, vp_status, vp_video_l, vp_video_r, vp_info, vp_prompt,
|
| 740 |
+
vp_q1, vp_q2, vp_q3, vp_q4, vp_q5, vp_note, vp_stats]
|
| 741 |
+
vp_login.click(v_pairwise_start, [vp_name, vp_state], vp_outs)
|
| 742 |
+
vp_name.submit(v_pairwise_start, [vp_name, vp_state], vp_outs)
|
| 743 |
+
vp_submit.click(v_pairwise_submit,
|
| 744 |
+
[vp_state, vp_q1, vp_q2, vp_q3, vp_q4, vp_q5, vp_note], vp_outs)
|
| 745 |
+
vp_skip.click(v_pairwise_skip, [vp_state], vp_outs)
|
| 746 |
+
|
| 747 |
+
# ───── A Pairwise ─────
|
| 748 |
+
with gr.Tab("MBench-A Pairwise"):
|
| 749 |
+
gr.Markdown("## 🌍 MBench-A — 世界模型双视频对比 (≤6 维度)\n\n"
|
| 750 |
+
"比较两个世界模型的长视频(25 秒),评估相机运动结束后的记忆一致性。")
|
| 751 |
a_stats = gr.Markdown("")
|
| 752 |
a_state = gr.State({})
|
|
|
|
| 753 |
with gr.Row():
|
| 754 |
+
a_name = gr.Textbox(label="标注员名字", scale=4)
|
| 755 |
a_login = gr.Button("开始标注", variant="primary", scale=1)
|
|
|
|
| 756 |
a_status = gr.Markdown("")
|
|
|
|
|
|
|
| 757 |
a_aux = gr.HTML("")
|
|
|
|
|
|
|
| 758 |
with gr.Row(equal_height=True):
|
| 759 |
with gr.Column(scale=1, min_width=360):
|
| 760 |
gr.Markdown("### 视频 A")
|
| 761 |
+
a_video_l = gr.HTML("<p>请先登录</p>")
|
| 762 |
with gr.Column(scale=1, min_width=360):
|
| 763 |
gr.Markdown("### 视频 B")
|
| 764 |
+
a_video_r = gr.HTML("<p>请先登录</p>")
|
| 765 |
+
a_info = gr.Markdown("")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 766 |
gr.Markdown("---\n### 请对以下每个维度分别判断:")
|
| 767 |
a_q1 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 1", visible=False)
|
| 768 |
a_q2 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 2", visible=False)
|
|
|
|
| 770 |
a_q4 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 4", visible=False)
|
| 771 |
a_q5 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 5", visible=False)
|
| 772 |
a_q6 = gr.Radio(["A更好", "差不多", "B更好"], value="差不多", label="维度 6", visible=False)
|
|
|
|
| 773 |
a_note = gr.Textbox(label="备注(可选)", lines=1)
|
|
|
|
| 774 |
with gr.Row():
    # The buttons carry a ``_btn`` suffix on purpose: binding them to
    # ``a_submit`` / ``a_skip`` would overwrite the module-level handler
    # functions of the same names, and the ``.click(...)`` calls below would
    # then receive a Button component instead of a callable.
    a_submit_btn = gr.Button("✅ 提交并下一组", variant="primary")
    a_skip_btn = gr.Button("⏭️ 跳过")
# Output components shared by every A-pairwise handler; each handler must
# return its values in exactly this order.
a_outs = [a_state, a_status, a_aux, a_video_l, a_video_r, a_info,
          a_q1, a_q2, a_q3, a_q4, a_q5, a_q6, a_note, a_stats]
a_login.click(a_start, [a_name, a_state], a_outs)
a_name.submit(a_start, [a_name, a_state], a_outs)
a_submit_btn.click(a_submit,
                   [a_state, a_q1, a_q2, a_q3, a_q4, a_q5, a_q6, a_note], a_outs)
a_skip_btn.click(a_skip, [a_state], a_outs)
|
|
|
|
|
|
|
|
|
|
| 784 |
|
| 785 |
# ---------------------------------------------------------------------------
|
| 786 |
# Video proxy
|
|
|
|
| 795 |
_video_client = httpx.AsyncClient(timeout=30.0, follow_redirects=True)
|
| 796 |
|
| 797 |
async def _do_proxy(upstream: str, request: Request):
|
|
|
|
| 798 |
req_headers = {}
|
| 799 |
if (rng := request.headers.get("range")):
|
| 800 |
req_headers["range"] = rng
|
|
|
|
| 805 |
)
|
| 806 |
except Exception as e:
|
| 807 |
raise HTTPException(502, f"upstream fetch failed: {e}")
|
| 808 |
+
passthrough = {}
|
| 809 |
for h in ("content-type", "content-length", "accept-ranges",
|
| 810 |
"content-range", "etag", "last-modified"):
|
| 811 |
if h in upstream_resp.headers:
|
| 812 |
+
passthrough[h] = upstream_resp.headers[h]
|
| 813 |
+
passthrough.setdefault("content-type", "video/mp4")
|
| 814 |
+
passthrough["cache-control"] = "public, max-age=300"
|
| 815 |
|
| 816 |
async def _body():
|
| 817 |
try:
|
|
|
|
| 819 |
yield chunk
|
| 820 |
finally:
|
| 821 |
await upstream_resp.aclose()
|
| 822 |
+
return StreamingResponse(_body(), status_code=upstream_resp.status_code, headers=passthrough)
|
| 823 |
|
| 824 |
+
async def _proxy_v_video(model: str, subset: str, sample_id: str, request: Request):
    """Proxy a ``/video_v/{model}/{subset}/{sample_id}`` request upstream.

    Args:
        model: Model name; must be a member of ``V_MODELS``.
        subset: Subset path component, forwarded unchanged to the URL builder.
        sample_id: Sample identifier, optionally carrying a trailing ``.mp4``.
        request: Incoming request, handed to ``_do_proxy``.

    Returns:
        The response produced by ``_do_proxy`` for the upstream URL.

    Raises:
        HTTPException: 404 when ``model`` is not in ``V_MODELS``.
    """
    # Reject unknown models before doing any other work.
    if model not in V_MODELS:
        raise HTTPException(404, f"unknown V model: {model}")
    # Strip only a trailing ".mp4"; ``str.replace`` would also delete the
    # substring from the middle of an id and point at the wrong file.
    extension = ".mp4"
    sid = sample_id[:-len(extension)] if sample_id.endswith(extension) else sample_id
    upstream = _v_video_hf_url(model, subset, sid)
    return await _do_proxy(upstream, request)
|
| 830 |
|
| 831 |
+
async def _proxy_a_video(model: str, subset: str, sample_id: str, condition_id: str, request: Request):
    """Proxy a ``/video_a/{model}/{subset}/{sample_id}/{condition_id}`` request upstream.

    Args:
        model: Model name; must be a member of ``A_MODELS``.
        subset: Subset path component, forwarded unchanged to the URL builder.
        sample_id: Sample identifier, forwarded unchanged to the URL builder.
        condition_id: Condition identifier, optionally carrying a trailing
            ``.mp4``.
        request: Incoming request, handed to ``_do_proxy``.

    Returns:
        The response produced by ``_do_proxy`` for the upstream URL.

    Raises:
        HTTPException: 404 when ``model`` is not in ``A_MODELS``.
    """
    # Reject unknown models before doing any other work.
    if model not in A_MODELS:
        raise HTTPException(404, f"unknown A model: {model}")
    # Strip only a trailing ".mp4"; ``str.replace`` would also delete the
    # substring from the middle of a condition id.
    extension = ".mp4"
    cond = condition_id[:-len(extension)] if condition_id.endswith(extension) else condition_id
    upstream = _a_video_hf_url(model, subset, sample_id, cond)
    return await _do_proxy(upstream, request)
|
| 837 |
|
| 838 |
+
_orig = _GradioApp.create_app
|
| 839 |
+
def _patched(*args, **kwargs):
    """Create the Gradio app via the saved factory and attach the video proxy routes.

    All arguments are forwarded unchanged to ``_orig``. The returned app is the
    object ``_orig`` produced, with two extra GET/HEAD routes registered on it.
    """
    app = _orig(*args, **kwargs)
    # (path template, handler) pairs, registered in this order.
    proxy_routes = (
        ("/video_v/{model}/{subset}/{sample_id}", _proxy_v_video),
        ("/video_a/{model}/{subset}/{sample_id}/{condition_id}", _proxy_a_video),
    )
    for route_path, handler in proxy_routes:
        app.add_api_route(route_path, handler,
                          methods=["GET", "HEAD"], include_in_schema=False)
    print("[ann-new] video proxy routes registered")
    return app
|
| 847 |
+
_GradioApp.create_app = staticmethod(_patched)
|
| 848 |
|
|
|
|
| 849 |
demo.queue(default_concurrency_limit=16).launch(ssr_mode=False)
|
sampling/new_task_pools.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|