import json
import io
import os

from datasets import Dataset, Features, Sequence, Value, Image
from PIL import Image as PILImage

# Load the QA records from disk. JSON is UTF-8 by specification, so the
# encoding is set explicitly rather than left to the platform default.
with open("Train_QA_10k_noFreeForm.json", "r", encoding="utf-8") as f:
    records = json.load(f)

# Build an in-memory dataset from the loaded records; column types are
# inferred here and pinned explicitly further down.
ds = Dataset.from_list(records)

def read_image_bytes(example):
    """Attach the raw contents of the file at ``example["path"]`` to the record.

    Args:
        example: Mutable mapping with a ``"path"`` entry naming a readable
            file.

    Returns:
        The same ``example`` object, with ``"image_bytes"`` set to the bytes
        read from that file.

    Raises:
        KeyError: If ``example`` has no ``"path"`` entry.
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the payload is stored verbatim, with no text decoding.
    with open(example["path"], "rb") as img_f:
        example["image_bytes"] = img_f.read()
    return example

# Read each record's image file and store its bytes in an "image_bytes" column.
ds = ds.map(read_image_bytes)

# Pin an explicit schema instead of relying on the types inferred from JSON.
features = Features({
    "problem_id": Value("int64"),
    "problem": Value("string"),
    "data_type": Value("string"),
    "problem_type": Value("string"),
    "options": Sequence(Value("string")),
    "solution": Value("string"),
    "data_source": Value("string"),
    "answer": Value("string"),
    "path": Value("string"),
    "image_bytes": Value("binary"),
})
ds = ds.cast(features)

# Expose the raw bytes as an Image-typed "images" column.
ds = ds.rename_column("image_bytes", "images")
ds = ds.cast_column("images", Image(decode=True))

# Preview the first image as a sanity check; skipped for an empty dataset,
# where indexing row 0 would raise.
if len(ds) > 0:
    img0 = ds[0]["images"]
    print(img0)

# Make sure the output directory exists before writing the Parquet file.
output_path = "./hf_data/Train_QA_10k_noFreeForm.parquet"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
ds.to_parquet(output_path)