# Video-R1 / create_data.py
# import re
# from pathlib import Path
# from datasets import load_dataset, Dataset, DatasetDict, Features, Value, Image
# import re
# from typing import Dict, List, Optional
# from pathlib import Path
# from datasets import Dataset, DatasetDict, concatenate_datasets, Features, Value, Sequence
# # ------------------------------------------------------------------
# # 0) Load your JSON → `raw_ds` exactly as before
# # ------------------------------------------------------------------
# files = [
# "pool_multiple_choice_chunk_01.json",
# "pool_multiple_choice_chunk_02.json",
# "pool_multiple_choice_chunk_03.json",
# "pool_multiple_choice_chunk_04.json",
# "pool_numerical_chunk_01.json",
# "pool_numerical_chunk_02.json",
# "pool_numerical_chunk_03.json",
# "pool_regression_chunk_01.json",
# ]
# # ---- 1-4. load, trim, normalise ----------------------------------------
# def load_trim_normalise(fp, cap=10_000):
# ds = Dataset.from_json(fp)
# # a) truncate
# ds = ds.select(range(min(cap, len(ds))))
# # b) make sure `options` exists and is always list[str]
# if "options" not in ds.column_names:
# ds = ds.add_column("options", [[]] * len(ds))
# else:
# ds = ds.map(
# lambda ex: {"options": [str(o) for o in (ex["options"] or [])]},
# remove_columns=[], num_proc=4,
# )
# return ds
# ds_list = [load_trim_normalise(fp) for fp in files]
# # ---- 4. align feature schema explicitly (all files now identical) -------
# common_features = Features({
# "problem_id" : Value("int64"),
# "problem" : Value("string"),
# "data_type" : Value("string"),
# "problem_type": Value("string"),
# "options" : Sequence(Value("string")),
# "solution" : Value("string"),
# "path" : Value("string"),
# "data_source" : Value("string"),
# })
# ds_list = [d.cast(common_features) for d in ds_list]
# # ---- 5. concatenate -----------------------------------------------------
# raw_train = concatenate_datasets(ds_list)
# raw_ds = DatasetDict({"train": raw_train})
# # ------------------------------------------------------------------
# # 1) Build the question (unchanged)
# # ------------------------------------------------------------------
# def build_question(example):
# q = (
# example["problem"] + " Options:\n" + "\n".join(example["options"])
# if example["problem_type"] == "multiple choice"
# else example["problem"]
# )
# example["problem"] = q
# return example
# def extract_answer(predict: str) -> Optional[str]:
# """
# Extracts the content of the <answer>…</answer> block from `predict`.
# Returns the inner text (with leading/trailing whitespace stripped),
# or the original string if no <answer> tag is found.
# """
# match = re.search(r"<answer>([\s\S]*?)</answer>", predict, re.DOTALL)
# if not match:
# return predict
# return match.group(1).strip()
# def add_answer(example):
# # assumes the ground-truth answer (tagged) is in `solution`
# example["answer"] = extract_answer(example["solution"])
# return example
# # ------------------------------------------------------------------
# # 3) Embed image bytes (column name stays "images")
# # ------------------------------------------------------------------
# def to_embedded_image(example):
# if example["data_type"] != "image":
# example["images"] = None
# return example
# with open(example["path"], "rb") as f:
# img_bytes = f.read()
# example["images"] = {"bytes": img_bytes, "path": None}
# return example
# # ------------------------------------------------------------------
# # 4) Full pipeline
# # ------------------------------------------------------------------
# processed = (
# raw_ds["train"]
# .map(build_question, num_proc=4)
# .map(add_answer, num_proc=4)
# .map(to_embedded_image, num_proc=4)
# .remove_columns([
# "path", "data_type", "options", "problem_type", "solution",
# "problem_id", "data_source" # ← drop these too
# ])
# )
# # ------------------------------------------------------------------
# # 5) Schema must match the final column names
# # ------------------------------------------------------------------
# features = Features({
# "problem": Value("string"),
# "answer" : Value("string"),
# "images" : Image(), # keep plural name
# })
# processed = processed.cast(features)
# # ------------------------------------------------------------------
# # 6) Write Parquet shards (file prefix inside the folder)
# # ------------------------------------------------------------------
# out_dir = Path("qwen2.5_vl_portable")
# out_dir.mkdir(parents=True, exist_ok=True)
# # processed.to_parquet(str(out_dir / "train.parquet")) # → train-00000-of-00001.parquet
# processed.to_parquet(str("./hf_data/train.parquet"))
# print("βœ“ Dataset written with embedded images and answers β†’", out_dir.resolve())
# import re
# from pathlib import Path
# from typing import Dict, List, Optional
# from datasets import (
# Dataset,
# DatasetDict,
# concatenate_datasets,
# Features,
# Value,
# Sequence,
# Image,
# )
# # ------------------------------------------------------------------
# # 0) Inputs
# # ------------------------------------------------------------------
# files = [
# "pool_multiple_choice_chunk_01.json",
# "pool_multiple_choice_chunk_02.json",
# "pool_multiple_choice_chunk_03.json",
# "pool_multiple_choice_chunk_04.json",
# "pool_numerical_chunk_01.json",
# "pool_numerical_chunk_02.json",
# "pool_numerical_chunk_03.json",
# "pool_regression_chunk_01.json",
# ]
# # ------------------------------------------------------------------
# # 1) Define common meta schema (what you want to keep in the output)
# # ------------------------------------------------------------------
# common_features = Features({
# "problem_id" : Value("int64"),
# "problem" : Value("string"),
# "data_type" : Value("string"),
# "problem_type": Value("string"),
# "options" : Sequence(Value("string")),
# "solution" : Value("string"),
# "path" : Value("string"),
# "data_source" : Value("string"),
# })
# # Final (superset) schema to write: meta + new columns
# full_features = common_features.copy()
# full_features["answer"] = Value("string")
# full_features["images"] = Image() # plural name kept, binary-friendly
# # ------------------------------------------------------------------
# # 2) Load + normalize each JSON
# # ------------------------------------------------------------------
# def load_trim_normalise(fp: str, cap: int = 10_000) -> Dataset:
# ds = Dataset.from_json(fp)
# # truncate if desired
# ds = ds.select(range(min(cap, len(ds))))
# # ensure `options` exists and is always list[str]
# if "options" not in ds.column_names:
# ds = ds.add_column("options", [[]] * len(ds))
# else:
# ds = ds.map(
# lambda ex: {"options": [str(o) for o in (ex["options"] or [])]},
# remove_columns=[],
# num_proc=4,
# )
# # align to the common meta schema early (helps concat)
# # Some JSONs may not have all fields; add missing with defaults first.
# missing_cols = [k for k in common_features.keys() if k not in ds.column_names]
# for mc in missing_cols:
# # create sensible defaults
# if mc == "options":
# ds = ds.add_column(mc, [[]] * len(ds))
# elif common_features[mc].dtype == "int64":
# ds = ds.add_column(mc, [0] * len(ds))
# else:
# ds = ds.add_column(mc, [""] * len(ds))
# ds = ds.cast(common_features)
# return ds
# ds_list = [load_trim_normalise(fp) for fp in files]
# # Concatenate shards
# raw_train = concatenate_datasets(ds_list)
# raw_ds = DatasetDict({"train": raw_train})
# # ------------------------------------------------------------------
# # 3) Processing fns
# # ------------------------------------------------------------------
# def build_question(example: Dict) -> Dict:
# """
# If multiple-choice, append the options to the text.
# Overwrites the `problem` field in-place (kept in output).
# """
# if example["problem_type"] == "multiple choice":
# opts = example.get("options") or []
# q = example["problem"] + " Options:\n" + "\n".join(opts)
# example["problem"] = q
# return example
# def extract_answer(predict: str) -> Optional[str]:
# """
# Return inner text of <answer>...</answer>, stripped.
# If no tag is found, return the original string.
# """
# if predict is None:
# return None
# match = re.search(r"<answer>([\s\S]*?)</answer>", predict, re.DOTALL)
# if not match:
# return predict
# return match.group(1).strip()
# def add_answer(example: Dict) -> Dict:
# example["answer"] = extract_answer(example.get("solution", ""))
# return example
# def to_embedded_image(example: Dict) -> Dict:
# """
# If data_type == 'image', embed bytes for HF Image() feature.
# Otherwise leave as None.
# """
# if example.get("data_type") != "image":
# example["images"] = None
# return example
# path = example.get("path")
# if not path:
# example["images"] = None
# return example
# try:
# with open(path, "rb") as f:
# img_bytes = f.read()
# example["images"] = {"bytes": img_bytes, "path": None}
# except Exception:
# # If image is missing or unreadable, keep None so cast still works
# example["images"] = None
# return example
# # ------------------------------------------------------------------
# # 4) Apply pipeline (do NOT drop meta columns you want to keep)
# # ------------------------------------------------------------------
# processed = (
# raw_ds["train"]
# .map(build_question, num_proc=4)
# .map(add_answer, num_proc=4)
# .map(to_embedded_image, num_proc=4)
# .cast(full_features) # <- ensure final schema
# )
# # Optional: control output column ordering
# processed = processed.select_columns(list(full_features.keys()))
# # ------------------------------------------------------------------
# # 5) Write Parquet
# # ------------------------------------------------------------------
# out_dir = Path("./hf_data")
# out_dir.mkdir(parents=True, exist_ok=True)
# out_path = out_dir / "train.parquet"
# processed.to_parquet(str(out_path))
# print("βœ“ Wrote:", out_path.resolve())
# print("Columns:", list(processed.features.keys()))
# ------------------------------------------------------------------
# 4.1) Downsample to 30k, mainly reducing math-heavy sources
# ------------------------------------------------------------------
from collections import Counter
from typing import Optional

from datasets import concatenate_datasets
TARGET_SIZE = 30_000
MATH_SHARE = 0.20 # keep ~20% math (tweak if you want)
SEED = 2025
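# With TARGET_SIZE=30_000 and MATH_SHARE=0.20, the quotas below work out to
# at most 24_000 non-math + 6_000 math examples (before availability caps).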
# Define which sources are "mathy"
MATH_SOURCES = {
"Multimath-300k",
"TabMWP",
"Geometry3K",
"CLEVR-Math",
"DVQA",
"FigureQA",
"ChartQA",
"PlotQA",
"EXAMS-V-train/Mathematics",
"UniGeo",
"GeoQA+",
}
def is_math_source(name: Optional[str]) -> bool:
if not name:
return False
return name in MATH_SOURCES or ("math" in name.lower())
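# e.g. is_math_source("Geometry3K") -> True, is_math_source(None) -> False;
# any other source whose name contains "math" also counts as math.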
# Split
math_ds = processed.filter(lambda ex: is_math_source(ex.get("data_source")), num_proc=4)
non_math_ds = processed.filter(lambda ex: not is_math_source(ex.get("data_source")), num_proc=4)
# Decide quotas
non_math_quota = min(len(non_math_ds), int(TARGET_SIZE * (1 - MATH_SHARE)))
math_quota = TARGET_SIZE - non_math_quota
math_quota = min(math_quota, len(math_ds)) # guard if math is too small
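# Note: if the math pool is smaller than its quota, the final dataset ends up
# slightly below TARGET_SIZE (the non-math side is not topped up to compensate).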
# Sample deterministically
non_math_sample = non_math_ds.shuffle(seed=SEED).select(range(non_math_quota))
math_sample = math_ds.shuffle(seed=SEED).select(range(math_quota))
# Combine and shuffle
final = concatenate_datasets([non_math_sample, math_sample]).shuffle(seed=SEED)
# Quick sanity printout
cnt = Counter(final["data_source"])
total = len(final)
print(f"Final size: {total} (non-math {non_math_quota}, math {math_quota})")
for name, n in sorted(cnt.items(), key=lambda x: -x[1])[:25]:
pct = n / total
print(f"{name:30s} {n:6d} {pct:7.3%}")
# Use this 'final' dataset for writing
processed = final
out_path = out_dir / "train_30k.parquet"
processed.to_parquet(str(out_path))
print("βœ“ Wrote:", out_path.resolve())