# xinghe / zhogn

import json
import os
import re
import random
import pandas as pd

from make_arrow import make_arrow, make_arrow_vqa, make_arrow_melinda


def prepro_vqa_vqa_rad():
    """Gather the VQA-RAD annotations and pass them to ``make_arrow_vqa``.

    For each of the ``train``, ``val`` and ``test`` splits the matching
    ``<split>set.json`` file is loaded from the VQA-RAD data directory.
    Every record is reduced to five fields (image path, question id,
    question, answer, answer type) and the per-split lists are handed to
    ``make_arrow_vqa`` under the dataset name ``"vqa_vqa_rad"``.
    """
    random.seed(42)

    dataset_dir = "data/finetune_data/vqa_rad/"
    images_dir = f"{dataset_dir}/images"

    data = {}
    for split_name in ("train", "val", "test"):
        with open(f"{dataset_dir}/{split_name}set.json", "r") as handle:
            records = json.load(handle)
        # One output entry per annotation record, in file order.
        data[split_name] = [
            {
                "img_path": os.path.join(images_dir, record["image_name"]),
                "qid": record["qid"],
                "question": record["question"],
                "answer": record["answer"],
                "answer_type": record["answer_type"],
            }
            for record in records
        ]
    make_arrow_vqa(data, "vqa_vqa_rad", "data/finetune_arrows/")


def prepro_vqa_slack():
    """Gather the English annotations of the "slack" VQA dataset.

    Loads ``train.json``, ``validate.json`` and ``test.json`` from the
    dataset directory and maps them onto the ``train`` / ``val`` / ``test``
    splits. Records whose ``q_lang`` field is not ``"en"`` are dropped; the
    remaining ones are reduced to image path, question id, question, answer
    and answer type. The result is handed to ``make_arrow_vqa`` under the
    dataset name ``"vqa_slack"``.

    The annotation files are decoded explicitly as UTF-8. They contain
    records in languages other than English (see the ``q_lang`` filter), so
    relying on the platform's default encoding could fail or garble text on
    systems whose default is not UTF-8.
    """
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": [],
    }

    data_root = "data/finetune_data/slack/"
    image_root = f"{data_root}/imgs"
    split_files = [
        ("train", "train.json"),
        ("val", "validate.json"),
        ("test", "test.json"),
    ]

    for split, file_name in split_files:
        # JSON interchange text is UTF-8; do not depend on the locale default.
        with open(f"{data_root}/{file_name}", "r", encoding="utf-8") as fp:
            samples = json.load(fp)
        for sample in samples:
            # Keep only the English question/answer pairs.
            if sample["q_lang"] != "en":
                continue
            data[split].append({
                "img_path": os.path.join(image_root, sample["img_name"]),
                "qid": sample["qid"],
                "question": sample["question"],
                "answer": sample["answer"],
                "answer_type": sample["answer_type"],
            })
    make_arrow_vqa(data, "vqa_slack", "data/finetune_arrows/")


def prepro_vqa_medvqa2019():
    """Gather the MedVQA-2019 question/answer lines for all three splits.

    For every split the ``Modality.csv``, ``Organ.csv`` and ``Plane.csv``
    files under ``<split>/QA/`` are read in that order. Each non-blank line
    is split on ``"|"`` into image id, question and answer. Question ids are
    consecutive integers that keep counting across files and across splits,
    and every sample is labelled with the answer type ``"OPEN"``. The
    result is handed to ``make_arrow_vqa`` under the dataset name
    ``"vqa_medvqa_2019"``.

    Raises:
        IndexError: if a non-blank line has fewer than three ``"|"``
            separated fields.
    """
    random.seed(42)

    data = {
        "train": [],
        "val": [],
        "test": [],
    }

    data_root = "data/finetune_data/medvqa_2019/"
    image_root = "data/finetune_data/medvqa_2019/{}/images"
    categories = ["Modality", "Organ", "Plane"]

    offset = 0
    for split in ["train", "val", "test"]:
        lines = []
        for category in categories:
            # Context manager so each file handle is closed right after reading.
            with open(f"{data_root}/{split}/QA/{category}.csv") as fp:
                content = fp.read()
            # Skip blank lines: an empty file would otherwise yield a single
            # empty "line" that cannot be split into three fields.
            lines.extend(
                line for line in content.strip().split("\n") if line.strip()
            )
        # `offset` keeps question ids unique across the three splits.
        for qid, line in enumerate(lines, start=offset):
            fields = line.split("|")
            data[split].append({
                "img_path": os.path.join(image_root.format(split), fields[0] + ".jpg"),
                "qid": qid,
                "question": fields[1],
                "answer": fields[2],
                "answer_type": "OPEN",
            })
        offset += len(lines)
    make_arrow_vqa(data, "vqa_medvqa_2019", "data/finetune_arrows/")


def prepro_cls_melinda():
    """Gather the MELINDA classification samples and pass them on.

    Reads ``train.csv``, ``dev.csv`` and ``test.csv`` with pandas and maps
    them onto the ``train`` / ``val`` / ``test`` splits. Each row becomes
    one sample holding the figure path, the caption wrapped in a
    one-element list, and the ``i_meth`` / ``p_meth`` columns together with
    their ``*_label`` counterparts. The result is handed to
    ``make_arrow_melinda`` under the dataset name ``"cls_melinda"``.
    """
    random.seed(42)

    dataset_dir = "data/finetune_data/melinda"
    figures_dir = f"{dataset_dir}/melinda_images"
    csv_for_split = (
        ("train", "train.csv"),
        ("val", "dev.csv"),
        ("test", "test.csv"),
    )

    data = {"train": [], "val": [], "test": []}
    for split_name, csv_name in csv_for_split:
        table = pd.read_csv(f"{dataset_dir}/{csv_name}")
        bucket = data[split_name]
        for _, row in table.iterrows():
            figure_path = os.path.join(figures_dir, row["figure_file"])
            captions = [row["caption"]]
            # A one-element list is never empty, so this guard never fires;
            # it is kept to mirror the established control flow.
            if not captions:
                continue
            bucket.append({
                "img_path": figure_path,
                "texts": captions,
                "i_meth": row["i_meth"],
                "p_meth": row["p_meth"],
                "i_meth_label": row["i_meth_label"],
                "p_meth_label": row["p_meth_label"],
            })

    make_arrow_melinda(data, "cls_melinda", "data/finetune_arrows/")


def prepro_irtr_roco(min_length=3):
    """Gather ROCO image/caption pairs for image-text retrieval.

    For each split the ``captions.txt`` file is read, its lines are shuffled
    (the RNG is seeded with 42 first), and every line that splits on a tab
    into exactly two fields is treated as ``<image id>\\t<caption>``.
    Whitespace runs in the caption are collapsed to single spaces, and
    captions with fewer than ``min_length`` whitespace-separated words are
    discarded. The ``val`` and ``test`` splits stop once they hold 2000
    samples; ``train`` is not capped. The result is handed to ``make_arrow``
    under the dataset name ``"irtr_roco"``.

    Args:
        min_length: Minimum number of words a caption must have to be kept.
    """
    random.seed(42)

    captions_dir = "data/pretrain_data/roco"
    images_template = "data/pretrain_data/roco/{}/radiology/images/"
    capped_splits = ("val", "test")

    data = {"train": [], "val": [], "test": []}
    for split_name in ("train", "val", "test"):
        with open(f"{captions_dir}/{split_name}/radiology/captions.txt", "r") as handle:
            rows = handle.read().strip().split("\n")
        random.shuffle(rows)

        kept = data[split_name]
        for row in rows:
            parts = row.strip().split('\t')
            if len(parts) != 2:
                continue
            image_id, caption = parts
            caption = re.sub(r"\s+", " ", caption)
            if len(caption.split()) < min_length:
                continue
            kept.append({
                "img_path": os.path.join(images_template.format(split_name), image_id + ".jpg"),
                "texts": [caption],
            })
            # Evaluation splits are truncated to a fixed-size random subset.
            if split_name in capped_splits and len(kept) == 2000:
                break
    make_arrow(data, "irtr_roco", "data/finetune_arrows/")


if __name__ == "__main__":
    # Only the VQA-RAD conversion runs by default; uncomment one of the
    # calls below to preprocess another dataset.
    prepro_vqa_vqa_rad()
    # prepro_vqa_slack()
    # prepro_vqa_medvqa2019()
    # prepro_cls_melinda()
    # prepro_irtr_roco()